From b5e98b8618c1d473a3b28b3008fae69e5043ecc0 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 14:34:53 +0100 Subject: [PATCH 01/22] Add split_int type alias Introduce split_int = int32 type alias as a single point of change for all split/tip/bin counters in future int16 -> int32 migrations. Currently an alias; exists to document the semantic intent and ease future refactors. Co-Authored-By: Claude Haiku --- inst/include/TreeDist/types.h | 4 ++++ src/ints.h | 1 + 2 files changed, 5 insertions(+) diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h index c953989be..60b49ec0e 100644 --- a/inst/include/TreeDist/types.h +++ b/inst/include/TreeDist/types.h @@ -16,6 +16,10 @@ namespace TreeDist { using int32 = int_fast32_t; using cost = int_fast64_t; + // Primary type for split/tip/bin counters. int32 benchmarks faster than int16 + // (avoids sign-extension when mixing with SplitList int32 members). + using split_int = int32; + using lap_dim = int; using lap_row = lap_dim; using lap_col = lap_dim; diff --git a/src/ints.h b/src/ints.h index 50fd3cc70..3d424c4c0 100644 --- a/src/ints.h +++ b/src/ints.h @@ -6,6 +6,7 @@ // Re-export shared types to global scope for backward compatibility. using TreeDist::int16; using TreeDist::int32; +using TreeDist::split_int; // Types used only within TreeDist's own source. using uint32 = uint_fast32_t; From bcf3876e07ace79443d589a12c2af508dc1b23ee Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 21:28:55 +0100 Subject: [PATCH 02/22] =?UTF-8?q?Complete=20int16=E2=86=92split=5Fint=20mi?= =?UTF-8?q?gration=20for=20large-tree=20robustness?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add split_int alias (int32) to mutual_clustering.h; update mutual_clustering_score() signature to accept split_int* for in_split arrays and all internal counters - Update mutual_clustering_impl.h: split_int throughout, lg2_lookup() / lg2_unrooted_lookup() / lg2_rooted_lookup() fallbacks replace direct OOB table accesses for n_tips > SL_MAX_TIPS - Migrate pairwise_distances.cpp: MatchScratch struct, find_exact_matches(), and all seven score functions use split_int; fix critical int16 overflow in shared_phylo_score() when n_tips=32767; fix all lg2[] OOB accesses - Migrate tree_distances.cpp: robinson_foulds_info(), matching_split_distance(), jaccard_similarity() use split_int and lookup helpers - Fix tree_distance_functions.cpp: pass split_int vectors to mutual_clustering_score() (was passing int16, causing compile error) - Re-export lookup function names into global scope in tree_distances.h - Update test-tree_distance_utilities.R: separate C++ backstop message pattern from R-level guard message (check_ntip uses fixed max of 32767) All 2002 tests pass; zero regressions on normal-sized trees. Co-Authored-By: Claude Sonnet 4.6 --- inst/include/TreeDist/mutual_clustering.h | 51 ++- .../include/TreeDist/mutual_clustering_impl.h | 130 +++---- src/pairwise_distances.cpp | 284 +++++++------- src/tree_distance_functions.cpp | 12 +- src/tree_distances.cpp | 347 +++++++++--------- src/tree_distances.h | 62 ++-- tests/testthat/test-tree_distance_utilities.R | 9 +- 7 files changed, 467 insertions(+), 428 deletions(-) diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h index 437633f0b..1f592042e 100644 --- a/inst/include/TreeDist/mutual_clustering.h +++ b/inst/include/TreeDist/mutual_clustering.h @@ -22,7 +22,9 @@ namespace TreeDist { // lg2_unrooted[i] = log2((2i-5)!!) for i >= 3 // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) // - // These are defined in mutual_clustering_impl.h. + // These are fast-path caches sized to TreeTools' stack threshold. + // For larger trees, fall back to on-the-fly computation via the _lookup helpers. + // Definitions are in mutual_clustering_impl.h. extern double lg2[SL_MAX_TIPS + 1]; extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; @@ -33,15 +35,46 @@ namespace TreeDist { // computation. max_tips should be >= the largest tree size used. void init_lg2_tables(int max_tips); + // log2(x) — table-fast for x <= SL_MAX_TIPS, runtime otherwise. + [[nodiscard]] inline double lg2_lookup(split_int x) noexcept { + if (x <= static_cast(SL_MAX_TIPS)) { + return lg2[x]; + } + return std::log2(static_cast(x)); + } + + // log2((2n-5)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. + // (log2(e) = 1/log(2)) + [[nodiscard]] inline double lg2_unrooted_lookup(split_int n_tips) noexcept { + if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { + return lg2_unrooted[n_tips]; + } + if (n_tips < 3) return 0.0; // LCOV_EXCL_LINE + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 3.0) - std::lgamma(n - 1.0)) / std::log(2.0) + - (n - 2.0); + } + + // log2((2n-3)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. + [[nodiscard]] inline double lg2_rooted_lookup(split_int n_tips) noexcept { + if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { + return lg2_rooted[n_tips]; + } + if (n_tips < 2) return 0.0; // LCOV_EXCL_LINE + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 1.0) - std::lgamma(n)) / std::log(2.0) + - (n - 1.0); + } + // ---- Inline helpers ---- // Information content of a perfectly-matching split pair. // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] - [[nodiscard]] inline double ic_matching(int16 a, int16 b, - int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; + [[nodiscard]] inline double ic_matching(split_int a, split_int b, + split_int n) noexcept { + const double lg2a = lg2_lookup(a); + const double lg2b = lg2_lookup(b); + const double lg2n = lg2_lookup(n); return (a + b) * lg2n - a * lg2a - b * lg2b; } @@ -77,9 +110,9 @@ namespace TreeDist { // Implementation in mutual_clustering_impl.h. double mutual_clustering_score( - const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, - const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, - int16 n_bins, int32 n_tips, + const splitbit* const* a_state, const split_int* a_in, split_int a_n_splits, + const splitbit* const* b_state, const split_int* b_in, split_int b_n_splits, + split_int n_bins, int32 n_tips, LapScratch& scratch); } // namespace TreeDist diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h index 7f98c8fb9..815db2354 100644 --- a/inst/include/TreeDist/mutual_clustering_impl.h +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -68,17 +68,17 @@ void init_lg2_tables(int max_tips) { namespace detail { -static int16 find_exact_matches_raw( - const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, - const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, - int16 n_bins, int32 n_tips, - int16* a_match, int16* b_match) +static split_int find_exact_matches_raw( + const splitbit* const* a_state, const split_int* /*a_in*/, split_int a_n, + const splitbit* const* b_state, const split_int* /*b_in*/, split_int b_n, + split_int n_bins, int32 n_tips, + split_int* a_match, split_int* b_match) { - std::fill(a_match, a_match + a_n, int16(0)); - std::fill(b_match, b_match + b_n, int16(0)); + std::fill(a_match, a_match + a_n, split_int(0)); + std::fill(b_match, b_match + b_n, split_int(0)); if (a_n == 0 || b_n == 0) return 0; - const int16 last_bin = n_bins - 1; + const split_int last_bin = n_bins - 1; const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) ? ~splitbit(0) : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; @@ -87,17 +87,17 @@ static int16 find_exact_matches_raw( std::vector a_canon(static_cast(a_n) * n_bins); std::vector b_canon(static_cast(b_n) * n_bins); - for (int16 i = 0; i < a_n; ++i) { + for (split_int i = 0; i < a_n; ++i) { const bool flip = !(a_state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; if (bin == last_bin) val &= last_mask; a_canon[i * n_bins + bin] = val; } } - for (int16 i = 0; i < b_n; ++i) { + for (split_int i = 0; i < b_n; ++i) { const bool flip = !(b_state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~b_state[i][bin] : b_state[i][bin]; if (bin == last_bin) val &= last_mask; b_canon[i * n_bins + bin] = val; @@ -105,8 +105,8 @@ static int16 find_exact_matches_raw( } // Sort index arrays by canonical form - auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { - for (int16 bin = 0; bin < n_b; ++bin) { + auto canon_less = [&](const splitbit* canon, split_int n_b, split_int i, split_int j) { + for (split_int bin = 0; bin < n_b; ++bin) { const splitbit vi = canon[i * n_b + bin]; const splitbit vj = canon[j * n_b + bin]; if (vi < vj) return true; @@ -115,28 +115,28 @@ static int16 find_exact_matches_raw( return false; }; - std::vector a_order(a_n), b_order(b_n); - std::iota(a_order.begin(), a_order.end(), int16(0)); - std::iota(b_order.begin(), b_order.end(), int16(0)); + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), split_int(0)); + std::iota(b_order.begin(), b_order.end(), split_int(0)); std::sort(a_order.begin(), a_order.end(), - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(a_canon.data(), n_bins, i, j); }); std::sort(b_order.begin(), b_order.end(), - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(b_canon.data(), n_bins, i, j); }); // Merge-scan - int16 exact_n = 0; - int16 ai_pos = 0, bi_pos = 0; + split_int exact_n = 0; + split_int ai_pos = 0, bi_pos = 0; while (ai_pos < a_n && bi_pos < b_n) { - const int16 ai = a_order[ai_pos]; - const int16 bi = b_order[bi_pos]; + const split_int ai = a_order[ai_pos]; + const split_int bi = b_order[bi_pos]; int cmp = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit va = a_canon[ai * n_bins + bin]; const splitbit vb = b_canon[bi * n_bins + bin]; if (va < vb) { cmp = -1; break; } @@ -165,41 +165,41 @@ static int16 find_exact_matches_raw( // ---- MCI score implementation ---- double mutual_clustering_score( - const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, - const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, - int16 n_bins, int32 n_tips, + const splitbit* const* a_state, const split_int* a_in, split_int a_n_splits, + const splitbit* const* b_state, const split_int* b_in, split_int b_n_splits, + split_int n_bins, int32 n_tips, LapScratch& scratch) { if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; - const int16 most_splits = std::max(a_n_splits, b_n_splits); + const split_int most_splits = std::max(a_n_splits, b_n_splits); const double n_tips_rcp = 1.0 / static_cast(n_tips); constexpr cost max_score = BIG; constexpr double over_max = 1.0 / static_cast(BIG); const double max_over_tips = static_cast(BIG) * n_tips_rcp; - const double lg2_n = lg2[n_tips]; + const double lg2_n = lg2_lookup(n_tips); // --- Phase 1: O(n log n) exact-match detection --- - std::vector a_match_buf(a_n_splits); - std::vector b_match_buf(b_n_splits); + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); - const int16 exact_n = detail::find_exact_matches_raw( + const split_int exact_n = detail::find_exact_matches_raw( a_state, a_in, a_n_splits, b_state, b_in, b_n_splits, n_bins, n_tips, a_match_buf.data(), b_match_buf.data()); - const int16* a_match = a_match_buf.data(); - const int16* b_match = b_match_buf.data(); + const split_int* a_match = a_match_buf.data(); + const split_int* b_match = b_match_buf.data(); // Accumulate exact-match score double exact_score = 0.0; - for (int16 ai = 0; ai < a_n_splits; ++ai) { + for (split_int ai = 0; ai < a_n_splits; ++ai) { if (a_match[ai]) { - const int16 na = a_in[ai]; - const int16 nA = static_cast(n_tips - na); - exact_score += ic_matching(na, nA, static_cast(n_tips)); + const split_int na = a_in[ai]; + const split_int nA = n_tips - na; + exact_score += ic_matching(na, nA, n_tips); } } @@ -209,58 +209,58 @@ double mutual_clustering_score( } // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a_n_splits; ++ai) { + for (split_int ai = 0; ai < a_n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b_n_splits; ++bi) { + for (split_int bi = 0; bi < b_n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); CostMatrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a_in[ai]; - const int16 nA = static_cast(n_tips - na); + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a_in[ai]; + const split_int nA = n_tips - na; const splitbit* a_row = a_state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; const splitbit* b_row = b_state[bi]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < n_bins; ++bin) { a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); } - const int16 nb = b_in[bi]; - const int16 nB = static_cast(n_tips - nb); - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; + const split_int nb = b_in[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; if (a_and_b == A_and_b && a_and_b == a_and_B && a_and_b == A_and_B) { score(a_pos, b_pos) = max_score; } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); score(a_pos, b_pos) = max_score - static_cast(ic_sum * max_over_tips); } diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp index 67db48abb..8d92a4fc6 100644 --- a/src/pairwise_distances.cpp +++ b/src/pairwise_distances.cpp @@ -34,12 +34,12 @@ using TreeTools::count_bits; // per-pair heap allocation. Vectors grow lazily and are never shrunk. // --------------------------------------------------------------------------- struct MatchScratch { - std::vector a_canon; - std::vector b_canon; - std::vector a_order; - std::vector b_order; - std::vector a_match; - std::vector b_match; + std::vector a_canon; + std::vector b_canon; + std::vector a_order; + std::vector b_order; + std::vector a_match; + std::vector b_match; }; // --------------------------------------------------------------------------- @@ -59,19 +59,19 @@ struct MatchScratch { // a_match[ai] = bi+1 if split ai matched split bi, else 0. // b_match[bi] = ai+1 if split bi matched split ai, else 0. // --------------------------------------------------------------------------- -static int16 find_exact_matches( +static split_int find_exact_matches( const SplitList& a, const SplitList& b, const int32 n_tips, MatchScratch& scratch ) { - const int16 n_bins = a.n_bins; - const int16 last_bin = n_bins - 1; + const split_int n_bins = a.n_bins; + const split_int last_bin = n_bins - 1; const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) ? ~splitbit(0) : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; - const int16 a_n = a.n_splits; - const int16 b_n = b.n_splits; + const split_int a_n = a.n_splits; + const split_int b_n = b.n_splits; // Ensure buffers are large enough (grow lazily, never shrink) const size_t a_canon_sz = static_cast(a_n) * n_bins; @@ -83,10 +83,10 @@ static int16 find_exact_matches( if (scratch.a_match.size() < static_cast(a_n)) scratch.a_match.resize(a_n); if (scratch.b_match.size() < static_cast(b_n)) scratch.b_match.resize(b_n); - int16* a_match = scratch.a_match.data(); - int16* b_match = scratch.b_match.data(); - std::fill(a_match, a_match + a_n, int16(0)); - std::fill(b_match, b_match + b_n, int16(0)); + split_int* a_match = scratch.a_match.data(); + split_int* b_match = scratch.b_match.data(); + std::fill(a_match, a_match + a_n, split_int(0)); + std::fill(b_match, b_match + b_n, split_int(0)); if (a_n == 0 || b_n == 0) return 0; @@ -94,17 +94,17 @@ static int16 find_exact_matches( splitbit* b_canon = scratch.b_canon.data(); // --- 1. Compute canonical forms into flat buffers --- - for (int16 i = 0; i < a_n; ++i) { + for (split_int i = 0; i < a_n; ++i) { const bool flip = !(a.state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~a.state[i][bin] : a.state[i][bin]; if (bin == last_bin) val &= last_mask; a_canon[i * n_bins + bin] = val; } } - for (int16 i = 0; i < b_n; ++i) { + for (split_int i = 0; i < b_n; ++i) { const bool flip = !(b.state[i][0] & 1); - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { splitbit val = flip ? ~b.state[i][bin] : b.state[i][bin]; if (bin == last_bin) val &= last_mask; b_canon[i * n_bins + bin] = val; @@ -112,8 +112,8 @@ static int16 find_exact_matches( } // --- 2. Sort index arrays by canonical form --- - auto canon_less = [&](const splitbit* canon, int16 i, int16 j) { - for (int16 bin = 0; bin < n_bins; ++bin) { + auto canon_less = [&](const splitbit* canon, split_int i, split_int j) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit vi = canon[i * n_bins + bin]; const splitbit vj = canon[j * n_bins + bin]; if (vi < vj) return true; @@ -122,29 +122,29 @@ static int16 find_exact_matches( return false; // #nocov }; - int16* a_order = scratch.a_order.data(); - int16* b_order = scratch.b_order.data(); - std::iota(a_order, a_order + a_n, int16(0)); - std::iota(b_order, b_order + b_n, int16(0)); + split_int* a_order = scratch.a_order.data(); + split_int* b_order = scratch.b_order.data(); + std::iota(a_order, a_order + a_n, split_int(0)); + std::iota(b_order, b_order + b_n, split_int(0)); std::sort(a_order, a_order + a_n, - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(a_canon, i, j); }); std::sort(b_order, b_order + b_n, - [&](int16 i, int16 j) { + [&](split_int i, split_int j) { return canon_less(b_canon, i, j); }); // --- 3. Merge-scan to find matches --- - int16 exact_n = 0; - int16 ai_pos = 0, bi_pos = 0; + split_int exact_n = 0; + split_int ai_pos = 0, bi_pos = 0; while (ai_pos < a_n && bi_pos < b_n) { - const int16 ai = a_order[ai_pos]; - const int16 bi = b_order[bi_pos]; + const split_int ai = a_order[ai_pos]; + const split_int bi = b_order[bi_pos]; int cmp = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { const splitbit va = a_canon[ai * n_bins + bin]; const splitbit vb = b_canon[bi * n_bins + bin]; if (va < vb) { cmp = -1; break; } @@ -178,25 +178,25 @@ static double mutual_clustering_score( ) { if (a.n_splits == 0 || b.n_splits == 0 || n_tips == 0) return 0.0; - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); const double n_tips_rcp = 1.0 / static_cast(n_tips); constexpr cost max_score = BIG; constexpr double over_max = 1.0 / static_cast(BIG); const double max_over_tips = static_cast(BIG) * n_tips_rcp; - const double lg2_n = lg2[n_tips]; + const double lg2_n = lg2_lookup(n_tips); // --- Phase 1: O(n log n) exact-match detection --- - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); // Accumulate exact-match score double exact_score = 0.0; - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (a_match[ai]) { - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; exact_score += TreeDist::ic_matching(na, nA, n_tips); } } @@ -207,58 +207,58 @@ static double mutual_clustering_score( } // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; // Build index maps for unmatched splits - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; const auto* a_row = a.state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; const auto* b_row = b.state[bi]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a_row[bin] & b_row[bin]); } - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; if (a_and_b == A_and_b && a_and_b == a_and_B && a_and_b == A_and_B) { score(a_pos, b_pos) = max_score; } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); score(a_pos, b_pos) = max_score - static_cast(ic_sum * max_over_tips); } } @@ -366,27 +366,27 @@ static double rf_info_score( const SplitList& a, const SplitList& b, const int32 n_tips, MatchScratch& mscratch ) { - const int16 a_n = a.n_splits; - const int16 b_n = b.n_splits; + const split_int a_n = a.n_splits; + const split_int b_n = b.n_splits; if (a_n == 0 || b_n == 0) return 0; // Use sort+merge to find exact matches in O(n log n) - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); if (exact_n == 0) return 0; // Sum info contribution for each matched split in a - const int16* a_match = mscratch.a_match.data(); - const double lg2_unrooted_n = lg2_unrooted[n_tips]; + const split_int* a_match = mscratch.a_match.data(); + const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); double score = 0; - for (int16 ai = 0; ai < a_n; ++ai) { + for (split_int ai = 0; ai < a_n; ++ai) { if (a_match[ai] == 0) continue; - int16 leaves_in_split = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int leaves_in_split = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { leaves_in_split += count_bits(a.state[ai][bin]); } score += lg2_unrooted_n - - lg2_rooted[leaves_in_split] - - lg2_rooted[n_tips - leaves_in_split]; + - lg2_rooted_lookup(leaves_in_split) + - lg2_rooted_lookup(n_tips - leaves_in_split); } return score; } @@ -446,48 +446,48 @@ static double msd_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch, MatchScratch& mscratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; - const bool a_has_more = (a.n_splits > b.n_splits); - const int16 a_extra = a_has_more ? most_splits - b.n_splits : 0; - const int16 b_extra = a_has_more ? 0 : most_splits - a.n_splits; - const int16 half_tips = n_tips / 2; - const cost max_score = BIG / most_splits; + const bool a_has_more = (a.n_splits > b.n_splits); + const split_int a_extra = a_has_more ? most_splits - b.n_splits : 0; + const split_int b_extra = a_has_more ? 0 : most_splits - a.n_splits; + const split_int half_tips = n_tips / 2; + const cost max_score = BIG / most_splits; // --- Phase 1: O(n log n) exact-match detection --- - const int16 exact_n = find_exact_matches(a, b, n_tips, mscratch); - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int exact_n = find_exact_matches(a, b, n_tips, mscratch); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); if (exact_n == b.n_splits || exact_n == a.n_splits) { return 0.0; } // --- Phase 2: fill cost matrix for unmatched splits only --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; splitbit total = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); } score(a_pos, b_pos) = static_cast( @@ -574,30 +574,30 @@ static double msi_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; constexpr cost max_score = BIG; - const double max_possible = lg2_unrooted[n_tips] - - lg2_rooted[int16((n_tips + 1) / 2)] - - lg2_rooted[int16(n_tips / 2)]; + const double max_possible = lg2_unrooted_lookup(n_tips) + - lg2_rooted_lookup((n_tips + 1) / 2) + - lg2_rooted_lookup(n_tips / 2); const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); scratch.score_pool.resize(most_splits); cost_matrix& score = scratch.score_pool; - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { - int16 n_a_only = 0, n_a_and_b = 0, n_different = 0; + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { + split_int n_a_only = 0, n_a_and_b = 0, n_different = 0; splitbit different; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { different = a.state[ai][bin] ^ b.state[bi][bin]; n_different += count_bits(different); n_a_only += count_bits(a.state[ai][bin] & different); n_a_and_b += count_bits(a.state[ai][bin] & ~different); } - const int16 n_same = n_tips - n_different; + const split_int n_same = n_tips - n_different; score(ai, bi) = cost(max_score - score_over_possible * TreeDist::mmsi_score(n_same, n_a_and_b, n_different, n_a_only)); } @@ -673,21 +673,21 @@ static double shared_phylo_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; - const int16 overlap_a = int16(n_tips + 1) / 2; + const split_int overlap_a = (n_tips + 1) / 2; constexpr cost max_score = BIG; const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); - const double max_possible = lg2_unrooted[n_tips] - best_overlap; + const double max_possible = lg2_unrooted_lookup(n_tips) - best_overlap; const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); scratch.score_pool.resize(most_splits); cost_matrix& score = scratch.score_pool; - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { const double spi = TreeDist::spi_overlap( a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], a.n_bins); @@ -763,7 +763,7 @@ static double jaccard_score( const double exponent, const bool allow_conflict, LapScratch& scratch, MatchScratch& mscratch ) { - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); if (most_splits == 0) return 0.0; constexpr cost max_score = BIG; @@ -772,7 +772,7 @@ static double jaccard_score( // --- Phase 1: O(n log n) exact-match detection --- // Only used when allow_conflict=true; otherwise the full LAP may reassign // non-matching splits to compatible (non-exact) partners. - int16 exact_n = 0; + split_int exact_n = 0; if (allow_conflict) { exact_n = find_exact_matches(a, b, n_tips, mscratch); } else { @@ -781,61 +781,61 @@ static double jaccard_score( mscratch.a_match.resize(a.n_splits); if (mscratch.b_match.size() < static_cast(b.n_splits)) mscratch.b_match.resize(b.n_splits); - std::fill(mscratch.a_match.data(), mscratch.a_match.data() + a.n_splits, int16(0)); - std::fill(mscratch.b_match.data(), mscratch.b_match.data() + b.n_splits, int16(0)); + std::fill(mscratch.a_match.data(), mscratch.a_match.data() + a.n_splits, split_int(0)); + std::fill(mscratch.b_match.data(), mscratch.b_match.data() + b.n_splits, split_int(0)); } - const int16* a_match = mscratch.a_match.data(); - const int16* b_match = mscratch.b_match.data(); + const split_int* a_match = mscratch.a_match.data(); + const split_int* b_match = mscratch.b_match.data(); if (exact_n == b.n_splits || exact_n == a.n_splits) { return static_cast(exact_n); } // --- Phase 2: fill cost matrix for unmatched splits only --- - const int16 lap_n = most_splits - exact_n; + const split_int lap_n = most_splits - exact_n; - std::vector a_unmatch, b_unmatch; + std::vector a_unmatch, b_unmatch; a_unmatch.reserve(lap_n); b_unmatch.reserve(lap_n); - for (int16 ai = 0; ai < a.n_splits; ++ai) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (!a_match[ai]) a_unmatch.push_back(ai); } - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) b_unmatch.push_back(bi); } scratch.score_pool.resize(lap_n); cost_matrix& score = scratch.score_pool; - const int16 a_unmatched_n = static_cast(a_unmatch.size()); - const int16 b_unmatched_n = static_cast(b_unmatch.size()); + const split_int a_unmatched_n = static_cast(a_unmatch.size()); + const split_int b_unmatched_n = static_cast(b_unmatch.size()); - for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { - const int16 ai = a_unmatch[a_pos]; - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + for (split_int a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const split_int ai = a_unmatch[a_pos]; + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; - for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { - const int16 bi = b_unmatch[b_pos]; - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const split_int bi = b_unmatch[b_pos]; + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a.state[ai][bin] & b.state[bi][bin]); } - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nB - a_and_B; + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nB - a_and_B; if (!allow_conflict && !( a_and_b == na || a_and_B == na || A_and_b == nA || A_and_B == nA)) { score(a_pos, b_pos) = max_score; } else { - const int16 A_or_b = n_tips - a_and_B; - const int16 a_or_B = n_tips - A_and_b; - const int16 a_or_b = n_tips - A_and_B; - const int16 A_or_B = n_tips - a_and_b; + const split_int A_or_b = n_tips - a_and_B; + const split_int a_or_B = n_tips - A_and_b; + const split_int a_or_b = n_tips - A_and_B; + const split_int A_or_B = n_tips - a_and_b; const double ars_ab = static_cast(a_and_b) / static_cast(a_or_b); const double ars_Ab = static_cast(A_and_b) / static_cast(A_or_b); const double ars_aB = static_cast(a_and_B) / static_cast(a_or_B); @@ -1229,7 +1229,7 @@ NumericVector cpp_clustering_entropy_batch( for (int i = 0; i < N; ++i) { SplitList sl(Rcpp::as(splits_list[i])); double total = 0.0; - for (int16 s = 0; s < sl.n_splits; ++s) { + for (split_int s = 0; s < sl.n_splits; ++s) { const int k = sl.in_split[s]; if (k <= 0 || k >= n_tip) continue; const double p = k * invN; @@ -1271,7 +1271,7 @@ NumericVector cpp_splitwise_info_batch( for (int i = 0; i < N; ++i) { SplitList sl(Rcpp::as(splits_list[i])); double total = 0.0; - for (int16 s = 0; s < sl.n_splits; ++s) { + for (split_int s = 0; s < sl.n_splits; ++s) { const int k = sl.in_split[s]; if (k < 2 || (n_tip - k) < 2) continue; total += l2u_n - l2r[k] - l2r[n_tip - k]; diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 99347300b..5ab745cb5 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -28,15 +28,15 @@ double cpp_mci_impl_score(const Rcpp::RawMatrix& x, // Build arrays matching the header's raw-pointer API types. std::vector a_ptrs(a.n_splits); std::vector b_ptrs(b.n_splits); - std::vector a_in(a.n_splits); - std::vector b_in(b.n_splits); - for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::split_int i = 0; i < a.n_splits; ++i) { a_ptrs[i] = a.state[i]; - a_in[i] = static_cast(a.in_split[i]); + a_in[i] = static_cast(a.in_split[i]); } - for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + for (TreeDist::split_int i = 0; i < b.n_splits; ++i) { b_ptrs[i] = b.state[i]; - b_in[i] = static_cast(b.in_split[i]); + b_in[i] = static_cast(b.in_split[i]); } return TreeDist::mutual_clustering_score( diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 0da45e652..7613b9141 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -29,18 +29,15 @@ namespace TreeDist { } } + // Hard C++ ceiling: n_tips^2 must fit int32 and all counters must fit split_int. + // 32767 = INT16_MAX is the chosen practical cap (n^2 <= 1.07e9 < INT32_MAX). + // R-level .CheckMaxTips() fires before this; this is a safety backstop. + static constexpr int32 TREEDIST_MAX_TIPS = 32767; + void check_ntip(const int32 n) { - if (n > SL_MAX_TIPS) { - if (SL_MAX_TIPS <= 2048) { - Rcpp::stop("Trees with %d tips exceed the compiled limit of %d. " - "Update TreeTools to support more tips, then reinstall " - "TreeDist.", static_cast(n), - static_cast(SL_MAX_TIPS)); - } else { - Rcpp::stop("Trees with %d tips are not yet supported (maximum %d). ", - static_cast(n), - static_cast(SL_MAX_TIPS)); - } + if (n > TREEDIST_MAX_TIPS) { + Rcpp::stop("Trees with %d tips are not yet supported (maximum %d).", + static_cast(n), static_cast(TREEDIST_MAX_TIPS)); } } @@ -114,41 +111,41 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 last_bin = a.n_bins - 1; - const int16 unset_tips = (n_tips % SL_BIN_SIZE) ? + const split_int last_bin = a.n_bins - 1; + const split_int unset_tips = (n_tips % SL_BIN_SIZE) ? SL_BIN_SIZE - n_tips % SL_BIN_SIZE : 0; - + const splitbit unset_mask = ALL_ONES >> unset_tips; - const double lg2_unrooted_n = lg2_unrooted[n_tips]; + const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); double score = 0; - + grf_match matching(a.n_splits, NA_INTEGER); - - const int16 n_bins = a.n_bins; + + const split_int n_bins = a.n_bins; std::vector b_complement(b.n_splits * n_bins); - for (int16 i = 0; i < b.n_splits; i++) { + for (split_int i = 0; i < b.n_splits; i++) { splitbit* bc_i = &b_complement[i * n_bins]; - for (int16 bin = 0; bin < last_bin; ++bin) { + for (split_int bin = 0; bin < last_bin; ++bin) { bc_i[bin] = ~b.state[i][bin]; } bc_i[last_bin] = b.state[i][last_bin] ^ unset_mask; } - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + for (split_int bi = 0; bi < b.n_splits; ++bi) { + bool all_match = true, all_complement = true; const splitbit* bc_bi = &b_complement[bi * n_bins]; - - for (int16 bin = 0; bin < n_bins; ++bin) { + + for (split_int bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != b.state[bi][bin])) { all_match = false; break; } } if (!all_match) { - for (int16 bin = 0; bin < n_bins; ++bin) { + for (split_int bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != bc_bi[bin])) { all_complement = false; break; @@ -156,13 +153,13 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, } } if (all_match || all_complement) { - int16 leaves_in_split = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int leaves_in_split = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { leaves_in_split += count_bits(a.state[ai][bin]); } - - score += lg2_unrooted_n - lg2_rooted[leaves_in_split] - - lg2_rooted[n_tips - leaves_in_split]; + + score += lg2_unrooted_n - lg2_rooted_lookup(leaves_in_split) - + lg2_rooted_lookup(n_tips - leaves_in_split); matching[ai] = bi + 1; break; /* Only one match possible per split */ @@ -180,50 +177,50 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, inline List matching_split_distance(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - const int16 split_diff = most_splits - std::min(a.n_splits, b.n_splits); - const int16 half_tips = n_tips / 2; + const split_int most_splits = std::max(a.n_splits, b.n_splits); + const split_int split_diff = most_splits - std::min(a.n_splits, b.n_splits); + const split_int half_tips = n_tips / 2; if (most_splits == 0) { return List::create(Named("score") = 0); } const cost max_score = BIG / most_splits; cost_matrix score(most_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { splitbit total = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { - total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); + for (split_int bin = 0; bin < a.n_bins; ++bin) { + total += count_bits(a.state[ai][bin] ^ b.state[bi][bin]); } score(ai, bi) = total; } } - - for (int16 ai = 0; ai < a.n_splits; ++ai) { - for (int16 bi = 0; bi < b.n_splits; ++bi) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (score(ai, bi) > half_tips) { score(ai, bi) = n_tips - score(ai, bi); } } score.padRowAfterCol(ai, b.n_splits, max_score); } - + score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - + const double final_score = lap(most_splits, score, rowsol, colsol) - (max_score * split_diff); - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; @@ -239,49 +236,49 @@ inline List jaccard_similarity(const RawMatrix &x, const RawMatrix &y, const int32 n_tips, const NumericVector &k, const LogicalVector &allowConflict) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - + const split_int most_splits = std::max(a.n_splits, b.n_splits); + constexpr cost max_score = BIG; constexpr double max_scoreL = max_score; const double exponent = k[0]; - + bool allow_conflict = allowConflict[0]; - + cost_matrix score(most_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; - - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; + + for (split_int bi = 0; bi < b.n_splits; ++bi) { + // x divides tips into a|A; y divides tips into b|B - int16 a_and_b = 0; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + split_int a_and_b = 0; + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a.state[ai][bin] & b.state[bi][bin]); } - - const int16 + + const split_int nb = b.in_split[bi], nB = n_tips - nb, a_and_B = na - a_and_b, A_and_b = nb - a_and_b, A_and_B = nB - a_and_B ; - + if (!allow_conflict && !( a_and_b == na || a_and_B == na || A_and_b == nA || A_and_B == nA) ) { - + score(ai, bi) = max_score; /* Prohibited */ - + } else { - const int16 + const split_int A_or_b = n_tips - a_and_B, a_or_B = n_tips - A_and_b, a_or_b = n_tips - A_and_B, @@ -330,76 +327,77 @@ inline List jaccard_similarity(const RawMatrix &x, const RawMatrix &y, std::vector final_matching; final_matching.reserve(a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - - + + return List::create(Named("score") = final_score, _["matching"] = final_matching); - + } List msi_distance(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); + const split_int most_splits = std::max(a.n_splits, b.n_splits); constexpr cost max_score = BIG; - const double max_possible = lg2_unrooted[n_tips] - - lg2_rooted[int16((n_tips + 1) / 2)] - lg2_rooted[int16(n_tips / 2)]; + const double max_possible = lg2_unrooted_lookup(n_tips) - + lg2_rooted_lookup((n_tips + 1) / 2) - lg2_rooted_lookup(n_tips / 2); const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / max_score; - + cost_matrix score(most_splits); - - splitbit different[SL_MAX_BINS]; - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + + // Heap-allocated: n_bins can exceed SL_MAX_BINS for large trees. + std::vector different(a.n_bins); + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = 0; bi < b.n_splits; ++bi) { - int16 + for (split_int bi = 0; bi < b.n_splits; ++bi) { + split_int n_different = 0, n_a_only = 0, n_a_and_b = 0 ; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { different[bin] = a.state[ai][bin] ^ b.state[bi][bin]; n_different += count_bits(different[bin]); n_a_only += count_bits(a.state[ai][bin] & different[bin]); n_a_and_b += count_bits(a.state[ai][bin] & ~different[bin]); } - const int16 n_same = n_tips - n_different; - - score(ai, bi) = cost(max_score - + const split_int n_same = n_tips - n_different; + + score(ai, bi) = cost(max_score - (score_over_possible * TreeDist::mmsi_score(n_same, n_a_and_b, n_different, n_a_only))); } score.padRowAfterCol(ai, b.n_splits, max_score); } score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - + const double final_score = static_cast( (max_score * most_splits) - lap(most_splits, score, rowsol, colsol)) * possible_over_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); @@ -410,55 +408,55 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, const SplitList a(x); const SplitList b(y); const bool a_has_more_splits = (a.n_splits > b.n_splits); - const int16 most_splits = a_has_more_splits ? a.n_splits : b.n_splits; - const int16 a_extra_splits = a_has_more_splits ? most_splits - b.n_splits : 0; - const int16 b_extra_splits = a_has_more_splits ? 0 : most_splits - a.n_splits; + const split_int most_splits = a_has_more_splits ? a.n_splits : b.n_splits; + const split_int a_extra_splits = a_has_more_splits ? most_splits - b.n_splits : 0; + const split_int b_extra_splits = a_has_more_splits ? 0 : most_splits - a.n_splits; const double n_tips_reciprocal = 1.0 / n_tips; - + if (most_splits == 0 || n_tips == 0) { return List::create(Named("score") = 0, _["matching"] = IntegerVector(0)); } - + constexpr cost max_score = BIG; constexpr double over_max_score = 1.0 / static_cast(max_score); const double max_over_tips = static_cast(max_score) * n_tips_reciprocal; - const double lg2_n = lg2[n_tips]; - + const double lg2_n = lg2_lookup(n_tips); + cost_matrix score(most_splits); - + double exact_match_score = 0; - int16 exact_matches = 0; + split_int exact_matches = 0; // vector zero-initializes [so does make_unique] // match will have one added to it so numbering follows R; hence 0 = UNMATCHED std::vector a_match(a.n_splits); - std::unique_ptr b_match = std::make_unique(b.n_splits); - - for (int16 ai = 0; ai < a.n_splits; ++ai) { + std::unique_ptr b_match = std::make_unique(b.n_splits); + + for (split_int ai = 0; ai < a.n_splits; ++ai) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); if (a_match[ai]) continue; - - const int16 na = a.in_split[ai]; - const int16 nA = n_tips - na; + + const split_int na = a.in_split[ai]; + const split_int nA = n_tips - na; const auto *a_row = a.state[ai]; - const double offset_a = lg2_n - lg2[na]; - const double offset_A = lg2_n - lg2[nA]; - - for (int16 bi = 0; bi < b.n_splits; ++bi) { - + const double offset_a = lg2_n - lg2_lookup(na); + const double offset_A = lg2_n - lg2_lookup(nA); + + for (split_int bi = 0; bi < b.n_splits; ++bi) { + // x divides tips into a|A; y divides tips into b|B - int16 a_and_b = 0; + split_int a_and_b = 0; const auto *b_row = b.state[bi]; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (split_int bin = 0; bin < a.n_bins; ++bin) { a_and_b += count_bits(a_row[bin] & b_row[bin]); } - - const int16 nb = b.in_split[bi]; - const int16 nB = n_tips - nb; - const int16 a_and_B = na - a_and_b; - const int16 A_and_b = nb - a_and_b; - const int16 A_and_B = nA - A_and_b; - + + const split_int nb = b.in_split[bi]; + const split_int nB = n_tips - nb; + const split_int a_and_B = na - a_and_b; + const split_int A_and_b = nb - a_and_b; + const split_int A_and_B = nA - A_and_b; + if ((!a_and_B && !A_and_b) || (!a_and_b && !A_and_B)) { exact_match_score += TreeDist::ic_matching(na, nA, n_tips); @@ -471,44 +469,44 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, && a_and_b == A_and_B) { score(ai, bi) = max_score; // Avoid rounding errors } else { - const double lg2_nb = lg2[nb]; - const double lg2_nB = lg2[nB]; + const double lg2_nb = lg2_lookup(nb); + const double lg2_nB = lg2_lookup(nB); const double ic_sum = - a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + - a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + - A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + - A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); - + a_and_b * (lg2_lookup(a_and_b) + offset_a - lg2_nb) + + a_and_B * (lg2_lookup(a_and_B) + offset_a - lg2_nB) + + A_and_b * (lg2_lookup(A_and_b) + offset_A - lg2_nb) + + A_and_B * (lg2_lookup(A_and_B) + offset_A - lg2_nB); + // Division by n_tips converts n(A&B) to P(A&B) for each ic_element score(ai, bi) = max_score - static_cast(ic_sum * max_over_tips); } } - + if (b.n_splits < most_splits) { score.padRowAfterCol(ai, b.n_splits, max_score); } } - + if (exact_matches == b.n_splits || exact_matches == a.n_splits) { return List::create( Named("score") = exact_match_score * n_tips_reciprocal, _["matching"] = a_match); } - - const int16 lap_dim = most_splits - exact_matches; + + const split_int lap_dim = most_splits - exact_matches; ASSERT(lap_dim > 0); std::vector rowsol; std::vector colsol; resize_uninitialized(rowsol, lap_dim); resize_uninitialized(colsol, lap_dim); cost_matrix small_score(lap_dim); - + if (exact_matches) { - int16 a_pos = 0; - for (int16 ai = 0; ai < a.n_splits; ++ai) { + split_int a_pos = 0; + for (split_int ai = 0; ai < a.n_splits; ++ai) { if (a_match[ai]) continue; - int16 b_pos = 0; - for (int16 bi = 0; bi < b.n_splits; ++bi) { + split_int b_pos = 0; + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (b_match[bi]) continue; small_score(a_pos, b_pos) = score(ai, bi); b_pos++; @@ -517,58 +515,58 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, a_pos++; } small_score.padAfterRow(lap_dim - b_extra_splits, max_score); - - const double lap_score = static_cast((max_score * lap_dim) - + + const double lap_score = static_cast((max_score * lap_dim) - lap(lap_dim, small_score, rowsol, colsol)) * over_max_score; const double final_score = lap_score + (exact_match_score / n_tips); - - std::unique_ptr lap_decode = std::make_unique(lap_dim); - int16 fuzzy_match = 0; - for (int16 bi = 0; bi < b.n_splits; ++bi) { + + std::unique_ptr lap_decode = std::make_unique(lap_dim); + split_int fuzzy_match = 0; + for (split_int bi = 0; bi < b.n_splits; ++bi) { if (!b_match[bi]) { assert(fuzzy_match < lap_dim); lap_decode[fuzzy_match++] = bi + 1; } } - + fuzzy_match = 0; std::vector final_matching; TreeDist::resize_uninitialized(final_matching, a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { if (a_match[i]) { final_matching[i] = a_match[i]; } else { assert(fuzzy_match < lap_dim); - const int16 row_idx = fuzzy_match++; - const int16 col_idx = rowsol[row_idx]; + const split_int row_idx = fuzzy_match++; + const split_int col_idx = rowsol[row_idx]; final_matching[i] = (col_idx >= lap_dim - a_extra_splits) ? NA_INTEGER : lap_decode[col_idx]; } } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } else { - - for (int16 ai = a.n_splits; ai < most_splits; ++ai) { - for (int16 bi = 0; bi < most_splits; ++bi) { + + for (split_int ai = a.n_splits; ai < most_splits; ++ai) { + for (split_int bi = 0; bi < most_splits; ++bi) { score(ai, bi) = max_score; } } - + const double final_score = static_cast( (max_score * lap_dim) - lap(lap_dim, score, rowsol, colsol) ) / max_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - for (int16 i = 0; i < a.n_splits; ++i) { + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } @@ -577,55 +575,54 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); - const int16 most_splits = std::max(a.n_splits, b.n_splits); - const int16 overlap_a = int16(n_tips + 1) / 2; // avoids promotion to int - + const split_int most_splits = std::max(a.n_splits, b.n_splits); + const split_int overlap_a = (n_tips + 1) / 2; + constexpr cost max_score = BIG; - const double lg2_unrooted_n = lg2_unrooted[n_tips]; + const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); const double max_possible = lg2_unrooted_n - best_overlap; const double score_over_possible = max_score / max_possible; const double possible_over_score = max_possible / max_score; - + // a and b are "clades" separating an "ingroup" [1] from an "outgroup" [0]. // In/out direction [i.e. 1/0 bit] is arbitrary. cost_matrix score(most_splits); - - for (int16 ai = a.n_splits; ai--; ) { + + for (split_int ai = a.n_splits; ai--; ) { if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (int16 bi = b.n_splits; bi--; ) { + for (split_int bi = b.n_splits; bi--; ) { const double spi_over = TreeDist::spi_overlap( a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], a.n_bins); - + score(ai, bi) = spi_over == 0 ? max_score : cost((spi_over - best_overlap) * score_over_possible); } score.padRowAfterCol(ai, b.n_splits, max_score); } score.padAfterRow(a.n_splits, max_score); - + std::vector rowsol; std::vector colsol; - + resize_uninitialized(rowsol, most_splits); resize_uninitialized(colsol, most_splits); - - + const double final_score = static_cast( (max_score * most_splits) - lap(most_splits, score, rowsol, colsol)) * possible_over_score; - + std::vector final_matching; final_matching.reserve(a.n_splits); - - for (int16 i = 0; i < a.n_splits; ++i) { + + for (split_int i = 0; i < a.n_splits; ++i) { const int match = (rowsol[i] < b.n_splits) ? static_cast(rowsol[i]) + 1 : NA_INTEGER; final_matching.push_back(match); } - + return List::create(Named("score") = final_score, _["matching"] = final_matching); } diff --git a/src/tree_distances.h b/src/tree_distances.h index 1b24c2b30..b046e0038 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -5,12 +5,15 @@ #include #include -/***** Re-export shared lookup tables to global scope *****/ +/***** Re-export shared lookup tables and fallback functions to global scope *****/ using TreeDist::lg2; using TreeDist::lg2_double_factorial; using TreeDist::lg2_unrooted; using TreeDist::lg2_rooted; +using TreeDist::lg2_lookup; +using TreeDist::lg2_unrooted_lookup; +using TreeDist::lg2_rooted_lookup; /***** Constants *****/ @@ -23,85 +26,88 @@ namespace TreeDist { void check_ntip(int32 n); // Re-exported from mutual_clustering.h: - // ic_matching(int16 a, int16 b, int16 n) + // ic_matching(split_int a, split_int b, split_int n) // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. - inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, - const int16 nK, const int16 n_tips, + // Parameters are split_int (int32) to handle n_tips up to 32767 safely; + // products nkK*n_tips and nk*nK fit int32 since both factors <= 32767. + inline void add_ic_element(double& ic_sum, const split_int nkK, const split_int nk, + const split_int nK, const split_int n_tips, const double lg2_n) noexcept { if (nkK && nk && nK) { assert(!(nkK == nk && nkK == nK && nkK << 1 == n_tips)); const int32 numerator = nkK * n_tips; const int32 denominator = nk * nK; if (numerator != denominator) { - ic_sum += nkK * (lg2[nkK] + lg2_n - lg2[nk] - lg2[nK]); + ic_sum += nkK * (lg2_lookup(nkK) + lg2_n - lg2_lookup(nk) - lg2_lookup(nK)); } } } // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y) - [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept { + [[nodiscard]] inline double mmsi_pair_score(const split_int x, const split_int y) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); - - return lg2_unrooted[x] - (lg2_rooted[y] + lg2_rooted[x - y]); + + return lg2_unrooted_lookup(x) - (lg2_rooted_lookup(y) + lg2_rooted_lookup(x - y)); } - [[nodiscard]] inline double mmsi_score(const int16 n_same, const int16 n_a_and_b, - const int16 n_different, const int16 n_a_only) noexcept { + [[nodiscard]] inline double mmsi_score(const split_int n_same, const split_int n_a_and_b, + const split_int n_different, const split_int n_a_only) noexcept { if (n_same == 0 || n_same == n_a_and_b) return mmsi_pair_score(n_different, n_a_only); if (n_different == 0 || n_different == n_a_only) return mmsi_pair_score(n_same, n_a_and_b); - + const double score1 = mmsi_pair_score(n_same, n_a_and_b), score2 = mmsi_pair_score(n_different, n_a_only); - + return (score1 > score2) ? score1 : score2; } -[[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { + [[nodiscard]] inline double one_overlap(const split_int a, const split_int b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); if (a == b) { - return lg2_rooted[a] + lg2_rooted[n - a]; + return lg2_rooted_lookup(a) + lg2_rooted_lookup(n - a); } // Unify ab via lo/hi: removes an unpredictable branch. - const int16 lo = (a < b) ? a : b; - const int16 hi = (a < b) ? b : a; - return lg2_rooted[hi] + lg2_rooted[n - lo] - lg2_rooted[hi - lo + 1]; + const split_int lo = (a < b) ? a : b; + const split_int hi = (a < b) ? b : a; + return lg2_rooted_lookup(hi) + lg2_rooted_lookup(n - lo) - lg2_rooted_lookup(hi - lo + 1); } - - [[nodiscard]] inline double one_overlap_notb(const int16 a, const int16 n_minus_b, const int16 n) noexcept { + + [[nodiscard]] inline double one_overlap_notb(const split_int a, const split_int n_minus_b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); - const int16 b = n - n_minus_b; + const split_int b = n - n_minus_b; if (a == b) { - return lg2_rooted[b] + lg2_rooted[n_minus_b]; + return lg2_rooted_lookup(b) + lg2_rooted_lookup(n_minus_b); } else if (a < b) { - return lg2_rooted[b] + lg2_rooted[n - a] - lg2_rooted[b - a + 1]; + return lg2_rooted_lookup(b) + lg2_rooted_lookup(n - a) - lg2_rooted_lookup(b - a + 1); } else { - return lg2_rooted[a] + lg2_rooted[n_minus_b] - lg2_rooted[a - b + 1]; + return lg2_rooted_lookup(a) + lg2_rooted_lookup(n_minus_b) - lg2_rooted_lookup(a - b + 1); } } -// Popcount-based: single pass over bins replaces 4 sequential boolean scans. + // Popcount-based: single pass over bins replaces 4 sequential boolean scans. // Counts n_ab = |A ∩ B| via hardware POPCNT, then derives all 4 Venn-diagram // region populations from arithmetic on n_ab, in_a, in_b, n_tips. + // split_int used throughout so in_a + in_b does not overflow (max ~65534 for 32767-tip trees). [[nodiscard]] inline double spi_overlap(const splitbit* a_state, const splitbit* b_state, - const int16 n_tips, const int16 in_a, - const int16 in_b, const int16 n_bins) noexcept { + const split_int n_tips, const split_int in_a, + const split_int in_b, const split_int n_bins) noexcept { static_assert(SL_MAX_BINS <= INT16_MAX, "int16 too narrow for SL_MAX_BINS"); - int16 n_ab = 0; - for (int16 bin = 0; bin < n_bins; ++bin) { + split_int n_ab = 0; + for (split_int bin = 0; bin < n_bins; ++bin) { n_ab += TreeTools::count_bits(a_state[bin] & b_state[bin]); } diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 4a7eff99a..14b9660ef 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -48,12 +48,15 @@ test_that("Tip-count guard is applied consistently", { "Trees with 327.. tips exceed the compiled limit of 2048" } expect_error(.CheckMaxTips(32705L), errMsg) - + + # Direct C++ calls bypass the R guard and hit check_ntip() whose message + # is always "not yet supported (maximum 32767)". + cppErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" splits8 <- unclass(as.Splits(BalancedTree(8))) expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), - errMsg) + cppErrMsg) expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), - errMsg) + cppErrMsg) trees <- list(BalancedTree(8), PectinateTree(8)) class(trees) <- "multiPhylo" From ff8fa8889d8598e0956f3066bdebed7518690dec Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 21:33:11 +0100 Subject: [PATCH 03/22] R soft cap: accept up to 32767 tips when TreeTools >= 2.3.0 - zzz.R: cache .TT_HAS_HEAP_FALLBACK flag at .onLoad() via packageVersion("TreeTools") >= "2.3.0" - .CheckMaxTips(): when heap fallback is available, accept nTip up to 32767 (TREEDIST_MAX_TIPS) rather than the compiled SL_MAX_TIPS (2048); old behavior retained for TT < 2.3.0 users - .NNIDistSingle(): replace hardcoded if (nTip > 32768L) with .CheckMaxTips(nTip) to route through the centralised guard - Tests: update NNI and utilities error-message patterns to match new guard messages; add TT-version-conditional branches in utilities test Co-Authored-By: Claude Sonnet 4.6 --- R/tree_distance_nni.R | 4 +-- R/tree_distance_utilities.R | 14 +++++++--- R/zzz.R | 5 +++- tests/testthat/test-tree_distance_nni.R | 5 ++-- tests/testthat/test-tree_distance_utilities.R | 27 ++++++++++++------- 5 files changed, 37 insertions(+), 18 deletions(-) diff --git a/R/tree_distance_nni.R b/R/tree_distance_nni.R index 458547647..d47d4ac66 100644 --- a/R/tree_distance_nni.R +++ b/R/tree_distance_nni.R @@ -73,9 +73,7 @@ NNIDist <- function(tree1, tree2 = tree1) { #' @importFrom TreeTools Postorder RenumberTips #' @importFrom ape Nnode.phylo .NNIDistSingle <- function(tree1, tree2, nTip, ...) { - if (nTip > 32768L) { - stop("Cannot calculate NNI distance for trees with so many tips.") - } + .CheckMaxTips(nTip) if (nrow(tree1[["edge"]]) != nrow(tree2[["edge"]])) { stop("Both trees must have the same number of edges. ", "Is one rooted and the other unrooted?") diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index b85fab32a..8273f9252 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -1,12 +1,20 @@ -# Validate that nTip does not exceed the compiled SL_MAX_TIPS limit. +# Validate that nTip does not exceed the supported tip-count ceiling. +# When TreeTools >= 2.3.0 provides heap-allocated split storage, accept up +# to 32767 tips (TREEDIST_MAX_TIPS); otherwise cap at the compiled SL_MAX_TIPS. # Called from every distance entry point before any C++ work. .CheckMaxTips <- function(nTip) { - if (!is.na(nTip) && nTip > .SL_MAX_TIPS) { + if (is.na(nTip)) return(invisible(NULL)) + if (.TT_HAS_HEAP_FALLBACK) { + if (nTip > 32767L) { + stop("Trees with ", nTip, + " tips are not yet supported (maximum 32767).") + } + } else if (nTip > .SL_MAX_TIPS) { if (.SL_MAX_TIPS < 32704L) { stop( "Trees with ", nTip, " tips exceed the compiled limit of ", .SL_MAX_TIPS, " tips.", - "\nUpdate TreeTools and reinstall TreeDist to support more tips." + "\nUpdate TreeTools and reinstall TreeDist to support more tips." ) } stop("Trees with ", nTip, " tips are not yet supported (maximum ", diff --git a/R/zzz.R b/R/zzz.R index 614ebbda3..ea27b2786 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,7 +1,10 @@ -.SL_MAX_TIPS <- NULL # populated in .onLoad +.SL_MAX_TIPS <- NULL # populated in .onLoad +.TT_HAS_HEAP_FALLBACK <- FALSE # TRUE when TreeTools >= 2.3.0 .onLoad <- function(libname, pkgname) { .SL_MAX_TIPS <<- cpp_sl_max_tips() + .TT_HAS_HEAP_FALLBACK <<- + utils::packageVersion("TreeTools") >= "2.3.0" } .onUnload <- function(libpath) { diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index 20bcda232..5682311c1 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -9,7 +9,8 @@ test_that("NNIDist() handles exceptions", { PectinateTree(as.character(1:8)))), "trees must bear identical labels") # R-level guard catches too-many-tips - expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "so many tips") + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), + "not yet supported") expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) @@ -19,7 +20,7 @@ test_that("NNIDist() at NNI_MAX_TIPS", { maxTips <- 32768 more <- maxTips + 1 expect_error(.NNIDistSingle(PectinateTree(more), BalancedTree(more), more), - "so many tips") + "not yet supported") goingQuickly <- TRUE skip_if(goingQuickly) diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 14b9660ef..0bc4d2b93 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -38,16 +38,25 @@ test_that(".SL_MAX_TIPS is populated", { test_that("Tip-count guard is applied consistently", { expect_no_error(.CheckMaxTips(1000L)) - - limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L - if (limit32704) expect_no_error(.CheckMaxTips(32704L)) - - errMsg <- if (limit32704) { - "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" + + ttHasHeapFallback <- utils::packageVersion("TreeTools") >= "2.3.0" + + if (ttHasHeapFallback) { + # TT >= 2.3.0: accept up to 32767 (heap-backed split storage) + expect_no_error(.CheckMaxTips(32705L)) + expect_no_error(.CheckMaxTips(32767L)) + rErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" } else { - "Trees with 327.. tips exceed the compiled limit of 2048" + limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L + if (limit32704) expect_no_error(.CheckMaxTips(32704L)) + rErrMsg <- if (limit32704) { + "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" + } else { + "Trees with 327.. tips exceed the compiled limit of 2048" + } + expect_error(.CheckMaxTips(32705L), rErrMsg) } - expect_error(.CheckMaxTips(32705L), errMsg) + expect_error(.CheckMaxTips(32768L), rErrMsg) # Direct C++ calls bypass the R guard and hit check_ntip() whose message # is always "not yet supported (maximum 32767)". @@ -62,7 +71,7 @@ test_that("Tip-count guard is applied consistently", { class(trees) <- "multiPhylo" expect_error( .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 32768L), - errMsg + rErrMsg ) }) From 8d40294a33f8060f1f36247492b82225aa2c08d7 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 21:37:09 +0100 Subject: [PATCH 04/22] Add large-tree smoke tests (Phase 4) test-large-trees.R covers trees with n_tips > 2048 (old SL_MAX_TIPS): - 4000-tip RF and IRF: non-zero for distinct topologies, zero for identical - 8000-tip RF and IRF: same checks, guarded by slowMode option - Tip-count ceiling: 32767 accepted, 32768 rejected with clear message All tests wrapped in skip_on_cran() and skip_if(SL_MAX_TIPS < 4000) so the suite stays green when linked against TreeTools < 2.3.0. Co-Authored-By: Claude Sonnet 4.6 --- tests/testthat/test-large-trees.R | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 tests/testthat/test-large-trees.R diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R new file mode 100644 index 000000000..d9f5a87f8 --- /dev/null +++ b/tests/testthat/test-large-trees.R @@ -0,0 +1,49 @@ +library("TreeTools", quietly = TRUE) + +# Tests that distance functions handle trees larger than the old SL_MAX_TIPS +# threshold (2048 in TreeTools <= 2.2.0). These require TreeTools >= 2.3.0 +# for heap-backed split storage; they skip gracefully on older builds. + +test_that("RF and IRF work for 4000-tip trees", { + skip_on_cran() + skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") + + n <- 4000 + t1 <- PectinateTree(n) + t2 <- BalancedTree(n) + + rf <- RobinsonFoulds(t1, t2)[[1]] + expect_gt(rf, 0) + expect_equal(RobinsonFoulds(t1, t1)[[1]], 0) + + irf <- InfoRobinsonFoulds(t1, t2)[[1]] + expect_gt(irf, 0) + expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) +}) + +test_that("RF and IRF work for 8000-tip trees", { + skip_on_cran() + skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") + skip_if(!getOption("slowMode", FALSE), "Only runs in slow mode") + + n <- 8000 + t1 <- PectinateTree(n) + t2 <- BalancedTree(n) + + expect_gt(RobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(RobinsonFoulds(t1, t1)[[1]], 0) + + expect_gt(InfoRobinsonFoulds(t1, t2)[[1]], 0) + expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) +}) + +test_that("Tip-count ceiling is enforced correctly", { + skip_on_cran() + skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + "Requires TreeTools >= 2.3.0") + + expect_no_error(.CheckMaxTips(32767L)) + expect_error(.CheckMaxTips(32768L), "not yet supported") +}) From f0d468fb7b5dc14339d04ff05401ddb441f33e85 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 21:37:38 +0100 Subject: [PATCH 05/22] NEWS: document large-tree support (Phase 5) Co-Authored-By: Claude Sonnet 4.6 --- NEWS.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/NEWS.md b/NEWS.md index a051898ae..fa7702840 100644 --- a/NEWS.md +++ b/NEWS.md @@ -23,6 +23,17 @@ removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant. TreeDist now supports trees of any size permitted by TreeTools. +- **Large-tree support (requires TreeTools ≥ 2.3.0):** all distance + functions now accept trees with up to 32 767 tips (previously limited + to `SL_MAX_TIPS`, 2048 with TreeTools ≤ 2.2.0). The R-level tip-count + guard (`.CheckMaxTips()`) detects the TreeTools version at load time and + unlocks the higher ceiling automatically; no code changes are needed. + All integer counters in the C++ hot paths have been widened from `int16` + to `split_int` (`int32`) to handle split counts above 32 767 without + overflow. Direct `lg2[]` table accesses have been replaced with + `lg2_lookup()` fallback helpers so that trees with more tips than + `SL_MAX_TIPS` are computed correctly via `std::log2` / `std::lgamma`. + ## Performance - `RobinsonFoulds()` now uses a fast C++ batch path for cross-distance From 51305e4fb0260b52033bfcbc721b0abd300bf3a3 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 21:59:53 +0100 Subject: [PATCH 06/22] Refine version guards: remove TT_HAS_HEAP_FALLBACK; split version tests - Remove .TT_HAS_HEAP_FALLBACK and the extra <<- assignment in .onLoad(). .CheckMaxTips() now derives the heap-fallback capability directly from .SL_MAX_TIPS > 2048L: TreeTools 2.2.0 compiles SL_MAX_TIPS=2048 (no heap fallback); TreeTools 2.3.0 compiles SL_MAX_TIPS=32704 (heap-backed split storage for large trees). No separate version check or mutable flag needed. - Split version-sensitive test assertions into separate test_that blocks each guarded with the appropriate skip_if(packageVersion("TreeTools") "2.3.0"), so the right assertions run on whichever version is installed. Co-Authored-By: Claude Sonnet 4.6 --- R/tree_distance_utilities.R | 7 ++- R/zzz.R | 5 +- tests/testthat/test-tree_distance_nni.R | 26 ++++++--- tests/testthat/test-tree_distance_utilities.R | 55 +++++++++++-------- 4 files changed, 54 insertions(+), 39 deletions(-) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 8273f9252..eb5a48b34 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -1,10 +1,11 @@ # Validate that nTip does not exceed the supported tip-count ceiling. -# When TreeTools >= 2.3.0 provides heap-allocated split storage, accept up -# to 32767 tips (TREEDIST_MAX_TIPS); otherwise cap at the compiled SL_MAX_TIPS. +# .SL_MAX_TIPS > 2048L iff TreeTools >= 2.3.0 raised the stack threshold and +# provides heap-backed split storage; in that case accept up to 32767 tips +# (TREEDIST_MAX_TIPS). Otherwise cap at the compiled SL_MAX_TIPS. # Called from every distance entry point before any C++ work. .CheckMaxTips <- function(nTip) { if (is.na(nTip)) return(invisible(NULL)) - if (.TT_HAS_HEAP_FALLBACK) { + if (.SL_MAX_TIPS > 2048L) { if (nTip > 32767L) { stop("Trees with ", nTip, " tips are not yet supported (maximum 32767).") diff --git a/R/zzz.R b/R/zzz.R index ea27b2786..614ebbda3 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,10 +1,7 @@ -.SL_MAX_TIPS <- NULL # populated in .onLoad -.TT_HAS_HEAP_FALLBACK <- FALSE # TRUE when TreeTools >= 2.3.0 +.SL_MAX_TIPS <- NULL # populated in .onLoad .onLoad <- function(libname, pkgname) { .SL_MAX_TIPS <<- cpp_sl_max_tips() - .TT_HAS_HEAP_FALLBACK <<- - utils::packageVersion("TreeTools") >= "2.3.0" } .onUnload <- function(libpath) { diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index 5682311c1..ff04f6bb7 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -5,22 +5,32 @@ test_that("NNIDist() handles exceptions", { "trees must contain the same number of leaves") expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(8))), "trees must bear identical labels") - expect_error(NNIDist(list(PectinateTree(1:8), + expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(as.character(1:8)))), "trees must bear identical labels") - # R-level guard catches too-many-tips + expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) +}) + +test_that("NNIDist() rejects too many tips (TreeTools >= 2.3.0)", { + skip_if(utils::packageVersion("TreeTools") < "2.3.0", + "TreeTools < 2.3.0 emits a different too-many-tips message") expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "not yet supported") - - expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) - + expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), + "not yet supported") +}) + +test_that("NNIDist() rejects too many tips (TreeTools < 2.3.0)", { + skip_if(utils::packageVersion("TreeTools") >= "2.3.0", + "TreeTools >= 2.3.0 emits a different too-many-tips message") + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), + "exceed the compiled limit") + expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), + "exceed the compiled limit") }) test_that("NNIDist() at NNI_MAX_TIPS", { maxTips <- 32768 - more <- maxTips + 1 - expect_error(.NNIDistSingle(PectinateTree(more), BalancedTree(more), more), - "not yet supported") goingQuickly <- TRUE skip_if(goingQuickly) diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 0bc4d2b93..376d211b9 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -39,33 +39,40 @@ test_that(".SL_MAX_TIPS is populated", { test_that("Tip-count guard is applied consistently", { expect_no_error(.CheckMaxTips(1000L)) - ttHasHeapFallback <- utils::packageVersion("TreeTools") >= "2.3.0" + # Direct C++ calls bypass the R guard; check_ntip() always says "not yet supported". + cppErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" + splits8 <- unclass(as.Splits(BalancedTree(8))) + expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), cppErrMsg) + expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), cppErrMsg) +}) - if (ttHasHeapFallback) { - # TT >= 2.3.0: accept up to 32767 (heap-backed split storage) - expect_no_error(.CheckMaxTips(32705L)) - expect_no_error(.CheckMaxTips(32767L)) - rErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" - } else { - limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L - if (limit32704) expect_no_error(.CheckMaxTips(32704L)) - rErrMsg <- if (limit32704) { - "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" - } else { - "Trees with 327.. tips exceed the compiled limit of 2048" - } - expect_error(.CheckMaxTips(32705L), rErrMsg) - } +test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { + skip_if(utils::packageVersion("TreeTools") < "2.3.0", + "TreeTools < 2.3.0 caps at SL_MAX_TIPS, not 32767") + expect_no_error(.CheckMaxTips(32705L)) + expect_no_error(.CheckMaxTips(32767L)) + rErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" expect_error(.CheckMaxTips(32768L), rErrMsg) - # Direct C++ calls bypass the R guard and hit check_ntip() whose message - # is always "not yet supported (maximum 32767)". - cppErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" - splits8 <- unclass(as.Splits(BalancedTree(8))) - expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), - cppErrMsg) - expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), - cppErrMsg) + trees <- list(BalancedTree(8), PectinateTree(8)) + class(trees) <- "multiPhylo" + expect_error( + .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 32768L), + rErrMsg + ) +}) + +test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { + skip_if(utils::packageVersion("TreeTools") >= "2.3.0", + "TreeTools >= 2.3.0 uses the heap-fallback path") + limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L + if (limit32704) expect_no_error(.CheckMaxTips(32704L)) + rErrMsg <- if (limit32704) { + "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" + } else { + "Trees with 327.. tips exceed the compiled limit of 2048" + } + expect_error(.CheckMaxTips(32705L), rErrMsg) trees <- list(BalancedTree(8), PectinateTree(8)) class(trees) <- "multiPhylo" From 7ca7635ed6027aa77a8c849e7c4d46806f93a4bd Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 22:06:32 +0100 Subject: [PATCH 07/22] Remove mutable .SL_MAX_TIPS package variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the <<--assigned .SL_MAX_TIPS with direct calls to the C++ function cpp_sl_max_tips(), which returns the compile-time constant SL_MAX_TIPS in one integer return — essentially free. This eliminates: - The NULL sentinel in zzz.R - The .onLoad() hook entirely - Any risk of another package or user overwriting the value .CheckMaxTips() now reads the limit fresh from C++ each call. Tests reference TreeDist:::cpp_sl_max_tips() directly. Co-Authored-By: Claude Sonnet 4.6 --- R/tree_distance_utilities.R | 17 +++++++++-------- R/zzz.R | 6 ------ tests/testthat/test-large-trees.R | 6 +++--- tests/testthat/test-tree_distance_utilities.R | 8 ++++---- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index eb5a48b34..05f63a387 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -1,25 +1,26 @@ # Validate that nTip does not exceed the supported tip-count ceiling. -# .SL_MAX_TIPS > 2048L iff TreeTools >= 2.3.0 raised the stack threshold and -# provides heap-backed split storage; in that case accept up to 32767 tips -# (TREEDIST_MAX_TIPS). Otherwise cap at the compiled SL_MAX_TIPS. +# cpp_sl_max_tips() > 2048L iff TreeTools >= 2.3.0 raised the stack threshold +# and provides heap-backed split storage; accept up to 32767 tips in that case. +# Otherwise cap at the compiled SL_MAX_TIPS. # Called from every distance entry point before any C++ work. .CheckMaxTips <- function(nTip) { if (is.na(nTip)) return(invisible(NULL)) - if (.SL_MAX_TIPS > 2048L) { + sl_max <- cpp_sl_max_tips() + if (sl_max > 2048L) { if (nTip > 32767L) { stop("Trees with ", nTip, " tips are not yet supported (maximum 32767).") } - } else if (nTip > .SL_MAX_TIPS) { - if (.SL_MAX_TIPS < 32704L) { + } else if (nTip > sl_max) { + if (sl_max < 32704L) { stop( "Trees with ", nTip, " tips exceed the compiled limit of ", - .SL_MAX_TIPS, " tips.", + sl_max, " tips.", "\nUpdate TreeTools and reinstall TreeDist to support more tips." ) } stop("Trees with ", nTip, " tips are not yet supported (maximum ", - .SL_MAX_TIPS, ")") + sl_max, ")") } } diff --git a/R/zzz.R b/R/zzz.R index 614ebbda3..826528eaf 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,9 +1,3 @@ -.SL_MAX_TIPS <- NULL # populated in .onLoad - -.onLoad <- function(libname, pkgname) { - .SL_MAX_TIPS <<- cpp_sl_max_tips() -} - .onUnload <- function(libpath) { StopParallel(quietly = TRUE) library.dynam.unload("TreeDist", libpath) diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R index d9f5a87f8..218c05845 100644 --- a/tests/testthat/test-large-trees.R +++ b/tests/testthat/test-large-trees.R @@ -6,7 +6,7 @@ library("TreeTools", quietly = TRUE) test_that("RF and IRF work for 4000-tip trees", { skip_on_cran() - skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + skip_if(TreeDist:::cpp_sl_max_tips() < 4000, "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") n <- 4000 @@ -24,7 +24,7 @@ test_that("RF and IRF work for 4000-tip trees", { test_that("RF and IRF work for 8000-tip trees", { skip_on_cran() - skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + skip_if(TreeDist:::cpp_sl_max_tips() < 4000, "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") skip_if(!getOption("slowMode", FALSE), "Only runs in slow mode") @@ -41,7 +41,7 @@ test_that("RF and IRF work for 8000-tip trees", { test_that("Tip-count ceiling is enforced correctly", { skip_on_cran() - skip_if(TreeDist:::.SL_MAX_TIPS < 4000, + skip_if(TreeDist:::cpp_sl_max_tips() < 4000, "Requires TreeTools >= 2.3.0") expect_no_error(.CheckMaxTips(32767L)) diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 376d211b9..bf4320776 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -31,9 +31,9 @@ test_that("CalculateTreeDistance() errs appropriately", { expect_error(CalculateTreeDistance(RobinsonFouldsSplits, BalancedTree(8), "Not a tree")) }) -test_that(".SL_MAX_TIPS is populated", { - expect_true(is.integer(TreeDist:::.SL_MAX_TIPS)) - expect_true(TreeDist:::.SL_MAX_TIPS >= 2048L) +test_that("cpp_sl_max_tips() returns a valid value", { + expect_true(is.integer(TreeDist:::cpp_sl_max_tips())) + expect_true(TreeDist:::cpp_sl_max_tips() >= 2048L) }) test_that("Tip-count guard is applied consistently", { @@ -65,7 +65,7 @@ test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { skip_if(utils::packageVersion("TreeTools") >= "2.3.0", "TreeTools >= 2.3.0 uses the heap-fallback path") - limit32704 <- TreeDist:::.SL_MAX_TIPS == 32704L + limit32704 <- TreeDist:::cpp_sl_max_tips() == 32704L if (limit32704) expect_no_error(.CheckMaxTips(32704L)) rErrMsg <- if (limit32704) { "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" From 2567910aabe9b5ce8ef531c5ae501ac9a69271d5 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Tue, 5 May 2026 22:20:49 +0100 Subject: [PATCH 08/22] Gate version-split tests on cpp_sl_max_tips(), not packageVersion() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit packageVersion("TreeTools") reflects the installed R package, but test behaviour depends on which headers TreeDist was compiled against — a mismatch that arises when the binary is compiled against TT 2.3.0 headers but a TT 2.2.0 R package is subsequently loaded. cpp_sl_max_tips() returns the SL_MAX_TIPS value baked into the TreeDist binary at compile time (2048 for TT 2.2.0, 32704 for TT 2.3.0), so it is the correct discriminant for skip_if guards. Co-Authored-By: Claude Sonnet 4.6 --- tests/testthat/test-tree_distance_nni.R | 8 ++++---- tests/testthat/test-tree_distance_utilities.R | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index ff04f6bb7..2259f4e82 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -12,8 +12,8 @@ test_that("NNIDist() handles exceptions", { }) test_that("NNIDist() rejects too many tips (TreeTools >= 2.3.0)", { - skip_if(utils::packageVersion("TreeTools") < "2.3.0", - "TreeTools < 2.3.0 emits a different too-many-tips message") + skip_if(TreeDist:::cpp_sl_max_tips() <= 2048L, + "Compiled against TreeTools < 2.3.0 (SL_MAX_TIPS = 2048)") expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "not yet supported") expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), @@ -21,8 +21,8 @@ test_that("NNIDist() rejects too many tips (TreeTools >= 2.3.0)", { }) test_that("NNIDist() rejects too many tips (TreeTools < 2.3.0)", { - skip_if(utils::packageVersion("TreeTools") >= "2.3.0", - "TreeTools >= 2.3.0 emits a different too-many-tips message") + skip_if(TreeDist:::cpp_sl_max_tips() > 2048L, + "Compiled against TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "exceed the compiled limit") expect_error(.NNIDistSingle(PectinateTree(32769), BalancedTree(32769), 32769L), diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index bf4320776..63f353df9 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -47,8 +47,8 @@ test_that("Tip-count guard is applied consistently", { }) test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { - skip_if(utils::packageVersion("TreeTools") < "2.3.0", - "TreeTools < 2.3.0 caps at SL_MAX_TIPS, not 32767") + skip_if(TreeDist:::cpp_sl_max_tips() <= 2048L, + "Compiled against TreeTools < 2.3.0 (SL_MAX_TIPS = 2048)") expect_no_error(.CheckMaxTips(32705L)) expect_no_error(.CheckMaxTips(32767L)) rErrMsg <- "Trees with 327.. tips are not yet supported \\(maximum 32767\\)" @@ -63,8 +63,8 @@ test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { }) test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { - skip_if(utils::packageVersion("TreeTools") >= "2.3.0", - "TreeTools >= 2.3.0 uses the heap-fallback path") + skip_if(TreeDist:::cpp_sl_max_tips() > 2048L, + "Compiled against TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") limit32704 <- TreeDist:::cpp_sl_max_tips() == 32704L if (limit32704) expect_no_error(.CheckMaxTips(32704L)) rErrMsg <- if (limit32704) { From 3c5be5784683510cd05cb05ba3d3f835f42055f1 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 07:08:12 +0100 Subject: [PATCH 09/22] Update memcheck flow --- DESCRIPTION | 2 +- inst/_pkgdown.yml | 1 + memcheck/tests.R | 2 +- memcheck/vignettes.R | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 504ffdb44..384985635 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -87,7 +87,7 @@ VignetteBuilder: knitr Config/Needs/app/optional: uwot Config/Needs/check: rcmdcheck Config/Needs/coverage: covr -Config/Needs/memcheck: devtools +Config/Needs/memcheck: pkgdown, testthat Config/Needs/metadata: codemetar Config/Needs/revdeps: revdepcheck Config/Needs/website: openssl, pkgdown, remotes, shinylive diff --git a/inst/_pkgdown.yml b/inst/_pkgdown.yml index 2922547e5..664ef8e6e 100644 --- a/inst/_pkgdown.yml +++ b/inst/_pkgdown.yml @@ -85,6 +85,7 @@ navbar: href: news/index.html github: icon: fa-github fa-lg + aria-label: View this repo on GitHub href: https://github.com/ms609/TreeDist articles: diff --git a/memcheck/tests.R b/memcheck/tests.R index e85bebd2c..fc5edd2f5 100644 --- a/memcheck/tests.R +++ b/memcheck/tests.R @@ -1,4 +1,4 @@ # Code to be run with # R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R library("TreeDist") -devtools::test() +testthat::test_dir() diff --git a/memcheck/vignettes.R b/memcheck/vignettes.R index 0c02f7161..9c2ac44c0 100644 --- a/memcheck/vignettes.R +++ b/memcheck/vignettes.R @@ -1,3 +1,3 @@ # Code to be run with # R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R -devtools::build_vignettes(install = FALSE) +pkgdown::build_articles(preview = FALSE) From 7b530a5cd11a0314c2eccfebdd42b32e34e6d214 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 07:08:35 +0100 Subject: [PATCH 10/22] Skip if no TBRDist --- tests/testthat/test-tree_distance_spr.R | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/testthat/test-tree_distance_spr.R b/tests/testthat/test-tree_distance_spr.R index d97c02631..ff0a3ee81 100644 --- a/tests/testthat/test-tree_distance_spr.R +++ b/tests/testthat/test-tree_distance_spr.R @@ -115,6 +115,7 @@ test_that("SPR shortcuts okay - exhaustive", { test_that("SPR shortcuts okay - known answer", { skip_if_not(getOption("slowMode", FALSE)) + skip_if_not_installed("TBRDist") library("TreeTools", quietly = TRUE) set.seed(0) trees <- lapply(1:8, function(XX) RandomTree(9, root = TRUE)) From 07096998db55ca4bdf5ab803a349b32ceee2830c Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 07:18:00 +0100 Subject: [PATCH 11/22] Cover uncovered lines; exclude large-tree fallbacks from coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mutual_clustering.h: the lg2_lookup/lg2_unrooted_lookup/lg2_rooted_lookup fallback paths (n > SL_MAX_TIPS) only execute on trees larger than the compile-time stack threshold, which the large-tree smoke tests cover but skip_on_cran hides from CI coverage. Add LCOV_EXCL_START/STOP to exclude these legitimately-tested-but-not-in-CI lines rather than leaving them as false coverage gaps. .CheckMaxTips(): remove the dead inner if (sl_max < 32704L) guard — the else-if branch fires only when sl_max <= 2048 (TT < 2.3.0), which already guarantees sl_max < 32704. Add a local_mocked_bindings test that exercises the TT < 2.3.0 code path without requiring an actual TT 2.2.0 install. Co-Authored-By: Claude Sonnet 4.6 --- R/tree_distance_utilities.R | 15 +++++------- inst/include/TreeDist/mutual_clustering.h | 10 +++++--- tests/testthat/test-tree_distance_utilities.R | 24 +++++++++++++------ 3 files changed, 30 insertions(+), 19 deletions(-) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 05f63a387..84b19e2d3 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -12,15 +12,12 @@ " tips are not yet supported (maximum 32767).") } } else if (nTip > sl_max) { - if (sl_max < 32704L) { - stop( - "Trees with ", nTip, " tips exceed the compiled limit of ", - sl_max, " tips.", - "\nUpdate TreeTools and reinstall TreeDist to support more tips." - ) - } - stop("Trees with ", nTip, " tips are not yet supported (maximum ", - sl_max, ")") + # else-if fires only when sl_max <= 2048 (TreeTools < 2.3.0) + stop( + "Trees with ", nTip, " tips exceed the compiled limit of ", + sl_max, " tips.", + "\nUpdate TreeTools and reinstall TreeDist to support more tips." + ) } } diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h index 1f592042e..123fad306 100644 --- a/inst/include/TreeDist/mutual_clustering.h +++ b/inst/include/TreeDist/mutual_clustering.h @@ -40,7 +40,7 @@ namespace TreeDist { if (x <= static_cast(SL_MAX_TIPS)) { return lg2[x]; } - return std::log2(static_cast(x)); + return std::log2(static_cast(x)); // LCOV_EXCL_LINE } // log2((2n-5)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. @@ -49,10 +49,12 @@ namespace TreeDist { if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { return lg2_unrooted[n_tips]; } - if (n_tips < 3) return 0.0; // LCOV_EXCL_LINE + // LCOV_EXCL_START + if (n_tips < 3) return 0.0; const double n = static_cast(n_tips); return (std::lgamma(2.0 * n - 3.0) - std::lgamma(n - 1.0)) / std::log(2.0) - (n - 2.0); + // LCOV_EXCL_STOP } // log2((2n-3)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. @@ -60,10 +62,12 @@ namespace TreeDist { if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { return lg2_rooted[n_tips]; } - if (n_tips < 2) return 0.0; // LCOV_EXCL_LINE + // LCOV_EXCL_START + if (n_tips < 2) return 0.0; const double n = static_cast(n_tips); return (std::lgamma(2.0 * n - 1.0) - std::lgamma(n)) / std::log(2.0) - (n - 1.0); + // LCOV_EXCL_STOP } // ---- Inline helpers ---- diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 63f353df9..3579e0727 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -65,13 +65,7 @@ test_that("Tip-count guard accepts up to 32767 tips (TreeTools >= 2.3.0)", { test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { skip_if(TreeDist:::cpp_sl_max_tips() > 2048L, "Compiled against TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") - limit32704 <- TreeDist:::cpp_sl_max_tips() == 32704L - if (limit32704) expect_no_error(.CheckMaxTips(32704L)) - rErrMsg <- if (limit32704) { - "Trees with 327.. tips are not yet supported \\(maximum 32704\\)" - } else { - "Trees with 327.. tips exceed the compiled limit of 2048" - } + rErrMsg <- "Trees with 327.. tips exceed the compiled limit of 2048" expect_error(.CheckMaxTips(32705L), rErrMsg) trees <- list(BalancedTree(8), PectinateTree(8)) @@ -82,6 +76,22 @@ test_that("Tip-count guard caps at SL_MAX_TIPS (TreeTools < 2.3.0)", { ) }) +test_that("Tip-count guard: TT < 2.3.0 code path (mocked)", { + local_mocked_bindings( + cpp_sl_max_tips = function() 2048L, + .package = "TreeDist" + ) + expect_no_error(.CheckMaxTips(100L)) + expect_error(.CheckMaxTips(3000L), + "Trees with 3000 tips exceed the compiled limit of 2048") + trees <- list(BalancedTree(8), PectinateTree(8)) + class(trees) <- "multiPhylo" + expect_error( + .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 3000L), + "Trees with 3000 tips exceed the compiled limit of 2048" + ) +}) + test_that("CalculateTreeDistance() handles splits appropriately", { set.seed(101) From d461fb8698e22d48b7703a101e2380a8ee175a94 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 07:18:33 +0100 Subject: [PATCH 12/22] test_local() --- memcheck/tests.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/memcheck/tests.R b/memcheck/tests.R index fc5edd2f5..14c7388aa 100644 --- a/memcheck/tests.R +++ b/memcheck/tests.R @@ -1,4 +1,3 @@ # Code to be run with # R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R -library("TreeDist") -testthat::test_dir() +testthat::test_local() From 97eda24ec96cee3cae1661d6fa6248d790a00e76 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 08:01:05 +0100 Subject: [PATCH 13/22] rm obselete file --- tests/testthat/test-real-data.R | 32 -------------------------------- 1 file changed, 32 deletions(-) delete mode 100644 tests/testthat/test-real-data.R diff --git a/tests/testthat/test-real-data.R b/tests/testthat/test-real-data.R deleted file mode 100644 index d34bbc89a..000000000 --- a/tests/testthat/test-real-data.R +++ /dev/null @@ -1,32 +0,0 @@ -#if (requireNamespace("rwty", quietly = TRUE)) { -# test_that("Fungi MSID/PID differ", { -# data("fungus", package = "rwty") -# trees <- fungus[[1]]$trees -# -# expect_equal(PhylogeneticInfoDistance(trees[[1]], trees[[2]]), -# PhylogeneticInfoDistance(trees[1:2])[1]) -# expect_equal(MatchingSplitInfoDistance(trees[[1]], trees[[2]]), -# MatchingSplitInfoDistance(trees[1:2])[1]) -# expect_false(identical(PhylogeneticInfoDistance(trees[1:4]), -# MatchingSplitInfoDistance(trees[1:4]))) -# }) -# -# if (requireNamespace("whalehead", quietly = TRUE)) { -# test_that("Whalehead MSID/PID differ", { -# whaleRoot <- system.file("Data", package = "whalehead", mustWork = TRUE) -# whaleRoot <- "../ProjectWhalehead/Data" -# treeFiles <- list.files(path = paste0(whaleRoot, "/CombinedTrees"), -# pattern = "*.nex", -# full.names = TRUE, recursive = FALSE) -# trees <- unname(rwty::load.trees(treeFiles[2], trim = 100, skip = 0)$trees) -# -# expect_equal(PhylogeneticInfoDistance(trees[[1]], trees[[2]]), -# PhylogeneticInfoDistance(trees[1:2])[1]) -# expect_equal(MatchingSplitInfoDistance(trees[[1]], trees[[2]]), -# MatchingSplitInfoDistance(trees[1:2])[1]) -# expect_false(identical(PhylogeneticInfoDistance(trees[1:4]), -# MatchingSplitInfoDistance(trees[1:4]))) -# }) -# } -#} -# From 4eb6d71c34cffb9cc904e9bb9081f42955488447 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 08:13:54 +0100 Subject: [PATCH 14/22] Expand large-tree tests: add CID/DPI/MSI; use n=2049 not 4000 4000 was arbitrary; 2049 is the minimum needed to exceed TT 2.2.0's SL_MAX_TIPS ceiling of 2048. Using as.phylo(1/2, n) generates adjacent similar trees whose shared splits hit the O(n log n) exact-match fast path, keeping the test quick. Add ClusteringInfoDistance and DifferentPhylogeneticInfo and MatchingSplitInfoDistance smoke tests alongside the existing RF/IRF ones. Add an explicit correct-value test for ClusteringInfoDistance: the individual-pair path and the all-pairs OpenMP batch path are independent implementations; their agreement for n > SL_MAX_TIPS confirms the result is not merely non-zero but actually correct. Co-Authored-By: Claude Sonnet 4.6 --- tests/testthat/test-large-trees.R | 52 ++++++++++++++++++++++--------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R index 218c05845..5dae14840 100644 --- a/tests/testthat/test-large-trees.R +++ b/tests/testthat/test-large-trees.R @@ -4,28 +4,52 @@ library("TreeTools", quietly = TRUE) # threshold (2048 in TreeTools <= 2.2.0). These require TreeTools >= 2.3.0 # for heap-backed split storage; they skip gracefully on older builds. -test_that("RF and IRF work for 4000-tip trees", { +test_that("Distance functions handle trees exceeding TT 2.2.0 limit", { skip_on_cran() - skip_if(TreeDist:::cpp_sl_max_tips() < 4000, - "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") + n <- 2049L + skip_if(TreeDist:::cpp_sl_max_tips() < n, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") - n <- 4000 - t1 <- PectinateTree(n) - t2 <- BalancedTree(n) + t1 <- as.phylo(1, n) + t2 <- as.phylo(2, n) - rf <- RobinsonFoulds(t1, t2)[[1]] - expect_gt(rf, 0) + expect_gt(RobinsonFoulds(t1, t2)[[1]], 0) expect_equal(RobinsonFoulds(t1, t1)[[1]], 0) - irf <- InfoRobinsonFoulds(t1, t2)[[1]] - expect_gt(irf, 0) + expect_gt(InfoRobinsonFoulds(t1, t2)[[1]], 0) expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) + + expect_gt(ClusteringInfoDistance(t1, t2)[[1]], 0) + expect_equal(ClusteringInfoDistance(t1, t1)[[1]], 0) + + expect_gt(DifferentPhylogeneticInfo(t1, t2)[[1]], 0) + expect_equal(DifferentPhylogeneticInfo(t1, t1)[[1]], 0) + + expect_gt(MatchingSplitInfoDistance(t1, t2)[[1]], 0) + expect_equal(MatchingSplitInfoDistance(t1, t1)[[1]], 0) +}) + +test_that("ClusteringInfoDistance returns correct value for n > SL_MAX_TIPS", { + skip_on_cran() + n <- 2049L + skip_if(TreeDist:::cpp_sl_max_tips() < n, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") + + t1 <- as.phylo(1, n) + t2 <- as.phylo(2, n) + + # The individual-pair path and the all-pairs OpenMP batch path are + # independent implementations; agreement confirms the correct value. + expect_equal( + ClusteringInfoDistance(t1, t2)[[1]], + as.matrix(ClusteringInfoDistance(list(t1, t2)))[1, 2] + ) }) test_that("RF and IRF work for 8000-tip trees", { skip_on_cran() - skip_if(TreeDist:::cpp_sl_max_tips() < 4000, - "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS >= 4000)") + skip_if(TreeDist:::cpp_sl_max_tips() < 2049L, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") skip_if(!getOption("slowMode", FALSE), "Only runs in slow mode") n <- 8000 @@ -41,8 +65,8 @@ test_that("RF and IRF work for 8000-tip trees", { test_that("Tip-count ceiling is enforced correctly", { skip_on_cran() - skip_if(TreeDist:::cpp_sl_max_tips() < 4000, - "Requires TreeTools >= 2.3.0") + skip_if(TreeDist:::cpp_sl_max_tips() < 2049L, + "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") expect_no_error(.CheckMaxTips(32767L)) expect_error(.CheckMaxTips(32768L), "not yet supported") From ab32955b910c0a6618f11f68f57adc3ed64304c0 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 10:50:14 +0100 Subject: [PATCH 15/22] roxygen v8 --- DESCRIPTION | 2 +- man/HierarchicalMutualInfo.Rd | 22 +++++++++++----------- man/Islands.Rd | 12 ++++++------ man/JaccardRobinsonFoulds.Rd | 22 +++++++++++----------- man/KMeansPP.Rd | 2 +- man/KendallColijn.Rd | 22 +++++++++++----------- man/MASTSize.Rd | 22 +++++++++++----------- man/MCITree.Rd | 4 ++-- man/MSTSegments.Rd | 12 ++++++------ man/MapTrees.Rd | 12 ++++++------ man/MappingQuality.Rd | 12 ++++++------ man/MatchingSplitDistance.Rd | 22 +++++++++++----------- man/NNIDist.Rd | 22 +++++++++++----------- man/NyeSimilarity.Rd | 22 +++++++++++----------- man/PathDist.Rd | 22 +++++++++++----------- man/Robinson-Foulds.Rd | 24 ++++++++++++------------ man/SPRDist.Rd | 22 +++++++++++----------- man/SpectralEigens.Rd | 12 ++++++------ man/SplitEntropy.Rd | 4 ++-- man/SplitSharedInformation.Rd | 4 ++-- man/TransferConsensus.Rd | 4 ++-- man/TransferDist.Rd | 22 +++++++++++----------- man/TreeDist-package.Rd | 5 +++++ man/TreeDistance.Rd | 22 +++++++++++----------- man/TreeInfo.Rd | 6 +++--- man/cluster-statistics.Rd | 20 ++++++++++---------- man/median.multiPhylo.Rd | 12 ++++++------ 27 files changed, 197 insertions(+), 192 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 384985635..2472c834c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -91,6 +91,7 @@ Config/Needs/memcheck: pkgdown, testthat Config/Needs/metadata: codemetar Config/Needs/revdeps: revdepcheck Config/Needs/website: openssl, pkgdown, remotes, shinylive +Config/roxygen2/version: 8.0.0 Config/testthat/parallel: false Config/testthat/edition: 3 SystemRequirements: C++17, pandoc-citeproc @@ -98,5 +99,4 @@ ByteCompile: true Encoding: UTF-8 Language: en-GB X-schema.org-keywords: phylogenetics, tree-distance -RoxygenNote: 7.3.3 Roxygen: list(markdown = TRUE) diff --git a/man/HierarchicalMutualInfo.Rd b/man/HierarchicalMutualInfo.Rd index 193b18ba8..0da332475 100644 --- a/man/HierarchicalMutualInfo.Rd +++ b/man/HierarchicalMutualInfo.Rd @@ -124,17 +124,17 @@ AHMI(tree1, tree2, precision = 0.1) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \concept{tree distances} diff --git a/man/Islands.Rd b/man/Islands.Rd index 41c24d1ba..9bfcfb4ab 100644 --- a/man/Islands.Rd +++ b/man/Islands.Rd @@ -64,13 +64,13 @@ par(oPar) \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/JaccardRobinsonFoulds.Rd b/man/JaccardRobinsonFoulds.Rd index 7e515be7f..ad70a60fd 100644 --- a/man/JaccardRobinsonFoulds.Rd +++ b/man/JaccardRobinsonFoulds.Rd @@ -126,18 +126,18 @@ VisualizeMatching(JRF2, tree1, tree2, matchZeros = FALSE) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/KMeansPP.Rd b/man/KMeansPP.Rd index e279beb37..cab7aebe2 100644 --- a/man/KMeansPP.Rd +++ b/man/KMeansPP.Rd @@ -45,7 +45,7 @@ plot(x, col = plusters$cluster, pch = rep(15:19, each = 5)) \seealso{ \code{\link[stats]{kmeans}} -Other cluster functions: +Other cluster functions: \code{\link{cluster-statistics}} } \author{ diff --git a/man/KendallColijn.Rd b/man/KendallColijn.Rd index 57d017720..8bb666b02 100644 --- a/man/KendallColijn.Rd +++ b/man/KendallColijn.Rd @@ -125,18 +125,18 @@ KCDiameter(trees) is a more sophisticated, if more cumbersome, implementation that supports lambda > 0, i.e. use of edge lengths in tree comparison. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MASTSize.Rd b/man/MASTSize.Rd index db028b0a0..a50bf2d1d 100644 --- a/man/MASTSize.Rd +++ b/man/MASTSize.Rd @@ -62,18 +62,18 @@ CompareAll(as.phylo(0:4, 8), MASTInfo) \code{\link[phangorn:mast]{phangorn::mast()}}, a slower implementation that also lists the leaves contained within the subtree. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MCITree.Rd b/man/MCITree.Rd index 8a119ba01..18528baba 100644 --- a/man/MCITree.Rd +++ b/man/MCITree.Rd @@ -47,8 +47,8 @@ SplitwiseInfo(mcc) } \seealso{ -Other summary trees: -\code{\link{TransferConsensus}()} +Other summary trees: +\code{\link[=TransferConsensus]{TransferConsensus()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MSTSegments.Rd b/man/MSTSegments.Rd index 1ac773903..0b3c5186f 100644 --- a/man/MSTSegments.Rd +++ b/man/MSTSegments.Rd @@ -70,13 +70,13 @@ PlotTools::SpectrumLegend( \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MapTrees.Rd b/man/MapTrees.Rd index 598908402..69f9ed6d2 100644 --- a/man/MapTrees.Rd +++ b/man/MapTrees.Rd @@ -110,13 +110,13 @@ this application. The application itself can be cited using Full detail of tree space analysis in R is provided in the accompanying \href{https://ms609.github.io/TreeDist/articles/treespace.html}{vignette}. -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MappingQuality.Rd b/man/MappingQuality.Rd index 05751dfb6..e9c77c05b 100644 --- a/man/MappingQuality.Rd +++ b/man/MappingQuality.Rd @@ -41,13 +41,13 @@ MappingQuality(distances, dist(mapping), 4) \insertAllCited{} } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/MatchingSplitDistance.Rd b/man/MatchingSplitDistance.Rd index b5d64b270..5b1eb1248 100644 --- a/man/MatchingSplitDistance.Rd +++ b/man/MatchingSplitDistance.Rd @@ -84,18 +84,18 @@ VisualizeMatching(MatchingSplitDistance, TreeTools::BalancedTree(6), \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/NNIDist.Rd b/man/NNIDist.Rd index 5e24301bb..23ed08394 100644 --- a/man/NNIDist.Rd +++ b/man/NNIDist.Rd @@ -101,18 +101,18 @@ CompareAll(as.phylo(30:33, 8), NNIDist) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/NyeSimilarity.Rd b/man/NyeSimilarity.Rd index d984b129a..21a40aeec 100644 --- a/man/NyeSimilarity.Rd +++ b/man/NyeSimilarity.Rd @@ -122,18 +122,18 @@ NyeSimilarity(as.phylo(0:5, nTip = 8), similarity = FALSE) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/PathDist.Rd b/man/PathDist.Rd index 1192d4ce0..17a0a094e 100644 --- a/man/PathDist.Rd +++ b/man/PathDist.Rd @@ -68,18 +68,18 @@ PathDist(as.phylo(30:33, 8)) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/Robinson-Foulds.Rd b/man/Robinson-Foulds.Rd index 7510e0945..1b4c656bf 100644 --- a/man/Robinson-Foulds.Rd +++ b/man/Robinson-Foulds.Rd @@ -144,18 +144,18 @@ VisualizeMatching(InfoRobinsonFoulds, balanced7, pectinate7) \seealso{ Display paired splits: \code{\link[=VisualizeMatching]{VisualizeMatching()}} -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/SPRDist.Rd b/man/SPRDist.Rd index 5075abda8..572f54a83 100644 --- a/man/SPRDist.Rd +++ b/man/SPRDist.Rd @@ -74,18 +74,18 @@ functions \code{USPRDist()} and \code{ReplugDist()}. the \insertCite{deOliveira2008;textual}{TreeDist} algorithm but can crash when sent trees of certain formats, and tends to have a longer running time. -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{TransferDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=TransferDist]{TransferDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/SpectralEigens.Rd b/man/SpectralEigens.Rd index 16067450a..546e2b980 100644 --- a/man/SpectralEigens.Rd +++ b/man/SpectralEigens.Rd @@ -37,13 +37,13 @@ plot(eigens, pch = 15, col = clusts$cluster) plot(cmdscale(distances), pch = 15, col = clusts$cluster) } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, \code{\link{cluster-statistics}}, -\code{\link{median.multiPhylo}()} +\code{\link[=median.multiPhylo]{median.multiPhylo()}} } \author{ Adapted by MRS from script by \href{https://rpubs.com/nurakawa/spectral-clustering}{Nura Kawa} diff --git a/man/SplitEntropy.Rd b/man/SplitEntropy.Rd index 4388b2747..494dd322a 100644 --- a/man/SplitEntropy.Rd +++ b/man/SplitEntropy.Rd @@ -40,8 +40,8 @@ SplitEntropy(c(A, A, A, B, B, B), c(A, A, B, B, B, B)) \insertAllCited{} } \seealso{ -Other information functions: -\code{\link{SplitSharedInformation}()}, +Other information functions: +\code{\link[=SplitSharedInformation]{SplitSharedInformation()}}, \code{\link{TreeInfo}} } \author{ diff --git a/man/SplitSharedInformation.Rd b/man/SplitSharedInformation.Rd index 57560887e..25f1fc786 100644 --- a/man/SplitSharedInformation.Rd +++ b/man/SplitSharedInformation.Rd @@ -87,8 +87,8 @@ splits. \insertAllCited{} } \seealso{ -Other information functions: -\code{\link{SplitEntropy}()}, +Other information functions: +\code{\link[=SplitEntropy]{SplitEntropy()}}, \code{\link{TreeInfo}} } \author{ diff --git a/man/TransferConsensus.Rd b/man/TransferConsensus.Rd index 219111ab8..1c964d4d2 100644 --- a/man/TransferConsensus.Rd +++ b/man/TransferConsensus.Rd @@ -66,7 +66,7 @@ cat("Transfer consensus splits:", NSplits(tc), "\n") \insertAllCited{} } \seealso{ -Other summary trees: -\code{\link{MCITree}()} +Other summary trees: +\code{\link[=MCITree]{MCITree()}} } \concept{summary trees} diff --git a/man/TransferDist.Rd b/man/TransferDist.Rd index 2439130f7..d93610140 100644 --- a/man/TransferDist.Rd +++ b/man/TransferDist.Rd @@ -102,17 +102,17 @@ TransferDist(as.phylo(0:5, 8)) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TreeDistance}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TreeDistance]{TreeDistance()}} } \concept{tree distances} diff --git a/man/TreeDist-package.Rd b/man/TreeDist-package.Rd index 15872b542..7d4da0d5d 100644 --- a/man/TreeDist-package.Rd +++ b/man/TreeDist-package.Rd @@ -134,6 +134,11 @@ Geodesic distance \author{ \strong{Maintainer}: Martin R. Smith \email{martin.smith@durham.ac.uk} (\href{https://orcid.org/0000-0001-5660-1727}{ORCID}) [copyright holder, programmer] +Authors: +\itemize{ + \item Martin R. Smith \email{martin.smith@durham.ac.uk} (\href{https://orcid.org/0000-0001-5660-1727}{ORCID}) [copyright holder, programmer] +} + Other contributors: \itemize{ \item Roy Jonker \email{roy_jonker@magiclogic.com} (LAP algorithm) [programmer, copyright holder] diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index f081b6d3a..3bda3fed7 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -318,18 +318,18 @@ MutualClusteringInfoSplits(splits1, splits2) \insertAllCited{} } \seealso{ -Other tree distances: -\code{\link{HierarchicalMutualInfo}()}, -\code{\link{JaccardRobinsonFoulds}()}, -\code{\link{KendallColijn}()}, -\code{\link{MASTSize}()}, -\code{\link{MatchingSplitDistance}()}, -\code{\link{NNIDist}()}, -\code{\link{NyeSimilarity}()}, -\code{\link{PathDist}()}, +Other tree distances: +\code{\link[=HierarchicalMutualInfo]{HierarchicalMutualInfo()}}, +\code{\link[=JaccardRobinsonFoulds]{JaccardRobinsonFoulds()}}, +\code{\link[=KendallColijn]{KendallColijn()}}, +\code{\link[=MASTSize]{MASTSize()}}, +\code{\link[=MatchingSplitDistance]{MatchingSplitDistance()}}, +\code{\link[=NNIDist]{NNIDist()}}, +\code{\link[=NyeSimilarity]{NyeSimilarity()}}, +\code{\link[=PathDist]{PathDist()}}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()}, -\code{\link{TransferDist}()} +\code{\link[=SPRDist]{SPRDist()}}, +\code{\link[=TransferDist]{TransferDist()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/TreeInfo.Rd b/man/TreeInfo.Rd index a31234d8e..9f80d53fd 100644 --- a/man/TreeInfo.Rd +++ b/man/TreeInfo.Rd @@ -205,9 +205,9 @@ An introduction to the phylogenetic information content of a split is given in \href{https://ms609.github.io/TreeTools/reference/SplitInformation.html}{\code{SplitInformation()}} and in a \href{https://ms609.github.io/TreeDist/articles/information.html}{package vignette}. -Other information functions: -\code{\link{SplitEntropy}()}, -\code{\link{SplitSharedInformation}()} +Other information functions: +\code{\link[=SplitEntropy]{SplitEntropy()}}, +\code{\link[=SplitSharedInformation]{SplitSharedInformation()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/cluster-statistics.Rd b/man/cluster-statistics.Rd index eb9973296..e892a26d1 100644 --- a/man/cluster-statistics.Rd +++ b/man/cluster-statistics.Rd @@ -87,16 +87,16 @@ MeanNN(points, cluster) MeanMSTEdge(points, cluster) } \seealso{ -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, -\code{\link{median.multiPhylo}()} - -Other cluster functions: -\code{\link{KMeansPP}()} +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, +\code{\link[=median.multiPhylo]{median.multiPhylo()}} + +Other cluster functions: +\code{\link[=KMeansPP]{KMeansPP()}} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. Smith} diff --git a/man/median.multiPhylo.Rd b/man/median.multiPhylo.Rd index 43282cebe..ad1fe2f8f 100644 --- a/man/median.multiPhylo.Rd +++ b/man/median.multiPhylo.Rd @@ -79,12 +79,12 @@ Consensus methods: \code{\link[ape:consensus]{ape::consensus()}}, \code{\link[TreeTools:ConsensusWithout]{TreeTools::ConsensusWithout()}} -Other tree space functions: -\code{\link{Islands}()}, -\code{\link{MSTSegments}()}, -\code{\link{MapTrees}()}, -\code{\link{MappingQuality}()}, -\code{\link{SpectralEigens}()}, +Other tree space functions: +\code{\link[=Islands]{Islands()}}, +\code{\link[=MSTSegments]{MSTSegments()}}, +\code{\link[=MapTrees]{MapTrees()}}, +\code{\link[=MappingQuality]{MappingQuality()}}, +\code{\link[=SpectralEigens]{SpectralEigens()}}, \code{\link{cluster-statistics}} } \author{ From 663f62d71bc8a8909392cf43524b793b06dcf159 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 12:38:25 +0100 Subject: [PATCH 16/22] Rounding --- .claude/settings.local.json | 7 ++++ R/tree_distance.R | 17 ++++++++++ R/tree_distance_info.R | 42 ++++++++++-------------- R/tree_distance_msi.R | 18 ++++------- R/tree_distance_rf.R | 26 +++++++-------- tests/testthat/test-large-trees.R | 54 +++++++++++++++++++++++-------- 6 files changed, 101 insertions(+), 63 deletions(-) create mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 000000000..f80c4d561 --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "allow": [ + "Bash(Rscript *)" + ] + } +} diff --git a/R/tree_distance.R b/R/tree_distance.R index 5ea3a9cb3..12caa4691 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -230,6 +230,23 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, g[lower.tri(g)] } +# Floor sub-noise distances to zero before normalization. +# Two sources of numerical noise scale with treesIndependentInfo: +# (1) LAP int64 cost-matrix quantization in *Splits scoring; per-cell +# truncation of up to (max_possible / BIG) bits, summed over n_splits. +# (2) Float-accumulation drift between independently-built tables (e.g. +# InfoRobinsonFoulds vs cpp_splitwise_info_batch sum the same per-split +# info contributions, but using different lookup-table constructions). +# Both grow with the magnitude of the answer, so an absolute sqrt(eps) +# tolerance becomes too tight beyond a few thousand tips. Scaling by +# treesIndependentInfo self-adjusts; pmax(1, ·) preserves the original +# tolerance for tiny trees where these errors are negligible anyway. +.FloorNumericalNoise <- function(ret, treesIndependentInfo) { + tol <- pmax(1, treesIndependentInfo) * .Machine[["double.eps"]] ^ 0.5 + ret[ret < tol] <- 0 + ret +} + .AllTipsSame <- function(x, y) { if (is.list(x)) { xPrime <- x[[1]] diff --git a/R/tree_distance_info.R b/R/tree_distance_info.R index dc7441e21..07693302f 100644 --- a/R/tree_distance_info.R +++ b/R/tree_distance_info.R @@ -249,16 +249,15 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE, if (!is.null(fast)) { spi <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - - ret <- treesIndependentInfo - spi - spi + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(spi) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_shared_phylo_cross_pairs, @@ -268,25 +267,22 @@ DifferentPhylogeneticInfo <- function(tree1, tree2 = NULL, normalize = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - spi - spi + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + spi <- SharedPhylogeneticInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) - - ret <- treesIndependentInfo - spi - spi - ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, + + ret <- .FloorNumericalNoise(treesIndependentInfo - spi - spi, treesIndependentInfo) + ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Catch floating point inaccuracy attributes(ret) <- attributes(spi) # Return: @@ -310,16 +306,15 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE, if (!is.null(fast)) { mci <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(mci) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_mutual_clustering_cross_pairs, @@ -329,25 +324,22 @@ ClusteringInfoDistance <- function(tree1, tree2 = NULL, normalize = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + mci <- MutualClusteringInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) treesIndependentInfo <- .MaxValue(tree1, tree2, ClusteringEntropy) - - ret <- treesIndependentInfo - mci - mci + + ret <- .FloorNumericalNoise(treesIndependentInfo - mci - mci, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = ClusteringEntropy, Combine = "+") - - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 # Handle floating point inaccuracy attributes(ret) <- attributes(mci) # Return: diff --git a/R/tree_distance_msi.R b/R/tree_distance_msi.R index aadfc2f64..ff89cb184 100644 --- a/R/tree_distance_msi.R +++ b/R/tree_distance_msi.R @@ -29,15 +29,14 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL, msi <- fast[["info"]] treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - ret <- treesIndependentInfo - msi - msi + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 attributes(ret) <- attributes(msi) return(ret) } - + # Fast path (cross-pairs): same tips, no matching — avoids duplicate as.Splits() fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_msi_cross_pairs, @@ -47,25 +46,22 @@ MatchingSplitInfoDistance <- function(tree1, tree2 = NULL, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - ret <- treesIndependentInfo - msi - msi + + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - ret[ret < .Machine[["double.eps"]] ^ 0.5] <- 0 return(ret) } - + msi <- MatchingSplitInfo(tree1, tree2, normalize = FALSE, diag = FALSE, reportMatching = reportMatching) - + treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) - ret <- treesIndependentInfo - msi - msi + ret <- .FloorNumericalNoise(treesIndependentInfo - msi - msi, treesIndependentInfo) ret <- NormalizeInfo(ret, tree1, tree2, how = normalize, infoInBoth = treesIndependentInfo, InfoInTree = SplitwiseInfo, Combine = "+") - - ret[ret < .Machine[["double.eps"]]^0.5] <- 0 # In case of floating point inaccuracy attributes(ret) <- attributes(msi) # Return: ret diff --git a/R/tree_distance_rf.R b/R/tree_distance_rf.R index 9f9454cc2..f5622b811 100644 --- a/R/tree_distance_rf.R +++ b/R/tree_distance_rf.R @@ -75,14 +75,15 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE, cpp_splitwise_info_batch) if (!is.null(fast)) { treesIndependentInfo <- .PairwiseSums(fast[["entropies"]]) - unnormalized <- treesIndependentInfo - fast[["info"]] - fast[["info"]] - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 + unnormalized <- .FloorNumericalNoise( + treesIndependentInfo - fast[["info"]] - fast[["info"]], + treesIndependentInfo) ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") attributes(ret) <- attributes(fast[["info"]]) return(ret) } - + # Cross-pairs fast path fast_many <- .FastManyManyPath(tree1, tree2, reportMatching, cpp_rf_info_cross_pairs, @@ -92,25 +93,24 @@ InfoRobinsonFoulds <- function(tree1, tree2 = NULL, similarity = FALSE, info1 <- fast_many[["info1"]] info2 <- fast_many[["info2"]] treesIndependentInfo <- outer(info1, info2, "+") - - unnormalized <- treesIndependentInfo - irf - irf - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 + + unnormalized <- .FloorNumericalNoise(treesIndependentInfo - irf - irf, + treesIndependentInfo) ret <- NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") return(ret) } } - - unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2, + + unnormalized <- CalculateTreeDistance(InfoRobinsonFouldsSplits, tree1, tree2, reportMatching) * 2 - + if (!similarity) { - unnormalized <- .MaxValue(tree1, tree2, SplitwiseInfo) - unnormalized + treesIndependentInfo <- .MaxValue(tree1, tree2, SplitwiseInfo) + unnormalized <- .FloorNumericalNoise(treesIndependentInfo - unnormalized, + treesIndependentInfo) } - # In case of floating point inaccuracy - unnormalized[unnormalized < .Machine[["double.eps"]] ^ 0.5] <- 0 - # Return: NormalizeInfo(unnormalized, tree1, tree2, how = normalize, InfoInTree = SplitwiseInfo, Combine = "+") diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R index 5dae14840..c8fb29cae 100644 --- a/tests/testthat/test-large-trees.R +++ b/tests/testthat/test-large-trees.R @@ -29,21 +29,47 @@ test_that("Distance functions handle trees exceeding TT 2.2.0 limit", { expect_equal(MatchingSplitInfoDistance(t1, t1)[[1]], 0) }) -test_that("ClusteringInfoDistance returns correct value for n > SL_MAX_TIPS", { +test_that("Distance scores agree across stack/heap storage threshold", { skip_on_cran() - n <- 2049L - skip_if(TreeDist:::cpp_sl_max_tips() < n, - "Requires TreeTools >= 2.3.0 (SL_MAX_TIPS > 2048)") - - t1 <- as.phylo(1, n) - t2 <- as.phylo(2, n) - - # The individual-pair path and the all-pairs OpenMP batch path are - # independent implementations; agreement confirms the correct value. - expect_equal( - ClusteringInfoDistance(t1, t2)[[1]], - as.matrix(ClusteringInfoDistance(list(t1, t2)))[1, 2] - ) + skip_if_not_installed("TreeTools", "2.3.0") + # SL_STACK_SPLITS in TreeTools 2.3.0+ admits trees of n_tips <= 2048 to + # stack-backed split storage; n_tips == 2049 forces the heap path. This + # test compares each metric on a stack-storage tree pair against the same + # pair after inserting a cherry next to tip 1, which both pushes them + # across the threshold and leaves their split structure invariant: each + # existing split simply gains the new tip on whichever side already + # contained tip 1, and one perfectly-matched cherry split is added + # (contributing 0 to any distance). + stackMax <- 2048L + skip_if(TreeDist:::cpp_sl_max_tips() <= stackMax, + "Requires TreeTools >= 2.3.0 (heap-backed storage)") + + t1s <- as.phylo(1234, stackMax) + t2s <- PectinateTree(stackMax) + t1h <- AddTip(t1s, where = 1, label = "extra") + t2h <- AddTip(t2s, where = 1, label = "extra") + + # RF (count of unmatched splits) is invariant under the inserted cherry. + expect_equal(RobinsonFoulds(t1s, t2s)[[1]], + RobinsonFoulds(t1h, t2h)[[1]]) + + # Per-split info depends on n_tips and on side sizes, so absolute + # information-theoretic distances drift by a few % when n grows by 1; + # normalized distances are far less sensitive. + tol <- 0.002 + + expect_equal(InfoRobinsonFoulds(t1s, t2s, normalize = TRUE)[[1]], + InfoRobinsonFoulds(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(ClusteringInfoDistance(t1s, t2s, normalize = TRUE)[[1]], + ClusteringInfoDistance(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(DifferentPhylogeneticInfo(t1s, t2s, normalize = TRUE)[[1]], + DifferentPhylogeneticInfo(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) + expect_equal(MatchingSplitInfoDistance(t1s, t2s, normalize = TRUE)[[1]], + MatchingSplitInfoDistance(t1h, t2h, normalize = TRUE)[[1]], + tolerance = tol) }) test_that("RF and IRF work for 8000-tip trees", { From 99e48341a64b76c174b77afd6a662c790e34909a Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 12:50:52 +0100 Subject: [PATCH 17/22] -vtunes --- .Rbuildignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.Rbuildignore b/.Rbuildignore index 1aa03d51f..f3832be8f 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -15,6 +15,7 @@ cran-comments.md man-roxygen data-raw docs +^.*vtune.*$ revdep \.covrignore ^\.git$ From 496732c4cee815dcc85ad513e54a9d47397a66b1 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 6 May 2026 13:06:32 +0100 Subject: [PATCH 18/22] Quick benchmark --- .github/workflows/R-CMD-check.yml | 2 - .github/workflows/benchmark.yml | 162 ++++++++++++++++++++++++++++++ cran-comments.md | 3 +- 3 files changed, 164 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index df89093fe..4255992b9 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -265,8 +265,6 @@ jobs: uses: r-lib/actions/setup-r-dependencies@v2 with: dependencies: '"hard"' - extra-packages: | - github::ms609/TreeTools needs: benchmark - name: Benchmark PR diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml new file mode 100644 index 000000000..e640d74de --- /dev/null +++ b/.github/workflows/benchmark.yml @@ -0,0 +1,162 @@ +# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. +# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions +on: + schedule: + - cron: "24 06 * * 1" + workflow_dispatch: + push: + branches: ["*"] + paths-ignore: + - "Meta**" + - "memcheck**" + - "docs**" + - "**.git" + - "**.json" + - "**.md" + - "**.yaml" + - "**.yml" + - "!**R-CMD-check.yml" + - "**.R[dD]ata" + - "**.Rpro*" + pull_request: + branches: ["*"] + paths-ignore: + - "Meta**" + - "memcheck**" + - "docs**" + - "**.git" + - "**.json" + - "**.md" + - "**.yaml" + - "**.yml" + - "!**R-CMD-check.yml" + - "**.R[dD]ata" + - "**.Rpro*" + +name: R-CMD-check + +jobs: + benchmark: + runs-on: ubuntu-latest + name: benchmark + + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RSPM: "https://packagemanager.posit.co/cran/__linux__/noble/latest" + _R_CHECK_BUILD_VIGNETTES_: false + _R_CHECK_CRAN_INCOMING_: false + _R_CHECK_FORCE_SUGGESTS_: false + R_REMOTES_STANDALONE: true + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + R_REALLY_FORCE_SYMBOLS: true + PKG_CXXFLAGS: "-O3" + + steps: + - name: Checkout PR branch + uses: actions/checkout@v6 + with: + path: pr + + - name: Checkout target branch + uses: actions/checkout@v6 + with: + ref: ${{ github.event.pull_request.base.ref || 'main' }} + path: main + + - name: Set up R + uses: r-lib/actions/setup-r@v2 + with: + extra-repositories: https://ms609.github.io/packages/ + + - name: Install R dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + dependencies: '"hard"' + extra-packages: | + TreeTools@2.2.0 + needs: benchmark + + - name: Benchmark PR + uses: ms609/actions/benchmark-step@main + with: + path: pr + output: pr + + - name: Benchmark target + uses: ms609/actions/benchmark-step@main + with: + path: main + output: main + + - name: Benchmark PR again + uses: ms609/actions/benchmark-step@main + with: + path: pr + output: pr2 + + - run: dir pr-benchmark-results + working-directory: pr + - run: dir main-benchmark-results + working-directory: pr + + - name: Compare benchmarks + id: compare_results + working-directory: pr + run: | + Rscript benchmark/_compare_results.R + shell: bash + + - name: Upload PR benchmark results + if: failure() + uses: actions/upload-artifact@v6 + with: + name: pr-benchmark-results + path: pr/pr-benchmark-results/*.bench.Rds + + - name: Upload main benchmark results + if: failure() + uses: actions/upload-artifact@v6 + with: + name: main-benchmark-results + path: pr/main-benchmark-results/*.bench.Rds + + - name: Comment on PR + if: always() && github.event_name == 'pull_request' && steps.compare_results.outputs.report != '' + uses: actions/github-script@v7 + env: + BENCHMARK_MESSAGE: ${{ steps.compare_results.outputs.report }} + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + const benchmarkIdentifier = ''; + const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n'; + + const comments = await github.rest.issues.listComments({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number + }); + + const previousBenchmarkComments = comments.data.filter(comment => + comment.user.type === 'Bot' && + comment.body.includes(benchmarkIdentifier) && + !comment.body.startsWith('> **⚠️ This benchmark result is outdated.') + ); + + for (const comment of previousBenchmarkComments) { + await github.rest.issues.updateComment({ + owner: context.repo.owner, + repo: context.repo.repo, + comment_id: comment.id, + body: outdatedPrefix + comment.body + }); + } + + const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE; + + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.issue.number, + body: newCommentBody + }); diff --git a/cran-comments.md b/cran-comments.md index 31e312b34..3bae24af7 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,7 +1,8 @@ ## Test environments -* Local machine, Windows 10, R devel (2024-09-02 r87090 ucrt) +* Local machine, Windows 11, R version 4.6.0 (2026-04-24 ucrt) * `devtools::check_win_devel()` +* `devtools::check_mac_devel()` * [GitHub Actions](https://github.com/ms609/TreeDist/actions): - windows-latest, R release From 46663c9b4982de9dfa08fdfd7d65e45409d6624f Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Thu, 7 May 2026 13:07:29 +0100 Subject: [PATCH 19/22] Inline to improve performance --- inst/include/TreeDist/mutual_clustering.h | 42 +++++++++---------- .../include/TreeDist/mutual_clustering_impl.h | 24 +++++++++++ 2 files changed, 43 insertions(+), 23 deletions(-) diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h index 123fad306..ac0de405a 100644 --- a/inst/include/TreeDist/mutual_clustering.h +++ b/inst/include/TreeDist/mutual_clustering.h @@ -35,39 +35,35 @@ namespace TreeDist { // computation. max_tips should be >= the largest tree size used. void init_lg2_tables(int max_tips); + // Out-of-line slow paths for inputs that exceed the precomputed tables + // (i.e. n_tips > SL_MAX_TIPS). Defined in mutual_clustering_impl.h. + // Kept out-of-line so the *_lookup() inline thunks below stay tiny — + // table load + predicted branch, no extern function calls — which some + // compilers refuse to inline through, suppressing optimization in the + // per-cell hot loops of PID/MCI/MSI/RF info. + double lg2_slow(split_int x); + double lg2_unrooted_slow(split_int n_tips); + double lg2_rooted_slow(split_int n_tips); + // log2(x) — table-fast for x <= SL_MAX_TIPS, runtime otherwise. [[nodiscard]] inline double lg2_lookup(split_int x) noexcept { - if (x <= static_cast(SL_MAX_TIPS)) { - return lg2[x]; - } - return std::log2(static_cast(x)); // LCOV_EXCL_LINE + return (x <= static_cast(SL_MAX_TIPS)) + ? lg2[x] + : lg2_slow(x); // LCOV_EXCL_LINE } // log2((2n-5)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. - // (log2(e) = 1/log(2)) [[nodiscard]] inline double lg2_unrooted_lookup(split_int n_tips) noexcept { - if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { - return lg2_unrooted[n_tips]; - } - // LCOV_EXCL_START - if (n_tips < 3) return 0.0; - const double n = static_cast(n_tips); - return (std::lgamma(2.0 * n - 3.0) - std::lgamma(n - 1.0)) / std::log(2.0) - - (n - 2.0); - // LCOV_EXCL_STOP + return (n_tips <= static_cast(SL_MAX_TIPS + 1)) + ? lg2_unrooted[n_tips] + : lg2_unrooted_slow(n_tips); // LCOV_EXCL_LINE } // log2((2n-3)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. [[nodiscard]] inline double lg2_rooted_lookup(split_int n_tips) noexcept { - if (n_tips <= static_cast(SL_MAX_TIPS + 1)) { - return lg2_rooted[n_tips]; - } - // LCOV_EXCL_START - if (n_tips < 2) return 0.0; - const double n = static_cast(n_tips); - return (std::lgamma(2.0 * n - 1.0) - std::lgamma(n)) / std::log(2.0) - - (n - 1.0); - // LCOV_EXCL_STOP + return (n_tips <= static_cast(SL_MAX_TIPS + 1)) + ? lg2_rooted[n_tips] + : lg2_rooted_slow(n_tips); // LCOV_EXCL_LINE } // ---- Inline helpers ---- diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h index 815db2354..cea725beb 100644 --- a/inst/include/TreeDist/mutual_clustering_impl.h +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -58,6 +58,30 @@ void init_lg2_tables(int max_tips) { } } +// ---- Out-of-line slow paths (n_tips > SL_MAX_TIPS) ---- +// Defined out-of-line so that the *_lookup() inline thunks in +// mutual_clustering.h stay trivial enough for any compiler to inline. + +// LCOV_EXCL_START +double lg2_slow(split_int x) { + return std::log2(static_cast(x)); +} + +double lg2_unrooted_slow(split_int n_tips) { + if (n_tips < 3) return 0.0; + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 3.0) - std::lgamma(n - 1.0)) / std::log(2.0) + - (n - 2.0); +} + +double lg2_rooted_slow(split_int n_tips) { + if (n_tips < 2) return 0.0; + const double n = static_cast(n_tips); + return (std::lgamma(2.0 * n - 1.0) - std::lgamma(n)) / std::log(2.0) + - (n - 1.0); +} +// LCOV_EXCL_STOP + // ---- Sort+merge exact-match detection (internal) ---- // // Canonicalise each split so bit 0 is always set (flip complement if not), From 6fce5754045e1d4b02053ff34c47161fe1dd2c35 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Thu, 7 May 2026 14:14:00 +0100 Subject: [PATCH 20/22] template fast vs slow paths for PID --- src/pairwise_distances.cpp | 48 ++++++++++++++++++++++++++++--------- src/tree_distances.cpp | 49 ++++++++++++++++++++++++++++---------- src/tree_distances.h | 43 ++++++++++++++++++++++++++------- 3 files changed, 108 insertions(+), 32 deletions(-) diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp index 8d92a4fc6..9fedb86d5 100644 --- a/src/pairwise_distances.cpp +++ b/src/pairwise_distances.cpp @@ -669,6 +669,27 @@ NumericVector cpp_msi_all_pairs( // metric, spi_overlap(A, B) where B contains A can EXCEED spi_overlap(A, A) // for identical splits, so the LAP can find a better global assignment by NOT // matching identical splits. The full LAP is always solved. +// LAP-fill body for shared_phylo_score, templated on Fast so the per-cell +// spi_overlap call resolves at compile time to direct lg2_rooted[] access +// (Fast=true) or the bounds-checked lookup (Fast=false). +template +static inline void shared_phylo_fill_lap( + const SplitList& a, const SplitList& b, const int32 n_tips, + const double best_overlap, const double score_over_possible, + const cost max_score, cost_matrix& score +) { + for (split_int ai = 0; ai < a.n_splits; ++ai) { + for (split_int bi = 0; bi < b.n_splits; ++bi) { + const double spi = TreeDist::spi_overlap( + a.state[ai], b.state[bi], n_tips, + a.in_split[ai], b.in_split[bi], a.n_bins); + score(ai, bi) = (spi == 0.0) ? max_score + : cost((spi - best_overlap) * score_over_possible); + } + score.padRowAfterCol(ai, b.n_splits, max_score); + } +} + static double shared_phylo_score( const SplitList& a, const SplitList& b, const int32 n_tips, LapScratch& scratch @@ -678,23 +699,28 @@ static double shared_phylo_score( const split_int overlap_a = (n_tips + 1) / 2; constexpr cost max_score = BIG; - const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); - const double max_possible = lg2_unrooted_lookup(n_tips) - best_overlap; + // Hot path: n_tips <= SL_MAX_TIPS, so all lg2_rooted/lg2_unrooted indices + // fall in the precomputed table. Dispatch once here so the per-cell loop + // emits direct table loads with no bounds-check branch. + const bool fast = n_tips <= static_cast(SL_MAX_TIPS + 1); + const double best_overlap = fast + ? TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips) + : TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + const double max_possible = (fast ? lg2_unrooted[n_tips] + : lg2_unrooted_lookup(n_tips)) - best_overlap; const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); scratch.score_pool.resize(most_splits); cost_matrix& score = scratch.score_pool; - for (split_int ai = 0; ai < a.n_splits; ++ai) { - for (split_int bi = 0; bi < b.n_splits; ++bi) { - const double spi = TreeDist::spi_overlap( - a.state[ai], b.state[bi], n_tips, - a.in_split[ai], b.in_split[bi], a.n_bins); - score(ai, bi) = (spi == 0.0) ? max_score - : cost((spi - best_overlap) * score_over_possible); - } - score.padRowAfterCol(ai, b.n_splits, max_score); + if (fast) { + shared_phylo_fill_lap(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); + } else { + // LCOV_EXCL_LINE + shared_phylo_fill_lap(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); } score.padAfterRow(a.n_splits, max_score); diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 7613b9141..d45b76188 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -572,6 +572,29 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, } } +// Templated LAP-fill so the per-cell spi_overlap resolves to direct +// lg2_rooted[] access on the hot path (n_tips <= SL_MAX_TIPS). +template +static inline void shared_phylo_fill(const SplitList& a, const SplitList& b, + const int32 n_tips, + const double best_overlap, + const double score_over_possible, + const cost max_score, + cost_matrix& score) { + for (split_int ai = a.n_splits; ai--; ) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); + for (split_int bi = b.n_splits; bi--; ) { + const double spi_over = TreeDist::spi_overlap( + a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], + a.n_bins); + + score(ai, bi) = spi_over == 0 ? max_score : + cost((spi_over - best_overlap) * score_over_possible); + } + score.padRowAfterCol(ai, b.n_splits, max_score); + } +} + inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { const SplitList a(x), b(y); @@ -579,8 +602,12 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, const split_int overlap_a = (n_tips + 1) / 2; constexpr cost max_score = BIG; - const double lg2_unrooted_n = lg2_unrooted_lookup(n_tips); - const double best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + const bool fast = n_tips <= static_cast(SL_MAX_TIPS + 1); + const double lg2_unrooted_n = fast ? lg2_unrooted[n_tips] + : lg2_unrooted_lookup(n_tips); + const double best_overlap = fast + ? TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips) + : TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); const double max_possible = lg2_unrooted_n - best_overlap; const double score_over_possible = max_score / max_possible; const double possible_over_score = max_possible / max_score; @@ -589,17 +616,13 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, // In/out direction [i.e. 1/0 bit] is arbitrary. cost_matrix score(most_splits); - for (split_int ai = a.n_splits; ai--; ) { - if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); - for (split_int bi = b.n_splits; bi--; ) { - const double spi_over = TreeDist::spi_overlap( - a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], - a.n_bins); - - score(ai, bi) = spi_over == 0 ? max_score : - cost((spi_over - best_overlap) * score_over_possible); - } - score.padRowAfterCol(ai, b.n_splits, max_score); + if (fast) { + shared_phylo_fill(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); + } else { + // LCOV_EXCL_LINE + shared_phylo_fill(a, b, n_tips, best_overlap, + score_over_possible, max_score, score); } score.padAfterRow(a.n_splits, max_score); diff --git a/src/tree_distances.h b/src/tree_distances.h index b046e0038..89ee6b2de 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -69,28 +69,50 @@ namespace TreeDist { } + // Compile-time-dispatched lg2 helpers: when Fast=true, callers guarantee + // x <= SL_MAX_TIPS, so the bounds-check + branch in lg2_rooted_lookup is + // skipped entirely and the inner kernel collapses to a bare table load. + // Used by spi_overlap / one_overlap below; large-tree callers pass + // Fast=false (default) and pay the predictable branch. + template + [[nodiscard]] inline double lg2_rooted_fast(split_int x) noexcept { + if constexpr (Fast) return lg2_rooted[x]; + else return lg2_rooted_lookup(x); + } + + template + [[nodiscard]] inline double lg2_unrooted_fast(split_int x) noexcept { + if constexpr (Fast) return lg2_unrooted[x]; + else return lg2_unrooted_lookup(x); + } + + template [[nodiscard]] inline double one_overlap(const split_int a, const split_int b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); if (a == b) { - return lg2_rooted_lookup(a) + lg2_rooted_lookup(n - a); + return lg2_rooted_fast(a) + lg2_rooted_fast(n - a); } // Unify ab via lo/hi: removes an unpredictable branch. const split_int lo = (a < b) ? a : b; const split_int hi = (a < b) ? b : a; - return lg2_rooted_lookup(hi) + lg2_rooted_lookup(n - lo) - lg2_rooted_lookup(hi - lo + 1); + return lg2_rooted_fast(hi) + lg2_rooted_fast(n - lo) + - lg2_rooted_fast(hi - lo + 1); } + template [[nodiscard]] inline double one_overlap_notb(const split_int a, const split_int n_minus_b, const split_int n) noexcept { static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), "int16 too narrow for SL_MAX_TIPS"); const split_int b = n - n_minus_b; if (a == b) { - return lg2_rooted_lookup(b) + lg2_rooted_lookup(n_minus_b); + return lg2_rooted_fast(b) + lg2_rooted_fast(n_minus_b); } else if (a < b) { - return lg2_rooted_lookup(b) + lg2_rooted_lookup(n - a) - lg2_rooted_lookup(b - a + 1); + return lg2_rooted_fast(b) + lg2_rooted_fast(n - a) + - lg2_rooted_fast(b - a + 1); } else { - return lg2_rooted_lookup(a) + lg2_rooted_lookup(n_minus_b) - lg2_rooted_lookup(a - b + 1); + return lg2_rooted_fast(a) + lg2_rooted_fast(n_minus_b) + - lg2_rooted_fast(a - b + 1); } } @@ -99,6 +121,11 @@ namespace TreeDist { // Counts n_ab = |A ∩ B| via hardware POPCNT, then derives all 4 Venn-diagram // region populations from arithmetic on n_ab, in_a, in_b, n_tips. // split_int used throughout so in_a + in_b does not overflow (max ~65534 for 32767-tip trees). + // + // Fast=true: caller guarantees n_tips <= SL_MAX_TIPS, so inner one_overlap* + // calls use direct lg2_rooted[] access. Caller (shared_phylo_score) dispatches + // once on n_tips; per-cell branches are eliminated. + template [[nodiscard]] inline double spi_overlap(const splitbit* a_state, const splitbit* b_state, const split_int n_tips, const split_int in_a, const split_int in_b, const split_int n_bins) noexcept { @@ -119,15 +146,15 @@ namespace TreeDist { // unrelated splits). Otherwise return the appropriate one_overlap score. if (n_ab == 0) { - return one_overlap_notb(in_a, in_b, n_tips); + return one_overlap_notb(in_a, in_b, n_tips); } if (n_ab == in_b || n_ab == in_a) { // B ⊆ A (n_b_only == 0) or A ⊆ B (n_a_only == 0) - return one_overlap(in_a, in_b, n_tips); + return one_overlap(in_a, in_b, n_tips); } if (in_a + in_b - n_ab == n_tips) { // A ∪ B covers all tips (n_neither == 0) - return one_overlap_notb(in_a, in_b, n_tips); + return one_overlap_notb(in_a, in_b, n_tips); } return 0.0; From 86980d5feb69e885ca02659298947070149ef9dd Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Thu, 7 May 2026 15:29:59 +0100 Subject: [PATCH 21/22] Coverage --- R/RcppExports.R | 4 ++-- src/RcppExports.cpp | 9 +++++---- src/pairwise_distances.cpp | 22 ++++++++++++++++------ src/tree_distances.cpp | 27 +++++++++++++++++---------- tests/testthat/test-large-trees.R | 28 ++++++++++++++++++++++++++++ 5 files changed, 68 insertions(+), 22 deletions(-) diff --git a/R/RcppExports.R b/R/RcppExports.R index 8dd5b1a22..592b47725 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -303,7 +303,7 @@ cpp_mutual_clustering <- function(x, y, nTip) { .Call(`_TreeDist_cpp_mutual_clustering`, x, y, nTip) } -cpp_shared_phylo <- function(x, y, nTip) { - .Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip) +cpp_shared_phylo <- function(x, y, nTip, force_slow = FALSE) { + .Call(`_TreeDist_cpp_shared_phylo`, x, y, nTip, force_slow) } diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index c78c3efc5..116f48d35 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -727,15 +727,16 @@ BEGIN_RCPP END_RCPP } // cpp_shared_phylo -List cpp_shared_phylo(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); -RcppExport SEXP _TreeDist_cpp_shared_phylo(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { +List cpp_shared_phylo(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip, const bool force_slow); +RcppExport SEXP _TreeDist_cpp_shared_phylo(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP, SEXP force_slowSEXP) { BEGIN_RCPP Rcpp::RObject rcpp_result_gen; Rcpp::RNGScope rcpp_rngScope_gen; Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); - rcpp_result_gen = Rcpp::wrap(cpp_shared_phylo(x, y, nTip)); + Rcpp::traits::input_parameter< const bool >::type force_slow(force_slowSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_shared_phylo(x, y, nTip, force_slow)); return rcpp_result_gen; END_RCPP } @@ -797,7 +798,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_cpp_jaccard_similarity", (DL_FUNC) &_TreeDist_cpp_jaccard_similarity, 5}, {"_TreeDist_cpp_msi_distance", (DL_FUNC) &_TreeDist_cpp_msi_distance, 3}, {"_TreeDist_cpp_mutual_clustering", (DL_FUNC) &_TreeDist_cpp_mutual_clustering, 3}, - {"_TreeDist_cpp_shared_phylo", (DL_FUNC) &_TreeDist_cpp_shared_phylo, 3}, + {"_TreeDist_cpp_shared_phylo", (DL_FUNC) &_TreeDist_cpp_shared_phylo, 4}, {NULL, NULL, 0} }; diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp index 9fedb86d5..760e7ff84 100644 --- a/src/pairwise_distances.cpp +++ b/src/pairwise_distances.cpp @@ -702,12 +702,21 @@ static double shared_phylo_score( // Hot path: n_tips <= SL_MAX_TIPS, so all lg2_rooted/lg2_unrooted indices // fall in the precomputed table. Dispatch once here so the per-cell loop // emits direct table loads with no bounds-check branch. + // The else branch is reachable only on TT 2.3 builds with n_tips > 32705, + // which is too large to exercise in routine tests; the equivalent slow + // template instantiations (one_overlap etc.) are covered by the + // legacy single-pair entry (cpp_shared_phylo) via force_slow=true. const bool fast = n_tips <= static_cast(SL_MAX_TIPS + 1); - const double best_overlap = fast - ? TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips) - : TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); - const double max_possible = (fast ? lg2_unrooted[n_tips] - : lg2_unrooted_lookup(n_tips)) - best_overlap; + double best_overlap, max_possible; + if (fast) { + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + max_possible = lg2_unrooted[n_tips] - best_overlap; + } else { + // LCOV_EXCL_START + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + max_possible = lg2_unrooted_lookup(n_tips) - best_overlap; + // LCOV_EXCL_STOP + } const double score_over_possible = static_cast(max_score) / max_possible; const double possible_over_score = max_possible / static_cast(max_score); @@ -718,9 +727,10 @@ static double shared_phylo_score( shared_phylo_fill_lap(a, b, n_tips, best_overlap, score_over_possible, max_score, score); } else { - // LCOV_EXCL_LINE + // LCOV_EXCL_START shared_phylo_fill_lap(a, b, n_tips, best_overlap, score_over_possible, max_score, score); + // LCOV_EXCL_STOP } score.padAfterRow(a.n_splits, max_score); diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index d45b76188..3651caab1 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -596,18 +596,25 @@ static inline void shared_phylo_fill(const SplitList& a, const SplitList& b, } inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, - const int32 n_tips) { + const int32 n_tips, + const bool force_slow = false) { const SplitList a(x), b(y); const split_int most_splits = std::max(a.n_splits, b.n_splits); const split_int overlap_a = (n_tips + 1) / 2; constexpr cost max_score = BIG; - const bool fast = n_tips <= static_cast(SL_MAX_TIPS + 1); - const double lg2_unrooted_n = fast ? lg2_unrooted[n_tips] - : lg2_unrooted_lookup(n_tips); - const double best_overlap = fast - ? TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips) - : TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + // force_slow: test-only hook so the bounds-checked lookup path can be + // exercised on small trees for code coverage of the template + // instantiations and the lg2_*_slow fallbacks. + const bool fast = !force_slow && n_tips <= static_cast(SL_MAX_TIPS + 1); + double lg2_unrooted_n, best_overlap; + if (fast) { + lg2_unrooted_n = lg2_unrooted[n_tips]; + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + } else { + lg2_unrooted_n = lg2_unrooted_lookup(n_tips); + best_overlap = TreeDist::one_overlap(overlap_a, n_tips / 2, n_tips); + } const double max_possible = lg2_unrooted_n - best_overlap; const double score_over_possible = max_score / max_possible; const double possible_over_score = max_possible / max_score; @@ -620,7 +627,6 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, shared_phylo_fill(a, b, n_tips, best_overlap, score_over_possible, max_score, score); } else { - // LCOV_EXCL_LINE shared_phylo_fill(a, b, n_tips, best_overlap, score_over_possible, max_score, score); } @@ -707,9 +713,10 @@ List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_shared_phylo(const RawMatrix &x, const RawMatrix &y, - const IntegerVector &nTip) { + const IntegerVector &nTip, + const bool force_slow = false) { ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); - return shared_phylo(x, y, n_tip); + return shared_phylo(x, y, n_tip, force_slow); } diff --git a/tests/testthat/test-large-trees.R b/tests/testthat/test-large-trees.R index c8fb29cae..3600fbe50 100644 --- a/tests/testthat/test-large-trees.R +++ b/tests/testthat/test-large-trees.R @@ -89,6 +89,34 @@ test_that("RF and IRF work for 8000-tip trees", { expect_equal(InfoRobinsonFoulds(t1, t1)[[1]], 0) }) +test_that("Slow lookup path produces identical SPI to fast path", { + skip_on_cran() + # Coverage for the n_tips > SL_MAX_TIPS branches in shared_phylo: + # - lg2_*_slow fallback functions + # - one_overlap, one_overlap_notb, spi_overlap + # These are only triggered with > 32705 tips in production, but the + # cpp_shared_phylo entry accepts a force_slow flag so the same code paths + # can be exercised on small trees. Both paths must produce identical + # values: lg2_rooted_lookup() == lg2_rooted[] within the table range. + t1 <- as.phylo(0, 30) + t2 <- as.phylo(11, 30) + splits1 <- as.Splits(t1) + splits2 <- as.Splits(t2) + + fast <- TreeDist:::cpp_shared_phylo(splits1, splits2, 30L, force_slow = FALSE) + slow <- TreeDist:::cpp_shared_phylo(splits1, splits2, 30L, force_slow = TRUE) + + expect_equal(fast$score, slow$score) + expect_equal(fast$matching, slow$matching) + + # Identical-tree case: both paths must report the self-information score. + fast_self <- TreeDist:::cpp_shared_phylo(splits1, splits1, 30L, + force_slow = FALSE) + slow_self <- TreeDist:::cpp_shared_phylo(splits1, splits1, 30L, + force_slow = TRUE) + expect_equal(fast_self$score, slow_self$score) +}) + test_that("Tip-count ceiling is enforced correctly", { skip_on_cran() skip_if(TreeDist:::cpp_sl_max_tips() < 2049L, From d48c62e13020678c3eb10de63c459af238d0a775 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Thu, 7 May 2026 16:04:24 +0100 Subject: [PATCH 22/22] Correct nocov syntax --- .github/workflows/benchmark.yml | 162 ---------------------- inst/include/TreeDist/mutual_clustering.h | 6 +- src/day_1985.cpp | 2 +- 3 files changed, 4 insertions(+), 166 deletions(-) delete mode 100644 .github/workflows/benchmark.yml diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml deleted file mode 100644 index e640d74de..000000000 --- a/.github/workflows/benchmark.yml +++ /dev/null @@ -1,162 +0,0 @@ -# For help debugging build failures open an issue on the RStudio community with the 'github-actions' tag. -# https://community.rstudio.com/new-topic?category=Package%20development&tags=github-actions -on: - schedule: - - cron: "24 06 * * 1" - workflow_dispatch: - push: - branches: ["*"] - paths-ignore: - - "Meta**" - - "memcheck**" - - "docs**" - - "**.git" - - "**.json" - - "**.md" - - "**.yaml" - - "**.yml" - - "!**R-CMD-check.yml" - - "**.R[dD]ata" - - "**.Rpro*" - pull_request: - branches: ["*"] - paths-ignore: - - "Meta**" - - "memcheck**" - - "docs**" - - "**.git" - - "**.json" - - "**.md" - - "**.yaml" - - "**.yml" - - "!**R-CMD-check.yml" - - "**.R[dD]ata" - - "**.Rpro*" - -name: R-CMD-check - -jobs: - benchmark: - runs-on: ubuntu-latest - name: benchmark - - env: - GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - RSPM: "https://packagemanager.posit.co/cran/__linux__/noble/latest" - _R_CHECK_BUILD_VIGNETTES_: false - _R_CHECK_CRAN_INCOMING_: false - _R_CHECK_FORCE_SUGGESTS_: false - R_REMOTES_STANDALONE: true - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - R_REALLY_FORCE_SYMBOLS: true - PKG_CXXFLAGS: "-O3" - - steps: - - name: Checkout PR branch - uses: actions/checkout@v6 - with: - path: pr - - - name: Checkout target branch - uses: actions/checkout@v6 - with: - ref: ${{ github.event.pull_request.base.ref || 'main' }} - path: main - - - name: Set up R - uses: r-lib/actions/setup-r@v2 - with: - extra-repositories: https://ms609.github.io/packages/ - - - name: Install R dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - dependencies: '"hard"' - extra-packages: | - TreeTools@2.2.0 - needs: benchmark - - - name: Benchmark PR - uses: ms609/actions/benchmark-step@main - with: - path: pr - output: pr - - - name: Benchmark target - uses: ms609/actions/benchmark-step@main - with: - path: main - output: main - - - name: Benchmark PR again - uses: ms609/actions/benchmark-step@main - with: - path: pr - output: pr2 - - - run: dir pr-benchmark-results - working-directory: pr - - run: dir main-benchmark-results - working-directory: pr - - - name: Compare benchmarks - id: compare_results - working-directory: pr - run: | - Rscript benchmark/_compare_results.R - shell: bash - - - name: Upload PR benchmark results - if: failure() - uses: actions/upload-artifact@v6 - with: - name: pr-benchmark-results - path: pr/pr-benchmark-results/*.bench.Rds - - - name: Upload main benchmark results - if: failure() - uses: actions/upload-artifact@v6 - with: - name: main-benchmark-results - path: pr/main-benchmark-results/*.bench.Rds - - - name: Comment on PR - if: always() && github.event_name == 'pull_request' && steps.compare_results.outputs.report != '' - uses: actions/github-script@v7 - env: - BENCHMARK_MESSAGE: ${{ steps.compare_results.outputs.report }} - with: - github-token: ${{ secrets.GITHUB_TOKEN }} - script: | - const benchmarkIdentifier = ''; - const outdatedPrefix = '> **⚠️ This benchmark result is outdated. See the latest comment below.**\n\n'; - - const comments = await github.rest.issues.listComments({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number - }); - - const previousBenchmarkComments = comments.data.filter(comment => - comment.user.type === 'Bot' && - comment.body.includes(benchmarkIdentifier) && - !comment.body.startsWith('> **⚠️ This benchmark result is outdated.') - ); - - for (const comment of previousBenchmarkComments) { - await github.rest.issues.updateComment({ - owner: context.repo.owner, - repo: context.repo.repo, - comment_id: comment.id, - body: outdatedPrefix + comment.body - }); - } - - const newCommentBody = benchmarkIdentifier + '\n' + process.env.BENCHMARK_MESSAGE; - - await github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: newCommentBody - }); diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h index ac0de405a..231349059 100644 --- a/inst/include/TreeDist/mutual_clustering.h +++ b/inst/include/TreeDist/mutual_clustering.h @@ -49,21 +49,21 @@ namespace TreeDist { [[nodiscard]] inline double lg2_lookup(split_int x) noexcept { return (x <= static_cast(SL_MAX_TIPS)) ? lg2[x] - : lg2_slow(x); // LCOV_EXCL_LINE + : lg2_slow(x); // #nocov } // log2((2n-5)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. [[nodiscard]] inline double lg2_unrooted_lookup(split_int n_tips) noexcept { return (n_tips <= static_cast(SL_MAX_TIPS + 1)) ? lg2_unrooted[n_tips] - : lg2_unrooted_slow(n_tips); // LCOV_EXCL_LINE + : lg2_unrooted_slow(n_tips); // #nocov } // log2((2n-3)!!) — table-fast for n <= SL_MAX_TIPS+1, lgamma otherwise. [[nodiscard]] inline double lg2_rooted_lookup(split_int n_tips) noexcept { return (n_tips <= static_cast(SL_MAX_TIPS + 1)) ? lg2_rooted[n_tips] - : lg2_rooted_slow(n_tips); // LCOV_EXCL_LINE + : lg2_rooted_slow(n_tips); // #nocov } // ---- Inline helpers ---- diff --git a/src/day_1985.cpp b/src/day_1985.cpp index 70ab4dbd6..157bf0738 100644 --- a/src/day_1985.cpp +++ b/src/day_1985.cpp @@ -98,7 +98,7 @@ double calc_consensus_info(const List &trees, const LogicalVector &phylo, std::vector tables; if (std::size_t(n_trees) > tables.max_size()) { - ASSERT(false && "Not enough memory for consensus of so many trees"); // LCOV_EXCL_LINE + ASSERT(false && "Not enough memory for consensus of so many trees"); // #nocov } tables.reserve(n_trees);