Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions src/ts_data.h
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <cmath>
#include <cstdint>
#include <vector>
#include <unordered_set>

namespace ts {

Expand Down Expand Up @@ -184,6 +185,24 @@ struct DataSet {
// aggregated. Excludes NNI-warmup and annealing candidates (neither funnels
// through tbr_search).
mutable long long n_candidates_evaluated = 0;

// Exact-verify optimum memoization (NA path; see exact_verify_sweep in
// ts_tbr.cpp). A topology certified a true unrooted-TBR optimum under the
// current weighting regime is cached here so repeated convergences — notably
// across the ratchet's regime excursions — skip the O(n^2) full-neighbourhood
// sweep. Keyed by hash(child-pairs) ^ dataset-fp ^ weight-fp; cleared when
// the dataset fingerprint changes.
//
// Lives on DataSet (NOT a function-local `static thread_local`) deliberately:
// each parallel worker owns a private `ds_local` copy for its whole lifetime,
// so this gives the same per-thread, cross-replicate persistence the old
// thread_local had — but WITHOUT MinGW emutls, whose thread_local teardown
// across std::thread spawn/exit corrupted the heap (parallel-only crash).
// `mutable` because the scorer takes `const DataSet&`. Single-writer per
// copy: workers touch only their own ds_local; the shared prototype's cache
// is written solely in the post-join (single-threaded) MPT phase.
mutable std::unordered_set<uint64_t> evs_false_cache;
mutable uint64_t evs_last_fp = 0;
};

// Build a DataSet from R-side data.
Expand Down
2 changes: 1 addition & 1 deletion src/ts_fitch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ void fitch_incremental_uppass(TreeState& tree, const DataSet& ds,
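// per-clip heap allocation. NOTE(review): `dirty` is now a plain local rather
// than a function-local thread_local, so it is constructed afresh on every
// call and assign() no longer reuses capacity from earlier calls. It is still
// per-thread-safe (each call owns its own frame); char avoids vector<bool>
// proxy-bit access in the reverse scan below.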
static thread_local std::vector<char> dirty;
std::vector<char> dirty;
dirty.assign(tree.n_node, 0);

// Mark root as dirty (we just updated it; its children need checking)
Expand Down
44 changes: 26 additions & 18 deletions src/ts_tbr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ static void collect_main_edges(
{
edges.clear();
// DFS stack. Plain local (not thread_local): each call owns its own stack, at
// the cost of a heap allocation per call.
static thread_local std::vector<int> stack;
std::vector<int> stack;
stack.clear();
stack.push_back(tree.n_tip);

Expand Down Expand Up @@ -161,7 +161,7 @@ static void collect_subtree_edges(
if (subtree_root < tree.n_tip) return;

// DFS stack. Plain local (not thread_local): each call owns its own stack, at
// the cost of a heap allocation per call.
static thread_local std::vector<int> stack;
std::vector<int> stack;
stack.clear();
stack.push_back(subtree_root);

Expand Down Expand Up @@ -214,10 +214,10 @@ static void compute_from_above(
int tw = tree.total_words;

// Per-call scratch (preorder list + DFS stack). Plain locals (not
// thread_local): each call owns its own buffers, at the cost of two heap
// allocations per call.
static thread_local std::vector<int> preorder;
std::vector<int> preorder;
preorder.clear();
{
static thread_local std::vector<int> stack;
std::vector<int> stack;
stack.clear();
stack.push_back(subtree_root);
while (!stack.empty()) {
Expand Down Expand Up @@ -585,7 +585,7 @@ static bool try_root_edge_moves_rescore(TreeState& tree, const DataSet& ds,

// Rerooting metadata for each half: identity {-1,-1} plus each internal edge
// (sp,sc) with sp != croot (same distinct-rootings set as the EW path).
static thread_local std::vector<std::pair<int,int>> metaL, metaR, edges;
std::vector<std::pair<int,int>> metaL, metaR, edges;
auto build_meta = [&](int croot, std::vector<std::pair<int,int>>& meta) {
meta.clear();
meta.push_back({-1, -1});
Expand All @@ -598,7 +598,7 @@ static bool try_root_edge_moves_rescore(TreeState& tree, const DataSet& ds,
const int nL = static_cast<int>(metaL.size());
const int nR = static_cast<int>(metaR.size());

static thread_local TopoSnapshot snap;
TopoSnapshot snap;
save_topology(tree, snap);

auto rejoin = [&](int li, int ri) -> bool {
Expand Down Expand Up @@ -685,10 +685,10 @@ static bool try_root_edge_moves(TreeState& tree, const DataSet& ds,
const int rootjoin = fitch_indirect_length_cached(pL, pR, ds, INT_MAX);
const double base_split = best_score - rootjoin;

static thread_local std::vector<uint64_t> from_above;
static thread_local std::vector<uint64_t> rowsL, rowsR;
static thread_local std::vector<std::pair<int,int>> metaL, metaR;
static thread_local std::vector<std::pair<int,int>> edges;
std::vector<uint64_t> from_above;
std::vector<uint64_t> rowsL, rowsR;
std::vector<std::pair<int,int>> metaL, metaR;
std::vector<std::pair<int,int>> edges;
from_above.assign(static_cast<size_t>(tree.n_node) * tw, 0ULL);

// Build the rerooting state-sets for one half. Row 0 = identity (the half's
Expand Down Expand Up @@ -858,10 +858,14 @@ uint64_t exact_verify_cache_key(const TreeState& tree, const DataSet& ds) {
static bool exact_verify_sweep(TreeState& tree, const DataSet& ds,
double& best_score) {
const double eps = std::isfinite(ds.concavity) ? 1e-9 : 0.5;
static thread_local TopoSnapshot snap;
static thread_local std::vector<std::pair<int,int>> sub_edges;
static thread_local std::vector<char> in_sub;
static thread_local std::vector<int> dfs, marked;
// Plain locals (not thread_local): MinGW emutls thread_local teardown across
// std::thread spawn/exit corrupted the heap on the parallel path. Each worker
// owns its call frame, so plain locals are per-thread-safe; the per-clip
// (re)allocation is in the noise (measured <=1.6% on 88-tip data).
TopoSnapshot snap;
std::vector<std::pair<int,int>> sub_edges;
std::vector<char> in_sub;
std::vector<int> dfs, marked;

tree.build_postorder();
best_score = full_rescore(tree, ds); // sync to the current (converged) tree
Expand All @@ -878,10 +882,14 @@ static bool exact_verify_sweep(TreeState& tree, const DataSet& ds,
// hash(child-pairs) XOR dataset-fingerprint XOR weight-fingerprint; only the
// dataset-fingerprint is the clear-trigger (a true dataset switch), so
// base-regime entries survive across perturbation excursions and are reused.
static thread_local std::unordered_set<uint64_t> evs_false_cache;
static thread_local uint64_t evs_last_fp = 0;
// Memoization lives on the (per-worker) DataSet, NOT a function-local
// thread_local — MinGW emutls thread_local teardown across std::thread
// spawn/exit corrupted the heap. ds_local has the same per-thread,
// cross-replicate lifetime, so the cache's persistence is unchanged. See
// the evs_false_cache / evs_last_fp comment in ts_data.h.
std::unordered_set<uint64_t>& evs_false_cache = ds.evs_false_cache;
const uint64_t fp = ds_fingerprint(ds); // clear-trigger: a true dataset switch
if (fp != evs_last_fp) { evs_false_cache.clear(); evs_last_fp = fp; }
if (fp != ds.evs_last_fp) { evs_false_cache.clear(); ds.evs_last_fp = fp; }
const uint64_t cache_key = exact_verify_cache_key(tree, ds);

// TS_EV_AUDIT (dev/bench only): distrust cache hits. On a hit, run the full
Expand Down Expand Up @@ -1004,7 +1012,7 @@ static void compute_subtree_sizes(const TreeState& tree,
static void add_clip_internal_steps(const TreeState& tree, const DataSet& ds,
int clip_node,
std::vector<int>& char_steps) {
static thread_local std::vector<int> stack;
std::vector<int> stack;
stack.clear();
stack.push_back(clip_node);
while (!stack.empty()) {
Expand Down
Loading