diff --git a/src/ts_data.h b/src/ts_data.h index e5af113cd..5d28cdff3 100644 --- a/src/ts_data.h +++ b/src/ts_data.h @@ -13,6 +13,7 @@ #include #include #include +#include namespace ts { @@ -184,6 +185,24 @@ struct DataSet { // aggregated. Excludes NNI-warmup and annealing candidates (neither funnels // through tbr_search). mutable long long n_candidates_evaluated = 0; + + // Exact-verify optimum memoization (NA path; see exact_verify_sweep in + // ts_tbr.cpp). A topology certified a true unrooted-TBR optimum under the + // current weighting regime is cached here so repeated convergences — notably + // across the ratchet's regime excursions — skip the O(n^2) full-neighbourhood + // sweep. Keyed by hash(child-pairs) ^ dataset-fp ^ weight-fp; cleared when + // the dataset fingerprint changes. + // + // Lives on DataSet (NOT a function-local `static thread_local`) deliberately: + // each parallel worker owns a private `ds_local` copy for its whole lifetime, + // so this gives the same per-thread, cross-replicate persistence the old + // thread_local had — but WITHOUT MinGW emutls, whose thread_local teardown + // across std::thread spawn/exit corrupted the heap (parallel-only crash). + // `mutable` because the scorer takes `const DataSet&`. Single-writer per + // copy: workers touch only their own ds_local; the shared prototype's cache + // is written solely in the post-join (single-threaded) MPT phase. + mutable std::unordered_set evs_false_cache; + mutable uint64_t evs_last_fp = 0; }; // Build a DataSet from R-side data. diff --git a/src/ts_fitch.cpp b/src/ts_fitch.cpp index 4d62d3565..4339916d5 100644 --- a/src/ts_fitch.cpp +++ b/src/ts_fitch.cpp @@ -228,7 +228,7 @@ void fitch_incremental_uppass(TreeState& tree, const DataSet& ds, // per-clip heap allocation. thread_local keeps it per-thread-safe (each // search thread owns its TreeState); char avoids vector proxy-bit // access in the reverse scan below. assign() reuses capacity after warmup. 
- static thread_local std::vector dirty; + std::vector dirty; dirty.assign(tree.n_node, 0); // Mark root as dirty (we just updated it; its children need checking) diff --git a/src/ts_tbr.cpp b/src/ts_tbr.cpp index a5da3ef84..e2907aa48 100644 --- a/src/ts_tbr.cpp +++ b/src/ts_tbr.cpp @@ -133,7 +133,7 @@ static void collect_main_edges( { edges.clear(); - // Reusable per-thread DFS stack (Tier 1): avoids a heap alloc per clip. + // DFS stack: a plain local (no longer thread_local), so it is allocated afresh on each call. - static thread_local std::vector stack; + std::vector stack; stack.clear(); stack.push_back(tree.n_tip); @@ -161,7 +161,7 @@ static void collect_subtree_edges( if (subtree_root < tree.n_tip) return; - // Reusable per-thread DFS stack (Tier 1): avoids a heap alloc per clip. + // DFS stack: a plain local (no longer thread_local), so it is allocated afresh on each call. - static thread_local std::vector stack; + std::vector stack; stack.clear(); stack.push_back(subtree_root); @@ -214,10 +214,10 @@ static void compute_from_above( int tw = tree.total_words; - // Reusable per-thread scratch (Tier 1): avoids two heap allocs per clip. + // Traversal scratch: plain locals (no longer thread_local), so both are allocated afresh on each call. - static thread_local std::vector preorder; + std::vector preorder; preorder.clear(); { - static thread_local std::vector stack; + std::vector stack; stack.clear(); stack.push_back(subtree_root); while (!stack.empty()) { @@ -585,7 +585,7 @@ static bool try_root_edge_moves_rescore(TreeState& tree, const DataSet& ds, // Rerooting metadata for each half: identity {-1,-1} plus each internal edge // (sp,sc) with sp != croot (same distinct-rootings set as the EW path). 
- static thread_local std::vector> metaL, metaR, edges; + std::vector> metaL, metaR, edges; auto build_meta = [&](int croot, std::vector>& meta) { meta.clear(); meta.push_back({-1, -1}); @@ -598,7 +598,7 @@ static bool try_root_edge_moves_rescore(TreeState& tree, const DataSet& ds, const int nL = static_cast(metaL.size()); const int nR = static_cast(metaR.size()); - static thread_local TopoSnapshot snap; + TopoSnapshot snap; save_topology(tree, snap); auto rejoin = [&](int li, int ri) -> bool { @@ -685,10 +685,10 @@ static bool try_root_edge_moves(TreeState& tree, const DataSet& ds, const int rootjoin = fitch_indirect_length_cached(pL, pR, ds, INT_MAX); const double base_split = best_score - rootjoin; - static thread_local std::vector from_above; - static thread_local std::vector rowsL, rowsR; - static thread_local std::vector> metaL, metaR; - static thread_local std::vector> edges; + std::vector from_above; + std::vector rowsL, rowsR; + std::vector> metaL, metaR; + std::vector> edges; from_above.assign(static_cast(tree.n_node) * tw, 0ULL); // Build the rerooting state-sets for one half. Row 0 = identity (the half's @@ -858,10 +858,14 @@ uint64_t exact_verify_cache_key(const TreeState& tree, const DataSet& ds) { static bool exact_verify_sweep(TreeState& tree, const DataSet& ds, double& best_score) { const double eps = std::isfinite(ds.concavity) ? 1e-9 : 0.5; - static thread_local TopoSnapshot snap; - static thread_local std::vector> sub_edges; - static thread_local std::vector in_sub; - static thread_local std::vector dfs, marked; + // Plain locals (not thread_local): MinGW emutls thread_local teardown across + // std::thread spawn/exit corrupted the heap on the parallel path. Each worker + // owns its call frame, so plain locals are per-thread-safe; the per-clip + // (re)allocation is in the noise (measured <=1.6% on 88-tip data). 
+ TopoSnapshot snap; + std::vector> sub_edges; + std::vector in_sub; + std::vector dfs, marked; tree.build_postorder(); best_score = full_rescore(tree, ds); // sync to the current (converged) tree @@ -878,10 +882,14 @@ static bool exact_verify_sweep(TreeState& tree, const DataSet& ds, // hash(child-pairs) XOR dataset-fingerprint XOR weight-fingerprint; only the // dataset-fingerprint is the clear-trigger (a true dataset switch), so // base-regime entries survive across perturbation excursions and are reused. - static thread_local std::unordered_set evs_false_cache; - static thread_local uint64_t evs_last_fp = 0; + // Memoization lives on the (per-worker) DataSet, NOT a function-local + // thread_local — MinGW emutls thread_local teardown across std::thread + // spawn/exit corrupted the heap. ds_local has the same per-thread, + // cross-replicate lifetime, so the cache's persistence is unchanged. See + // the evs_false_cache / evs_last_fp comment in ts_data.h. + std::unordered_set& evs_false_cache = ds.evs_false_cache; const uint64_t fp = ds_fingerprint(ds); // clear-trigger: a true dataset switch - if (fp != evs_last_fp) { evs_false_cache.clear(); evs_last_fp = fp; } + if (fp != ds.evs_last_fp) { evs_false_cache.clear(); ds.evs_last_fp = fp; } const uint64_t cache_key = exact_verify_cache_key(tree, ds); // TS_EV_AUDIT (dev/bench only): distrust cache hits. On a hit, run the full @@ -1004,7 +1012,7 @@ static void compute_subtree_sizes(const TreeState& tree, static void add_clip_internal_steps(const TreeState& tree, const DataSet& ds, int clip_node, std::vector& char_steps) { - static thread_local std::vector stack; + std::vector stack; stack.clear(); stack.push_back(clip_node); while (!stack.empty()) {