From a0eff4b0c2052fa631f097c424c07c18f20be0c2 Mon Sep 17 00:00:00 2001 From: chenshengxin Date: Tue, 7 Apr 2026 18:58:41 +0800 Subject: [PATCH] Perf: cache hash and prefetch chain in TensorMap lookup/insert - Cache the hash(addr) result from lookup() and reuse it in the subsequent insert() call for INOUT tensors, eliminating a redundant 64-bit multiply per tensor - Add software prefetch of next_in_bucket during chain traversal to hide memory latency on chains longer than one entry - Add lookup/insert/link_entry overloads that accept precomputed hash Benchmarked on Ascend910 (device 11, 100 rounds, 3 runs averaged): benchmark_bgemm -3.8%, other workloads -0.2% to -0.7%. --- .../runtime/pto_orchestrator.cpp | 10 ++++-- .../runtime/pto_tensormap.h | 31 ++++++++++++++++++- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp index 5e754f9e0..e05e67b70 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp @@ -557,6 +557,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw); // === STEP 3: Lookup inputs + materialize runtime-created outputs === + uint32_t cached_hashes[MAX_TENSOR_ARGS] = {}; for (int i = 0; i < args.tensor_count(); i++) { TensorArgType ptype = args.tag(i); if (ptype == TensorArgType::OUTPUT) { @@ -587,7 +588,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke } PTO2LookupResult lookup_result; - orch->tensor_map.lookup(*tensor, lookup_result); + orch->tensor_map.lookup(*tensor, lookup_result, cached_hashes[i]); for (int r = 0; r < lookup_result.count; r++) { PTO2TensorMapEntry &entry = *lookup_result.entries[r].entry; @@ -614,7 +615,12 @@ 
pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke TensorArgType ptype = args.tag(i); if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) { if (!args.tensor(i).ptr->manual_dep) { - orch->tensor_map.insert(*args.tensor(i).ptr, task_id); + if (ptype == TensorArgType::INOUT) { + // Reuse hash cached during lookup (STEP 3) + orch->tensor_map.insert(*args.tensor(i).ptr, task_id, cached_hashes[i]); + } else { + orch->tensor_map.insert(*args.tensor(i).ptr, task_id); + } } } } diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h index 98ff5211e..e847a86d3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h @@ -301,7 +301,17 @@ struct PTO2TensorMap { * @param result Output: stack-allocated result buffer */ void lookup(const Tensor &tensor, PTO2LookupResult &result) { + uint32_t unused; + lookup(tensor, result, unused); + } + + /** + * Lookup with hash output — returns the computed bucket index for reuse + * in a subsequent insert() call on the same tensor address. 
+ */ + void lookup(const Tensor &tensor, PTO2LookupResult &result, uint32_t &out_hash) { uint32_t bucket_index = hash(tensor.buffer.addr); + out_hash = bucket_index; PTO2TensorMapEntry *cur_entry = buckets[bucket_index]; result.count = 0; @@ -312,6 +322,9 @@ struct PTO2TensorMap { while (cur_entry != nullptr) { PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket; + if (next_entry != nullptr) { + __builtin_prefetch(next_entry, 0, 1); // Prefetch the next chain entry (rw=0: read, locality=1: low temporal locality) + } #if PTO2_TENSORMAP_PROFILING chain_len++; @@ -364,6 +377,16 @@ struct PTO2TensorMap { link_entry(entry, tensor.buffer.addr, producer_task_id); } + /** + * Insert with precomputed hash — avoids recomputing hash(addr) when + * the caller already has it from a prior lookup() on the same address. + */ + void insert(const Tensor &tensor, PTO2TaskId producer_task_id, uint32_t precomputed_hash) { + PTO2TensorMapEntry *entry = new_entry(); + entry->copy_from_tensor(tensor); + link_entry(entry, producer_task_id, precomputed_hash); + } + /** * Cleanup stale entries for retired tasks * @@ -417,10 +440,16 @@ struct PTO2TensorMap { * Link an initialized entry into bucket and task chains. */ void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) { + link_entry(entry, producer_task_id, hash(addr)); + } + + /** + * Link an initialized entry into bucket and task chains (with precomputed hash). + */ + void link_entry(PTO2TensorMapEntry *entry, PTO2TaskId producer_task_id, uint32_t bucket_index) { #if PTO2_TENSORMAP_PROFILING g_insert_count++; #endif - uint32_t bucket_index = hash(addr); auto ring_id = producer_task_id.ring(); auto local_id = producer_task_id.local(); int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);