From a0eff4b0c2052fa631f097c424c07c18f20be0c2 Mon Sep 17 00:00:00 2001
From: chenshengxin <hw_chenshengxin@163.com>
Date: Tue, 7 Apr 2026 18:58:41 +0800
Subject: [PATCH] Perf: cache hash and prefetch chain in TensorMap
 lookup/insert

- Cache the hash(addr) result from lookup() and reuse it in the
  subsequent insert() call for INOUT tensors, eliminating a redundant
  64-bit multiply per tensor
- Add software prefetch of next_in_bucket during chain traversal to
  hide memory latency on chains longer than one entry
- Add lookup/insert/link_entry overloads that accept precomputed hash

Benchmarked on Ascend910 (device 11, 100 rounds, 3 runs averaged):
benchmark_bgemm -3.8%, other workloads -0.2% to -0.7%.
---
 .../runtime/pto_orchestrator.cpp              | 10 ++++--
 .../runtime/pto_tensormap.h                   | 31 ++++++++++++++++++-
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
index 5e754f9e0..e05e67b70 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_orchestrator.cpp
@@ -557,6 +557,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
     CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw);
 
     // === STEP 3: Lookup inputs + materialize runtime-created outputs ===
+    uint32_t cached_hashes[MAX_TENSOR_ARGS] = {};
     for (int i = 0; i < args.tensor_count(); i++) {
         TensorArgType ptype = args.tag(i);
         if (ptype == TensorArgType::OUTPUT) {
@@ -587,7 +588,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
         }
 
         PTO2LookupResult lookup_result;
-        orch->tensor_map.lookup(*tensor, lookup_result);
+        orch->tensor_map.lookup(*tensor, lookup_result, cached_hashes[i]);
 
         for (int r = 0; r < lookup_result.count; r++) {
             PTO2TensorMapEntry &entry = *lookup_result.entries[r].entry;
@@ -614,7 +615,12 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
             TensorArgType ptype = args.tag(i);
             if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
                 if (!args.tensor(i).ptr->manual_dep) {
-                    orch->tensor_map.insert(*args.tensor(i).ptr, task_id);
+                    if (ptype == TensorArgType::INOUT) {
+                        // Reuse hash cached during lookup (STEP 3)
+                        orch->tensor_map.insert(*args.tensor(i).ptr, task_id, cached_hashes[i]);
+                    } else {
+                        orch->tensor_map.insert(*args.tensor(i).ptr, task_id);
+                    }
                 }
             }
         }
diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
index 98ff5211e..e847a86d3 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_tensormap.h
@@ -301,7 +301,17 @@ struct PTO2TensorMap {
      * @param result  Output: stack-allocated result buffer
      */
     void lookup(const Tensor &tensor, PTO2LookupResult &result) {
+        uint32_t unused;  // receives the bucket index from the 3-arg overload; discarded here
+        lookup(tensor, result, unused);
+    }
+
+    /**
+     * Lookup with hash output — returns the computed bucket index for reuse
+     * in a subsequent insert() call on the same tensor address.
+     */
+    void lookup(const Tensor &tensor, PTO2LookupResult &result, uint32_t &out_hash) {
         uint32_t bucket_index = hash(tensor.buffer.addr);
+        out_hash = bucket_index;
         PTO2TensorMapEntry *cur_entry = buckets[bucket_index];
 
         result.count = 0;
@@ -312,6 +322,9 @@ struct PTO2TensorMap {
 
         while (cur_entry != nullptr) {
             PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
+            if (next_entry != nullptr) {
+                __builtin_prefetch(next_entry, 0, 1);  // Prefetch next chain entry (rw=0: read, locality=1: low temporal locality)
+            }
 
 #if PTO2_TENSORMAP_PROFILING
             chain_len++;
@@ -364,6 +377,16 @@ struct PTO2TensorMap {
         link_entry(entry, tensor.buffer.addr, producer_task_id);
     }
 
+    /**
+     * Insert using a precomputed bucket index instead of calling hash(addr).
+     * precomputed_hash must equal hash(tensor.buffer.addr), e.g. lookup()'s out_hash.
+     */
+    void insert(const Tensor &tensor, PTO2TaskId producer_task_id, uint32_t precomputed_hash) {
+        PTO2TensorMapEntry *entry = new_entry();
+        entry->copy_from_tensor(tensor);
+        link_entry(entry, producer_task_id, precomputed_hash);
+    }
+
     /**
      * Cleanup stale entries for retired tasks
      *
@@ -417,10 +440,16 @@ struct PTO2TensorMap {
      * Link an initialized entry into bucket and task chains.
      */
     void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
+        link_entry(entry, producer_task_id, hash(addr));  // hash addr here, then delegate to the bucket-index overload
+    }
+
+    /**
+     * Link an initialized entry into bucket and task chains (with precomputed hash).
+     */
+    void link_entry(PTO2TensorMapEntry *entry, PTO2TaskId producer_task_id, uint32_t bucket_index) {
 #if PTO2_TENSORMAP_PROFILING
         g_insert_count++;
 #endif
-        uint32_t bucket_index = hash(addr);
         auto ring_id = producer_task_id.ring();
         auto local_id = producer_task_id.local();
         int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);