Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
CYCLE_COUNT_LAP_RECORD(g_orch_sync_cycle, AicpuPhaseId::ORCH_SYNC, task_id.raw);

// === STEP 3: Lookup inputs + materialize runtime-created outputs ===
uint32_t cached_hashes[MAX_TENSOR_ARGS] = {};
for (int i = 0; i < args.tensor_count(); i++) {
TensorArgType ptype = args.tag(i);
if (ptype == TensorArgType::OUTPUT) {
Expand Down Expand Up @@ -587,7 +588,7 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
}

PTO2LookupResult lookup_result;
orch->tensor_map.lookup(*tensor, lookup_result);
orch->tensor_map.lookup(*tensor, lookup_result, cached_hashes[i]);

for (int r = 0; r < lookup_result.count; r++) {
PTO2TensorMapEntry &entry = *lookup_result.entries[r].entry;
Expand All @@ -614,7 +615,12 @@ pto2_submit_mixed_task(PTO2OrchestratorState *orch, const MixedKernels &mixed_ke
TensorArgType ptype = args.tag(i);
if (ptype == TensorArgType::INOUT || ptype == TensorArgType::OUTPUT_EXISTING) {
if (!args.tensor(i).ptr->manual_dep) {
orch->tensor_map.insert(*args.tensor(i).ptr, task_id);
if (ptype == TensorArgType::INOUT) {
// Reuse hash cached during lookup (STEP 3)
orch->tensor_map.insert(*args.tensor(i).ptr, task_id, cached_hashes[i]);
} else {
orch->tensor_map.insert(*args.tensor(i).ptr, task_id);
}
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -301,7 +301,17 @@ struct PTO2TensorMap {
* @param result Output: stack-allocated result buffer
*/
void lookup(const Tensor &tensor, PTO2LookupResult &result) {
uint32_t unused;
lookup(tensor, result, unused);
}

/**
* Lookup with hash output — returns the computed bucket index for reuse
* in a subsequent insert() call on the same tensor address.
*/
void lookup(const Tensor &tensor, PTO2LookupResult &result, uint32_t &out_hash) {
uint32_t bucket_index = hash(tensor.buffer.addr);
out_hash = bucket_index;
PTO2TensorMapEntry *cur_entry = buckets[bucket_index];

result.count = 0;
Expand All @@ -312,6 +322,9 @@ struct PTO2TensorMap {

while (cur_entry != nullptr) {
PTO2TensorMapEntry *next_entry = cur_entry->next_in_bucket;
if (next_entry != nullptr) {
__builtin_prefetch(next_entry, 0, 1); // Prefetch next entry's cache line 1 (read, moderate locality)
}

#if PTO2_TENSORMAP_PROFILING
chain_len++;
Expand Down Expand Up @@ -364,6 +377,16 @@ struct PTO2TensorMap {
link_entry(entry, tensor.buffer.addr, producer_task_id);
}

/**
* Insert with precomputed hash — avoids recomputing hash(addr) when
* the caller already has it from a prior lookup() on the same address.
*/
void insert(const Tensor &tensor, PTO2TaskId producer_task_id, uint32_t precomputed_hash) {
PTO2TensorMapEntry *entry = new_entry();
entry->copy_from_tensor(tensor);
link_entry(entry, producer_task_id, precomputed_hash);
}

/**
* Cleanup stale entries for retired tasks
*
Expand Down Expand Up @@ -417,10 +440,16 @@ struct PTO2TensorMap {
* Link an initialized entry into bucket and task chains.
*/
// Address-based variant: derives the bucket index from the buffer address
// and forwards to the precomputed-hash overload.
void link_entry(PTO2TensorMapEntry *entry, uint64_t addr, PTO2TaskId producer_task_id) {
    const uint32_t bucket = hash(addr);
    link_entry(entry, producer_task_id, bucket);
}

/**
* Link an initialized entry into bucket and task chains (with precomputed hash).
*/
void link_entry(PTO2TensorMapEntry *entry, PTO2TaskId producer_task_id, uint32_t bucket_index) {
#if PTO2_TENSORMAP_PROFILING
g_insert_count++;
#endif
uint32_t bucket_index = hash(addr);
auto ring_id = producer_task_id.ring();
auto local_id = producer_task_id.local();
int32_t task_slot = local_id & (task_window_sizes[ring_id] - 1);
Expand Down
Loading