Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions esp-dl/dl/base/isa/esp32p4/dl_base_esp32p4.h
Original file line number Diff line number Diff line change
Expand Up @@ -832,4 +832,8 @@ void dl_esp32p4_dotprod_i16k8o16(int16_t *output_ptr, int8_t *input0_ptr, int16_
void dl_esp32p4_dotprod_i8k8o16(int16_t *out2_int8, int8_t *in1_int8, int8_t *in2_int8, int shift, int n);

void dl_esp32p4_s8_avg_pool2d_hwc_sum(int32_t *buffer_ptr, int8_t *input_ptr, void *args_ptr);

/**
 * INT8 HardSiLU (/8 variant) kernel, implemented in dl_esp32p4_s8_hard_silu8.S.
 *
 * Processes n_16 chunks of 16 int8 elements from input_ptr into output_ptr.
 *
 * @param output_ptr  destination int8 buffer (16 bytes written per chunk)
 * @param input_ptr   source int8 buffer (16 bytes read per chunk)
 * @param n_16        number of 16-element chunks; 0 returns without storing
 * @param half_buf    16-byte aligned buffer of 8 int16, broadcast of the gate
 *                    offset 2^(-e1+2) (e1 = input exponent)
 * @param clamp_hi    upper gate clamp bound, 2^(-e1+3)
 * @param sar_total   final right shift, -2*e1 + 3 + e2 + 8 (e2 = output exponent)
 * @param scale_buf   16-byte aligned buffer of 8 int16, broadcast of scale_int
 */
void dl_esp32p4_s8_hard_silu8(int8_t *output_ptr, int8_t *input_ptr, int n_16,
int16_t *half_buf, int clamp_hi, int sar_total,
int16_t *scale_buf);
}
104 changes: 104 additions & 0 deletions esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s16_lut_nearest_neighbor.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
.text
.global dl_esp32p4_s16_lut_nearest_neighbor
.type dl_esp32p4_s16_lut_nearest_neighbor, @function
.balign 4
.option norvc
// ===================================================================
// Author: Boumedine Billal (https://github.com/BoumedineBillal)
//
// dl_esp32p4_s16_lut_nearest_neighbor (v1 - interleaved pipeline)
//
// INT16 LUT activation: SIMD load/store of 8 elements per iteration,
// scalar table lookups, no interpolation (nearest-neighbor).
//
// Per element:
//     idx    = ((uint16_t)(x ^ 0x8000) * 1) >> shift     (esp.vmul.u16)
//     out[i] = table[idx]
// The shift is performed by esp.vmul.u16 against a vector of ones, so
// the index is rounded by the PIE multiplier (HALF_EVEN according to
// the original author's notes; this routine does not configure the
// rounding mode itself).
// NOTE(review): with rounding, the largest index can reach
// 2^(16 - shift), so the table presumably needs 2^(16 - shift) + 1
// entries -- confirm against the caller that builds the table.
//
// v1 optimization: 4-element interleaving for ILP.
//   - Extract 4 indices back-to-back (no dependencies)
//   - Compute 4 addresses back-to-back (no dependencies)
//   - Issue 4 loads back-to-back
//   - Insert 4 values back-to-back (no dependencies)
//
// Arguments:
//   a0: int16_t *output_ptr
//   a1: int16_t *input_ptr
//   a2: int      n_8        (number of 8-element chunks; <= 0 is a no-op)
//   a3: int16_t *table_ptr
//   a4: int16_t *ones_buf   (16B-aligned, broadcast(1))
//   a5: int16_t *xor_buf    (16B-aligned, broadcast(0x8000))
//   a6: int      shift      (SAR = log2(step))
//
// Register allocation (after setup):
//   a0             -- output ptr (auto-inc by 16 bytes per iteration)
//   a1             -- input ptr  (auto-inc by 16 bytes per iteration)
//   a4             -- table base (ones_buf pointer is consumed in setup)
//   t3             -- loop counter
//   a2, a3, a5, t4, t5, t6 -- scratch for the 4-way interleave
//                    (indices, addresses, loaded table values)
//   q0             -- input vector / index vector
//   q1             -- output vector being assembled
//   q5, q7         -- constants: broadcast(1), broadcast(0x8000)
//
// Clobbers: a0-a5, t3-t6, q0, q1, q5, q7, SAR. No stack usage.
// When n_8 <= 0 the function returns before modifying any of them.
// ===================================================================
dl_esp32p4_s16_lut_nearest_neighbor:
    // -- Guard --
    // The loop below is bottom-tested (decrement, then bnez). Entering it
    // with a count of 0 would wrap the counter and run ~2^32 iterations,
    // reading and writing far out of bounds. Bail out before touching any
    // state when there is nothing to do.
    blez            a2, .L_lut_done         // n_8 <= 0 -> nothing to process

    // -- Setup --
    esp.vld.128.ip  q5, a4, 0               // q5 = broadcast(1)
    esp.vld.128.ip  q7, a5, 0               // q7 = broadcast(0x8000)
    mv              t4, a6                  // t4 = shift (a6 not PIE-addressable)
    esp.movx.w.sar  t4                      // SAR = shift
    mv              t3, a2                  // t3 = n_8 (a2 becomes scratch)
    mv              a4, a3                  // a4 = table_base (a3 becomes scratch)

    // -- Software loop: one iteration = 8 int16 elements --
.L_lut_loop:
    // SIMD: load 8 int16 and turn them into table indices
    esp.vld.128.ip  q0, a1, 16              // q0 = input[0..7], a1 += 16
    esp.xorq        q0, q0, q7              // signed -> unsigned: flip sign bit (+32768)
    esp.vmul.u16    q0, q0, q5              // q0 = (q0 * 1) >> SAR = rounded index

    // ---- Wave 1: elements 0-3 (4-way interleave) ----
    // Extract 4 indices (all independent)
    esp.movi.16.a   q0, a2, 0               // idx[0] -> a2
    esp.movi.16.a   q0, a5, 1               // idx[1] -> a5
    esp.movi.16.a   q0, t5, 2               // idx[2] -> t5
    esp.movi.16.a   q0, t6, 3               // idx[3] -> t6
    // Compute 4 addresses; each destination is a register whose index
    // has already been consumed, so no value is overwritten early.
    esp.addx2       a3, a4, a2              // addr[0] = table + idx[0]*2
    esp.addx2       a2, a4, a5              // addr[1] = table + idx[1]*2
    esp.addx2       a5, a4, t5              // addr[2] = table + idx[2]*2
    esp.addx2       t4, a4, t6              // addr[3] = table + idx[3]*2
    // Load 4 table values back-to-back
    lh              t5, 0(a3)               // val[0]
    lh              t6, 0(a2)               // val[1]
    lh              a3, 0(a5)               // val[2]
    lh              a2, 0(t4)               // val[3]
    // Insert 4 values into lanes 0-3 of the output register
    esp.movi.16.q   q1, t5, 0
    esp.movi.16.q   q1, t6, 1
    esp.movi.16.q   q1, a3, 2
    esp.movi.16.q   q1, a2, 3

    // ---- Wave 2: elements 4-7 (same register rotation as wave 1) ----
    // Extract 4 indices
    esp.movi.16.a   q0, a2, 4               // idx[4] -> a2
    esp.movi.16.a   q0, a5, 5               // idx[5] -> a5
    esp.movi.16.a   q0, t5, 6               // idx[6] -> t5
    esp.movi.16.a   q0, t6, 7               // idx[7] -> t6
    // Compute 4 addresses
    esp.addx2       a3, a4, a2              // addr[4]
    esp.addx2       a2, a4, a5              // addr[5]
    esp.addx2       a5, a4, t5              // addr[6]
    esp.addx2       t4, a4, t6              // addr[7]
    // Load 4 table values
    lh              t5, 0(a3)               // val[4]
    lh              t6, 0(a2)               // val[5]
    lh              a3, 0(a5)               // val[6]
    lh              a2, 0(t4)               // val[7]
    // Insert 4 values into lanes 4-7
    esp.movi.16.q   q1, t5, 4
    esp.movi.16.q   q1, t6, 5
    esp.movi.16.q   q1, a3, 6
    esp.movi.16.q   q1, a2, 7

    // SIMD: store 8 results
    esp.vst.128.ip  q1, a0, 16              // output[0..7] = q1, a0 += 16
    addi            t3, t3, -1              // one chunk done
    bnez            t3, .L_lut_loop         // invariant: t3 > 0 on loop entry

.L_lut_done:
    ret

.size dl_esp32p4_s16_lut_nearest_neighbor, .-dl_esp32p4_s16_lut_nearest_neighbor
130 changes: 130 additions & 0 deletions esp-dl/dl/base/isa/esp32p4/dl_esp32p4_s8_hard_silu8.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
// ===================================================================
// Author: Boumedine Billal (https://github.com/BoumedineBillal)
//
// Contribution (as stated by the author):
// - Designed the HardSiluPie8 activation: a custom HardSiLU variant
//   using /8 (2^3) instead of /6, so the divisor becomes a pure shift
//   and needs no multiply. The author reports the same accuracy as the
//   /6 variant.
// - Derived the integer-domain formula from the float definition,
//   mapping all constants to powers of 2 for PIE SIMD compatibility.
// - Wrote the SIMD kernel using 6 vector registers (q0-q5).
// - Reports bit-exact validation on ESP32-P4 silicon (256/256 INT8
//   values) using the P4-JIT hardware debugging framework
//   (https://github.com/BoumedineBillal/esp32-p4-jit).
// ===================================================================
//
// ===================================================================
// dl_esp32p4_s8_hard_silu8
//
// Float:   y = x * clamp(x/8 + 0.5, 0, 1) * (scale_int / 256)
//
// Integer (e1 = input exponent, e2 = output exponent):
//   xs_q   = x_q * scale_int                        (SAR = 0, no shift)
//   gate_q = clamp(x_q + 2^(-e1+2), 0, 2^(-e1+3))
//   y_q    = (xs_q * gate_q) >> SAR_total           (the only rounding step)
//   SAR_total = -2*e1 + 3 + e2 + 8
//     The "+ 8" absorbs the scale_int / 256 factor; the remainder is the
//     HardSiLU shift. The caller must pass the complete value in a5;
//     this routine does not add anything to it.
//
// Work per loop iteration (16 INT8 elements): 16 PIE (esp.*) instructions
// plus the scalar decrement and branch. Uses 6 vector registers (q0-q5).
// Arithmetic is done in the INT16 domain: the INT8 input is sign-extended
// (esp.vext.s8), processed with vadd / vsat / vmul, saturated to
// [-128, 127], then packed back to INT8 with esp.vunzip.8.
//
// Rounding: the author's notes state that esp.vmul.s16 rounds HALF_EVEN
// (banker's rounding) as selected through the PIE CFG register. This
// routine does not write that configuration itself.
// NOTE(review): confirm the caller / runtime sets the rounding mode.
//
// Overflow handling (author's bounds):
//   xs = x * scale_int with SAR = 0:
//        127 * 255 = 32,385 and -128 * 255 = -32,640, both fit INT16.
//        NOTE(review): assumes scale_int is within [0, 255] -- confirm.
//   xs * gate: 32,385 * 1024 = ~33M, fits INT32 (< 2^31).
//        NOTE(review): assumes clamp_hi <= 1024 -- confirm.
//   >> SAR_total: the result may exceed the INT8 range when e2 < e1
//        (finer output scale), so the kernel saturates explicitly with
//        esp.vsat.s16 to [-128, 127] before the esp.vunzip.8 pack.
//
// Arguments (RISC-V calling convention):
//   a0: int8_t  *output_ptr
//   a1: int8_t  *input_ptr
//   a2: int      n_16       (number of 16-element chunks; 0 returns
//                            without storing anything)
//   a3: int16_t *half_buf   (16-byte aligned, 8x INT16 broadcast of 2^(-e1+2))
//   a4: int      clamp_hi   (upper clamp bound = 2^(-e1+3))
//   a5: int      sar_total  (SAR shift = -2*e1 + 3 + e2 + 8)
//   a6: int16_t *scale_buf  (16-byte aligned, 8x INT16 broadcast of scale_int)
//
// GP register roles:
//   a0 (x10) -- output pointer, auto-incremented by 16 per iteration
//   a1 (x11) -- input pointer, auto-incremented by 16 per iteration
//   a2 (x12) -- n_16 on entry; after setup clamp_lo = 0 (gate clamping)
//   a3 (x13) -- half_buf pointer (setup only)
//   a4 (x14) -- clamp_hi (gate clamping)
//   a5 (x15) -- sar_total
//   a6 (x16) -- scale_buf pointer; copied to t4 before it is used as the
//               base of a PIE load (setup only)
//   t3 (x28) -- loop counter
//   t4 (x29) -- scale_buf pointer during setup, then 0 (SAR value for the
//               scale pre-multiply)
//   t5 (x30) -- INT8_MIN = -128 (output saturation)
//   t6 (x31) -- INT8_MAX =  127 (output saturation)
//
// Vector registers:
//   q0 -- input / gate_lo / y_lo / packed output
//   q1 -- x_lo / xs_lo
//   q2 -- x_hi / xs_hi / y_hi
//   q3 -- const half_offset (loaded once)
//   q4 -- const scale_int broadcast (loaded once)
//   q5 -- gate_hi
//
// Clobbers: a0-a2, t3-t6, q0-q5, SAR. No stack usage.
// ===================================================================

.text
.global dl_esp32p4_s8_hard_silu8
.type dl_esp32p4_s8_hard_silu8, @function
.balign 4
.option norvc

dl_esp32p4_s8_hard_silu8:

// -- Setup (once) --------------------------------------------------
esp.vld.128.ip q3, a3, 0 // q3 = broadcast(half_offset)
mv t4, a6 // t4 = scale_buf ptr (a6 is not used directly as a PIE base)
esp.vld.128.ip q4, t4, 0 // q4 = broadcast(scale_int)
li t4, 0 // t4 = 0: SAR value for the scale pre-multiply (no shift)
mv t3, a2 // t3 = n_16 (loop count; frees a2)
li a2, 0 // a2 = clamp_lo = 0
li t5, -128 // t5 = INT8_MIN (output saturation lower bound)
li t6, 127 // t6 = INT8_MAX (output saturation upper bound)
// a4 still holds clamp_hi
// a5 still holds sar_total

// -- Software loop (branch-based, bottom-tested) --------------------
// The guard is required: the loop decrements before testing, so entering
// it with a count of 0 would wrap the counter.
beqz t3, .L_hard_silu_done // skip if n_16 == 0

.L_hard_silu_loop:
esp.vld.128.ip q0, a1, 16 // load 16 x INT8, a1 += 16
esp.vext.s8 q1, q2, q0 // sign-extend INT8 -> INT16: x_lo = q1, x_hi = q2
// Compute BOTH gates from the original x first: q1/q2 are overwritten
// by the scale pre-multiply below.
esp.vadd.s16 q0, q1, q3 // gate_lo = x_lo + half -> q0
esp.vsat.s16 q0, q0, a2, a4 // clamp gate_lo to [0, clamp_hi]
esp.vadd.s16 q5, q2, q3 // gate_hi = x_hi + half -> q5
esp.vsat.s16 q5, q5, a2, a4 // clamp gate_hi to [0, clamp_hi]
// Scale pre-multiply: xs = x * scale_int with SAR = 0 (no shift, no rounding).
// SAR must be reset every iteration because it is changed to sar_total below.
esp.movx.w.sar t4 // SAR = 0
esp.vmul.s16 q1, q1, q4 // xs_lo = x_lo * scale_int >> 0
esp.vmul.s16 q2, q2, q4 // xs_hi = x_hi * scale_int >> 0
// Final multiply: y = (xs * gate) >> SAR_total (the single rounding step)
esp.movx.w.sar a5 // SAR = sar_total
esp.vmul.s16 q0, q1, q0 // y_lo = xs_lo * gate_lo >> SAR_total
esp.vmul.s16 q2, q2, q5 // y_hi = xs_hi * gate_hi >> SAR_total
// Saturate to the INT8 range before packing (y can exceed it when e2 < e1)
esp.vsat.s16 q0, q0, t5, t6 // clamp y_lo to [-128, 127]
esp.vsat.s16 q2, q2, t5, t6 // clamp y_hi to [-128, 127]
// Pack + store
esp.vunzip.8 q0, q2 // pack INT16 -> INT8 into q0 (safe after saturation)
esp.vst.128.ip q0, a0, 16 // store 16 x INT8, a0 += 16
addi t3, t3, -1 // one chunk done
bnez t3, .L_hard_silu_loop

.L_hard_silu_done:
ret

.size dl_esp32p4_s8_hard_silu8, .-dl_esp32p4_s8_hard_silu8

Loading