diff --git a/scripts/chart.py b/scripts/chart.py index 3456115..de18eac 100644 --- a/scripts/chart.py +++ b/scripts/chart.py @@ -112,33 +112,63 @@ def parse_case_name(name: str) -> tuple[str, int, bool] | None: return position, length, case_insensitive +def _extract_rates_from_entry(entry: dict) -> dict[str, float]: + """Extract normalized rates for all categories from a single JSON entry.""" + rates: dict[str, float] = {} + + for case in entry.get("cases", []): + name = case.get("name", "") + rate = case.get("rate") + + if not isinstance(rate, (int, float)): + continue + + parsed = parse_case_name(name) + if not parsed: + continue + + position, length, case_insensitive = parsed + ci_str = "ci" if case_insensitive else "cs" + category = f"{position} 5 {ci_str}" + + rates[category] = normalize_rate(rate, length, case_insensitive) + + return rates + + def extract_rates(entries: list[dict], title: str) -> dict[str, float]: """Extract normalized rates for each category from benchmark entries.""" for entry in entries: if entry.get("title") != title: continue + return _extract_rates_from_entry(entry) - rates = {} - for case in entry.get("cases", []): - name = case.get("name", "") - rate = case.get("rate") - - if not isinstance(rate, (int, float)): - continue + return {} - parsed = parse_case_name(name) - if not parsed: - continue - position, length, case_insensitive = parsed - ci_str = "ci" if case_insensitive else "cs" - category = f"{position} 5 {ci_str}" +def extract_latest_new_rates(entries: list[dict]) -> dict[str, float]: + """ + Extract normalized rates for the "new" implementation as + the latest entry (by timestamp) per device, excluding the + old implementation baseline. + """ + latest_entry: dict | None = None + latest_ts: float | None = None - rates[category] = normalize_rate(rate, length, case_insensitive) + for entry in entries: + if entry.get("title") == OLD_IMPL: + continue + ts = entry.get("timestamp") + if not isinstance(ts, (int, float)): + continue + if latest_ts is None or ts > latest_ts: + latest_ts = ts + latest_entry = entry - return rates + if latest_entry is None: + return {} - return {} + return _extract_rates_from_entry(latest_entry) def build_benchmark_data(raw_data: dict) -> list[BenchmarkResult]: @@ -147,7 +177,10 @@ def build_benchmark_data(raw_data: dict) -> list[BenchmarkResult]: for device, entries in raw_data.items(): old_rates = extract_rates(entries, OLD_IMPL) - new_rates = extract_rates(entries, NEW_IMPL) + # For the "new" implementation, always take the latest + # entry per device (by timestamp), so charts reflect the + # most recent benchmark run. + new_rates = extract_latest_new_rates(entries) if not old_rates or not new_rates: continue diff --git a/src/kernel.cl b/src/kernel.cl index ee62a0f..b6aec1b 100644 --- a/src/kernel.cl +++ b/src/kernel.cl @@ -1,3 +1,5 @@ +// --- SHA256 Definitions and Optimizations --- + #define F1(x,y,z) (bitselect(z,y,x)) #define F0(x,y,z) (bitselect (x, y, ((x) ^ (z)))) #define shr32(x,n) ((x) >> (n)) @@ -8,8 +10,13 @@ #define S2(x) (rotl32 ((x), 30u) ^ rotl32 ((x), 19u) ^ rotl32 ((x), 10u)) #define S3(x) (rotl32 ((x), 26u) ^ rotl32 ((x), 21u) ^ rotl32 ((x), 7u)) -#define SWAP(val) (rotate(((val) & 0x00FF00FF), 24U) | rotate(((val) & 0xFF00FF00), 8U)); +// Optimization: Efficient Byte Swapping using vector shuffling. +#define SWAP(val) (as_uint(as_uchar4(val).wzyx)) + +// Optimization: Macro to extract byte k from a LE uint array. +#define GET_BYTE_LE_ARRAY(arr, k) ((uchar)((arr[(k) >> 2] >> (((k) & 3) << 3)) & 0xFF)) +// SHA256 Constants (Used as immediate values) #define SHA256C00 0x428a2f98u #define SHA256C01 0x71374491u #define SHA256C02 0xb5c0fbcfu @@ -75,39 +82,32 @@ #define SHA256C3e 0xbef9a3f7u #define SHA256C3f 0xc67178f2u -__constant uint k_sha256[64] = -{ - SHA256C00, SHA256C01, SHA256C02, SHA256C03, - SHA256C04, SHA256C05, SHA256C06, SHA256C07, - SHA256C08, SHA256C09, SHA256C0a, SHA256C0b, - SHA256C0c, SHA256C0d, SHA256C0e, SHA256C0f, - SHA256C10, SHA256C11, SHA256C12, SHA256C13, - SHA256C14, SHA256C15, SHA256C16, SHA256C17, - SHA256C18, SHA256C19, SHA256C1a, SHA256C1b, - SHA256C1c, SHA256C1d, SHA256C1e, SHA256C1f, - SHA256C20, SHA256C21, SHA256C22, SHA256C23, - SHA256C24, SHA256C25, SHA256C26, SHA256C27, - SHA256C28, SHA256C29, SHA256C2a, SHA256C2b, - SHA256C2c, SHA256C2d, SHA256C2e, SHA256C2f, - SHA256C30, SHA256C31, SHA256C32, SHA256C33, - SHA256C34, SHA256C35, SHA256C36, SHA256C37, - SHA256C38, SHA256C39, SHA256C3a, SHA256C3b, - SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, -}; +#define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) -#define SHA256_STEP(F0a,F1a,a,b,c,d,e,f,g,h,x,K) \ -{ \ - h += K; \ - h += x; \ - h += S3 (e); \ - h += F1a (e,f,g); \ - d += h; \ - h += S2 (a); \ - h += F0a (a,b,c); \ +// Optimization: ILP optimized SHA256 STEP (T1/T2 formulation). Takes combined K+W (KW). +#define SHA256_STEP_OPT(F0a,F1a,a,b,c,d,e,f,g,h,KW) \ +{ \ + uint T1 = h + S3(e) + F1a(e,f,g) + KW; \ + uint T2 = S2(a) + F0a(a,b,c); \ + d += T1; \ + h = T1 + T2; \ } -#define SHA256_EXPAND(x,y,z,w) (S1 (x) + y + S0 (z) + w) +// Rounds 0-15: Combine K+W using immediate constants. +#define ROUND_0_15_IMM(a, b, c, d, e, f, g, h, w, K_IMM) \ +{ \ + SHA256_STEP_OPT(F0, F1, a, b, c, d, e, f, g, h, w + K_IMM); \ +} + +// Rounds 16-63: Interleaved expansion and compression using immediate constants. +// W_i_16 is updated in place (register reuse). +#define ROUND_16_63_IMM(a, b, c, d, e, f, g, h, W_i_16, W_i_15, W_i_7, W_i_2, K_IMM) \ +{ \ + W_i_16 = SHA256_EXPAND(W_i_2, W_i_7, W_i_15, W_i_16); \ + SHA256_STEP_OPT(F0, F1, a, b, c, d, e, f, g, h, W_i_16 + K_IMM); \ +} +// Maximally optimized SHA256 process function: Fully Unrolled, Interleaved, ILP Optimized, Immediate Constants. static void sha256_process2 (const unsigned int *W, unsigned int *digest) { unsigned int a = digest[0]; @@ -119,6 +119,7 @@ static void sha256_process2 (const unsigned int *W, unsigned int *digest) unsigned int g = digest[6]; unsigned int h = digest[7]; + // Load W into private registers unsigned int w0_t = W[0]; unsigned int w1_t = W[1]; unsigned int w2_t = W[2]; @@ -136,56 +137,82 @@ static void sha256_process2 (const unsigned int *W, unsigned int *digest) unsigned int we_t = W[14]; unsigned int wf_t = W[15]; - #define ROUND_EXPAND() \ - { \ - w0_t = SHA256_EXPAND (we_t, w9_t, w1_t, w0_t); \ - w1_t = SHA256_EXPAND (wf_t, wa_t, w2_t, w1_t); \ - w2_t = SHA256_EXPAND (w0_t, wb_t, w3_t, w2_t); \ - w3_t = SHA256_EXPAND (w1_t, wc_t, w4_t, w3_t); \ - w4_t = SHA256_EXPAND (w2_t, wd_t, w5_t, w4_t); \ - w5_t = SHA256_EXPAND (w3_t, we_t, w6_t, w5_t); \ - w6_t = SHA256_EXPAND (w4_t, wf_t, w7_t, w6_t); \ - w7_t = SHA256_EXPAND (w5_t, w0_t, w8_t, w7_t); \ - w8_t = SHA256_EXPAND (w6_t, w1_t, w9_t, w8_t); \ - w9_t = SHA256_EXPAND (w7_t, w2_t, wa_t, w9_t); \ - wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); \ - wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); \ - wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); \ - wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); \ - we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); \ - wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); \ - } - - #define ROUND_STEP(i) \ - { \ - SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w0_t, k_sha256[i + 0]); \ - SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w1_t, k_sha256[i + 1]); \ - SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, w2_t, k_sha256[i + 2]); \ - SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, w3_t, k_sha256[i + 3]); \ - SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, w4_t, k_sha256[i + 4]); \ - SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, w5_t, k_sha256[i + 5]); \ - SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, w6_t, k_sha256[i + 6]); \ - SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, w7_t, k_sha256[i + 7]); \ - SHA256_STEP (F0, F1, a, b, c, d, e, f, g, h, w8_t, k_sha256[i + 8]); \ - SHA256_STEP (F0, F1, h, a, b, c, d, e, f, g, w9_t, k_sha256[i + 9]); \ - SHA256_STEP (F0, F1, g, h, a, b, c, d, e, f, wa_t, k_sha256[i + 10]); \ - SHA256_STEP (F0, F1, f, g, h, a, b, c, d, e, wb_t, k_sha256[i + 11]); \ - SHA256_STEP (F0, F1, e, f, g, h, a, b, c, d, wc_t, k_sha256[i + 12]); \ - SHA256_STEP (F0, F1, d, e, f, g, h, a, b, c, wd_t, k_sha256[i + 13]); \ - SHA256_STEP (F0, F1, c, d, e, f, g, h, a, b, we_t, k_sha256[i + 14]); \ - SHA256_STEP (F0, F1, b, c, d, e, f, g, h, a, wf_t, k_sha256[i + 15]); \ - } - - ROUND_STEP (0); - - ROUND_EXPAND(); - ROUND_STEP(16); - - ROUND_EXPAND(); - ROUND_STEP(32); - - ROUND_EXPAND(); - ROUND_STEP(48); + // --- Fully Unrolled Rounds --- + + // Rounds 0-15 + ROUND_0_15_IMM(a, b, c, d, e, f, g, h, w0_t, SHA256C00); + ROUND_0_15_IMM(h, a, b, c, d, e, f, g, w1_t, SHA256C01); + ROUND_0_15_IMM(g, h, a, b, c, d, e, f, w2_t, SHA256C02); + ROUND_0_15_IMM(f, g, h, a, b, c, d, e, w3_t, SHA256C03); + ROUND_0_15_IMM(e, f, g, h, a, b, c, d, w4_t, SHA256C04); + ROUND_0_15_IMM(d, e, f, g, h, a, b, c, w5_t, SHA256C05); + ROUND_0_15_IMM(c, d, e, f, g, h, a, b, w6_t, SHA256C06); + ROUND_0_15_IMM(b, c, d, e, f, g, h, a, w7_t, SHA256C07); + ROUND_0_15_IMM(a, b, c, d, e, f, g, h, w8_t, SHA256C08); + ROUND_0_15_IMM(h, a, b, c, d, e, f, g, w9_t, SHA256C09); + ROUND_0_15_IMM(g, h, a, b, c, d, e, f, wa_t, SHA256C0a); + ROUND_0_15_IMM(f, g, h, a, b, c, d, e, wb_t, SHA256C0b); + ROUND_0_15_IMM(e, f, g, h, a, b, c, d, wc_t, SHA256C0c); + ROUND_0_15_IMM(d, e, f, g, h, a, b, c, wd_t, SHA256C0d); + ROUND_0_15_IMM(c, d, e, f, g, h, a, b, we_t, SHA256C0e); + ROUND_0_15_IMM(b, c, d, e, f, g, h, a, wf_t, SHA256C0f); + + // Rounds 16-31 (Interleaved) + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w0_t, w1_t, w9_t, we_t, SHA256C10); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w1_t, w2_t, wa_t, wf_t, SHA256C11); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, w2_t, w3_t, wb_t, w0_t, SHA256C12); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, w3_t, w4_t, wc_t, w1_t, SHA256C13); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, w4_t, w5_t, wd_t, w2_t, SHA256C14); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, w5_t, w6_t, we_t, w3_t, SHA256C15); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, w6_t, w7_t, wf_t, w4_t, SHA256C16); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, w7_t, w8_t, w0_t, w5_t, SHA256C17); + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w8_t, w9_t, w1_t, w6_t, SHA256C18); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w9_t, wa_t, w2_t, w7_t, SHA256C19); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, wa_t, wb_t, w3_t, w8_t, SHA256C1a); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, wb_t, wc_t, w4_t, w9_t, SHA256C1b); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, wc_t, wd_t, w5_t, wa_t, SHA256C1c); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, wd_t, we_t, w6_t, wb_t, SHA256C1d); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, we_t, wf_t, w7_t, wc_t, SHA256C1e); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, wf_t, w0_t, w8_t, wd_t, SHA256C1f); + + // Rounds 32-47 (Interleaved) + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w0_t, w1_t, w9_t, we_t, SHA256C20); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w1_t, w2_t, wa_t, wf_t, SHA256C21); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, w2_t, w3_t, wb_t, w0_t, SHA256C22); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, w3_t, w4_t, wc_t, w1_t, SHA256C23); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, w4_t, w5_t, wd_t, w2_t, SHA256C24); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, w5_t, w6_t, we_t, w3_t, SHA256C25); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, w6_t, w7_t, wf_t, w4_t, SHA256C26); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, w7_t, w8_t, w0_t, w5_t, SHA256C27); + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w8_t, w9_t, w1_t, w6_t, SHA256C28); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w9_t, wa_t, w2_t, w7_t, SHA256C29); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, wa_t, wb_t, w3_t, w8_t, SHA256C2a); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, wb_t, wc_t, w4_t, w9_t, SHA256C2b); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, wc_t, wd_t, w5_t, wa_t, SHA256C2c); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, wd_t, we_t, w6_t, wb_t, SHA256C2d); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, we_t, wf_t, w7_t, wc_t, SHA256C2e); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, wf_t, w0_t, w8_t, wd_t, SHA256C2f); + + // Rounds 48-63 (Interleaved) + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w0_t, w1_t, w9_t, we_t, SHA256C30); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w1_t, w2_t, wa_t, wf_t, SHA256C31); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, w2_t, w3_t, wb_t, w0_t, SHA256C32); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, w3_t, w4_t, wc_t, w1_t, SHA256C33); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, w4_t, w5_t, wd_t, w2_t, SHA256C34); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, w5_t, w6_t, we_t, w3_t, SHA256C35); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, w6_t, w7_t, wf_t, w4_t, SHA256C36); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, w7_t, w8_t, w0_t, w5_t, SHA256C37); + ROUND_16_63_IMM(a, b, c, d, e, f, g, h, w8_t, w9_t, w1_t, w6_t, SHA256C38); + ROUND_16_63_IMM(h, a, b, c, d, e, f, g, w9_t, wa_t, w2_t, w7_t, SHA256C39); + ROUND_16_63_IMM(g, h, a, b, c, d, e, f, wa_t, wb_t, w3_t, w8_t, SHA256C3a); + ROUND_16_63_IMM(f, g, h, a, b, c, d, e, wb_t, wc_t, w4_t, w9_t, SHA256C3b); + ROUND_16_63_IMM(e, f, g, h, a, b, c, d, wc_t, wd_t, w5_t, wa_t, SHA256C3c); + ROUND_16_63_IMM(d, e, f, g, h, a, b, c, wd_t, we_t, w6_t, wb_t, SHA256C3d); + ROUND_16_63_IMM(c, d, e, f, g, h, a, b, we_t, wf_t, w7_t, wc_t, SHA256C3e); + ROUND_16_63_IMM(b, c, d, e, f, g, h, a, wf_t, w0_t, w8_t, wd_t, SHA256C3f); + + #undef ROUND_0_15_IMM + #undef ROUND_16_63_IMM digest[0] += a; digest[1] += b; @@ -203,9 +230,12 @@ static void sha256_process2 (const unsigned int *W, unsigned int *digest) #undef S1 #undef S2 #undef S3 - #undef shr32 #undef rotl32 +#undef SHA256_STEP_OPT + +// --- Configuration injected from host (Placeholders) --- + // first 64 bytes of code cell (constant w.r.t salt), injected from host __constant uchar CODE_PREFIX[64] = { <> }; __constant uint CODE_STATE_BASE[8] = { <> }; @@ -236,9 +266,7 @@ __constant uchar CASE_ALT1[N_CASE_INSENSITIVE] = { <> }; __constant ushort CRC16_TABLE[256] = { <> }; __constant uchar STATEINIT_PREFIX_LENS[N_STATEINIT_VARIANTS] = { <> }; -__constant uchar STATEINIT_PREFIX_VARIANTS[N_STATEINIT_VARIANTS][STATEINIT_PREFIX_MAX_LEN] = { - <> -}; +// STATEINIT_PREFIX_VARIANTS is unused in the optimized kernel logic. #if N_ACTIVE > 0 __constant uchar PREFIX_POS[N_ACTIVE] = { <> }; @@ -247,7 +275,7 @@ __constant uchar PREFIX_POS[N_ACTIVE] = { <> }; __constant uchar PREFIX_POS_NOCRC[N_ACTIVE_NOCRC] = { <> }; #endif -// prepacked prefix contribution to message block words +// prepacked prefix contribution to message block words (Assumed zero-padded by host) __constant uint PREFIX_W[N_STATEINIT_VARIANTS][16] = { <> }; @@ -257,11 +285,13 @@ __constant uint SHA256_IV[8] = { 0x510e527fu, 0x9b05688cu, 0x1f83d9abu, 0x5be0cd19u }; -// Fast CRC16-CCITT using lookup table (poly 0x1021, init 0x0000) -inline ushort gen_crc16_fast(const uchar *data, int size) +// Optimization: Specialized and fully unrolled CRC16-CCITT for 34 bytes. +inline ushort gen_crc16_fast_34(const uchar *data) { ushort crc = 0; - for (int i = 0; i < size; i++) { + #pragma unroll + for (int i = 0; i < 34; i++) { + // The (& 0xff) is technically redundant if types are strictly respected, but ensures correctness. crc = (ushort)((crc << 8) ^ CRC16_TABLE[((crc >> 8) ^ data[i]) & 0xff]); } return crc; @@ -279,102 +309,136 @@ __kernel void hash_main( { uint idx = get_global_id(0); + // Allocate private memory (registers) uint W[16]; uint code_hash_state[8]; uint main_hash[8]; for (int iter = 0; iter < iterations; iter++) { - // derive 128-bit salt from base salt and (iter, idx) + // --- First Hash (Code Hash) --- + + // derive 128-bit salt uint s0 = salt0 ^ (uint)iter; uint s1 = salt1 ^ (uint)idx; uint s2 = salt2; uint s3 = salt3; + // Prepare message block W (BE). Using optimized SWAP. W[0] = SWAP(s0); W[1] = SWAP(s1); W[2] = SWAP(s2); W[3] = SWAP(s3); W[4] = 0x80000000u; - W[5] = 0u; - W[6] = 0u; - W[7] = 0u; - W[8] = 0u; - W[9] = 0u; - W[10] = 0u; - W[11] = 0u; - W[12] = 0u; - W[13] = 0u; - W[14] = 0u; - W[15] = 640u; // total bits of code cell (80 bytes) + #pragma unroll + for (int i = 5; i < 15; i++) { + W[i] = 0u; + } + W[15] = 640u; // 80 bytes * 8 bits/byte + // Calculate code_hash_state #pragma unroll for (int i = 0; i < 8; i++) { code_hash_state[i] = CODE_STATE_BASE[i]; } sha256_process2(W, code_hash_state); - #pragma unroll - for (int i = 0; i < 8; i++) { - code_hash_state[i] = SWAP(code_hash_state[i]); - } + // code_hash_state is now in Big Endian format. - uchar *ch = (uchar *)code_hash_state; + // --- Second Hash (Main Hash) and Checks --- - #if N_STATEINIT_VARIANTS == 5 + // Aggressively unroll the variant loop if count is small (e.g. <= 16). + #if N_STATEINIT_VARIANTS > 0 && N_STATEINIT_VARIANTS <= 16 #pragma unroll #endif for (int v = 0; v < N_STATEINIT_VARIANTS; v++) { uchar prefix_len = STATEINIT_PREFIX_LENS[v]; - const int main_len = (int)prefix_len + 32; // bytes, always < 56 here + const int main_len = (int)prefix_len + 32; + // Initialize W (BE) from pre-packed prefix words (BE) #pragma unroll for (int i = 0; i < 16; i++) { W[i] = PREFIX_W[v][i]; } - // insert code hash bytes into pre-packed prefix words - for (int j = 0; j < 32; j++) { - int idx_b = (int)prefix_len + j; - int w = idx_b >> 2; - int shift = 24 - ((idx_b & 3) * 8); - W[w] |= (uint)ch[j] << shift; + // Optimization: Word-level Message Block Packing (Funnel Shifting). + // Insert code_hash_state (BE) into W (BE). + const uint alignment = prefix_len & 3; + const int start_w = prefix_len >> 2; + + if (alignment == 0) { + // Aligned insertion + #pragma unroll + for (int i = 0; i < 8; i++) { + // Use |= assuming PREFIX_W is zero-padded correctly. + W[start_w + i] |= code_hash_state[i]; + } + } else { + // Unaligned insertion (Right funnel shift for BE data). + const uint shift_r = alignment << 3; // alignment * 8 + const uint shift_l = 32 - shift_r; + + uint prev = 0; + #pragma unroll + for (int i = 0; i < 8; i++) { + uint current = code_hash_state[i]; + // Funnel shift: merge prev (high part/spillover) and current (low part). + W[start_w + i] |= (current >> shift_r) | prev; + // Calculate spillover for the next word. + prev = current << shift_l; + } + // Handle the final spillover word. (Safe because main_len < 56). + W[start_w + 8] |= prev; } - // padding bit + // Add padding bit int pad_w = main_len >> 2; - int pad_shift = 24 - ((main_len & 3) * 8); + int pad_shift = 24 - ((main_len & 3) << 3); // Optimized shift calculation W[pad_w] |= (uint)0x80 << pad_shift; - // length in bits - W[15] = (uint)main_len * 8u; + // Add length in bits (BE). Optimized multiplication. + W[15] = (uint)main_len << 3; + // Calculate main_hash #pragma unroll for (int i = 0; i < 8; i++) { main_hash[i] = SHA256_IV[i]; } sha256_process2(W, main_hash); + + // Swap main_hash (BE) to LE for efficient byte extraction during checks. #pragma unroll for (int i = 0; i < 8; i++) { main_hash[i] = SWAP(main_hash[i]); } - // --- OPTIMIZATION START --- - uchar *mhb = (uchar *)main_hash; + // --- Constraint Checking --- int ok = 1; - // rewrite first hash byte with FREE_HASH_MASK/FREE_HASH_VAL - uchar hash0 = (uchar)((mhb[0] & (~FREE_HASH_MASK)) | (FREE_HASH_VAL & FREE_HASH_MASK)); + // Calculate hash0 (modified first byte H0). H0 is LSB of main_hash[0] (LE). + uchar H0 = (uchar)(main_hash[0] & 0xFF); + uchar hash0 = (uchar)((H0 & (~FREE_HASH_MASK)) | (FREE_HASH_VAL & FREE_HASH_MASK)); - // early check on non-CRC constrained bytes + // Early check on non-CRC constrained bytes #if N_ACTIVE_NOCRC > 0 + // Optimization: Use explicit byte extraction and rigorously preserve original logic. if (ok) { #pragma unroll for (int j = 0; j < N_ACTIVE_NOCRC; j++) { int i = PREFIX_POS_NOCRC[j]; - uchar val = 0; + uchar val; + + // Replicating the original logic: + // val = (i >= 3 && i < 34) ? mhb[i - 2] : (uchar)0; (if i!=2 and 0<=i<36) if (i == 2) { val = hash0; + } else if (i >= 3 && i < 34) { + // H1..H31. + val = GET_BYTE_LE_ARRAY(main_hash, i - 2); } else if (i >= 0 && i < 36) { - val = (i >= 3 && i < 34) ? mhb[i - 2] : (uchar)0; + // i=0, 1, 34, 35. Original logic explicitly used 0 here. + val = 0; + } else { + val = 0; // Fallback for out of bounds } + if ((val & PREFIX_MASK[i]) != PREFIX_VAL[i]) { ok = 0; break; @@ -384,24 +448,29 @@ __kernel void hash_main( #endif if (ok) { + // Construct the full representation array uchar repr[36]; repr[0] = (uchar)FLAGS_HI; repr[1] = (uchar)FLAGS_LO; repr[2] = hash0; + // Copy remaining hash bytes (H1..H31) using optimized extraction #pragma unroll for (int k = 1; k < 32; k++) { - repr[2 + k] = mhb[k]; + repr[2 + k] = GET_BYTE_LE_ARRAY(main_hash, k); } + // Initialize CRC bytes (required if HAS_CRC_CONSTRAINT is false but N_CASE_INSENSITIVE uses these bytes) repr[34] = 0; repr[35] = 0; if (HAS_CRC_CONSTRAINT) { - ushort crc = gen_crc16_fast((uchar *)repr, 34); + // Use optimized, unrolled CRC function. + ushort crc = gen_crc16_fast_34((uchar *)repr); repr[34] = (uchar)(crc >> 8); repr[35] = (uchar)(crc & 0xffu); + // Check CRC-dependent constraints #if N_ACTIVE > 0 #pragma unroll for (int j = 0; j < N_ACTIVE; j++) { @@ -414,15 +483,23 @@ __kernel void hash_main( #endif } + // Check case-insensitivity constraints #if N_CASE_INSENSITIVE > 0 if (ok) { #pragma unroll for (int j = 0; j < N_CASE_INSENSITIVE; j++) { ushort bit = CASE_BITPOS[j]; - int byte = (int)(bit >> 3); + int byte_idx = (int)(bit >> 3); int bit_in_byte = 7 - (int)(bit & 7); - ushort comb = ((ushort)repr[byte] << 8) | (ushort)((byte + 1 < 36) ? repr[byte + 1] : 0); + + // Read 16 bits safely across boundary + uchar byte0 = repr[byte_idx]; + uchar byte1 = (byte_idx + 1 < 36) ? repr[byte_idx + 1] : 0; + + ushort comb = ((ushort)byte0 << 8) | (ushort)byte1; + // Extract the 6-bit value uchar val6 = (uchar)((comb >> (bit_in_byte + 3)) & 0x3fu); + if ((val6 != CASE_ALT0[j]) && (val6 != CASE_ALT1[j])) { ok = 0; break; @@ -431,6 +508,7 @@ __kernel void hash_main( } #endif + // Report result if all checks pass if (ok) { uint slot = atomic_inc(found_counter); if (slot < 1024u) { @@ -440,7 +518,6 @@ __kernel void hash_main( } } } - // --- OPTIMIZATION END --- } } } diff --git a/tests/Benchmark.spec.ts b/tests/Benchmark.spec.ts index 02f0d8e..d41ae20 100644 --- a/tests/Benchmark.spec.ts +++ b/tests/Benchmark.spec.ts @@ -113,7 +113,7 @@ const chooseBenchCases = (names: string[]): BenchCase[] => { { name: 'start 5 cs', start: 'WERTY', caseSensitive: true }, { name: 'start 5 ci', start: 'WeRtY', caseSensitive: false }, { name: 'end 4 cs', end: 'WERT', caseSensitive: true }, - { name: 'end 4 ci', end: 'WeRt', caseSensitive: false }, + { name: 'end 5 ci', end: 'WeRtY', caseSensitive: false }, ]; const lower = names.join(' ').toLowerCase(); const isRTX3Plus = /rtx\s*(3|4|5)\d{2,3}/i.test(lower); @@ -144,7 +144,7 @@ const benchCases: BenchCase[] = (() => { { name: 'start 5 cs', start: 'WERTY', caseSensitive: true }, { name: 'start 5 ci', start: 'WeRtY', caseSensitive: false }, { name: 'end 4 cs', end: 'WERT', caseSensitive: true }, - { name: 'end 4 ci', end: 'WeRt', caseSensitive: false }, + { name: 'end 5 ci', end: 'WeRtY', caseSensitive: false }, ]; })(); const deviceNames = new Set(); diff --git a/tests/benchmarks.png b/tests/benchmarks.png index d40b36f..87d6e53 100644 Binary files a/tests/benchmarks.png and b/tests/benchmarks.png differ diff --git a/tests/results.json b/tests/results.json index a16c42c..a975fb1 100644 --- a/tests/results.json +++ b/tests/results.json @@ -67,6 +67,40 @@ "timedOut": true } ] + }, + { + "title": "PR #25", + "timestamp": 1764882858.415, + "cases": [ + { + "name": "start 5 cs", + "hits": 1395, + "seconds": 18.928040981292725, + "rate": 73.64734688485413, + "timedOut": true + }, + { + "name": "start 5 ci", + "hits": 5298, + "seconds": 18.87300205230713, + "rate": 280.6654704598238, + "timedOut": true + }, + { + "name": "end 4 cs", + "hits": 1594, + "seconds": 19.079343795776367, + "rate": 83.49343756532369, + "timedOut": true + }, + { + "name": "end 5 ci", + "hits": 738, + "seconds": 19.47322678565979, + "rate": 37.84683494482443, + "timedOut": true + } + ] } ], "NVIDIA GeForce RTX 4090": [ @@ -78,7 +112,7 @@ "name": "end 5 ci", "hits": 30, "seconds": 30, - "rate": 1.0, + "rate": 1, "timedOut": true }, { @@ -92,7 +126,7 @@ "name": "start 5 ci", "hits": 30, "seconds": 30, - "rate": 1.0, + "rate": 1, "timedOut": true }, { @@ -137,6 +171,40 @@ "timedOut": true } ] + }, + { + "title": "PR #25", + "timestamp": 1764885173.198, + "cases": [ + { + "name": "start 6 cs", + "hits": 283, + "seconds": 19.482248783111572, + "rate": 14.474715067002695, + "timedOut": true + }, + { + "name": "start 6 ci", + "hits": 2290, + "seconds": 19.563318490982056, + "rate": 117.0046892123717, + "timedOut": true + }, + { + "name": "end 5 cs", + "hits": 242, + "seconds": 18.74107551574707, + "rate": 12.85945407975659, + "timedOut": true + }, + { + "name": "end 5 ci", + "hits": 7166, + "seconds": 18.747441053390503, + "rate": 382.18549292113653, + "timedOut": true + } + ] } ] }