From 0f438113e5d5e41817b8acb5ab172b69ecdbec6f Mon Sep 17 00:00:00 2001
From: shoredatalabs
Date: Wed, 13 May 2026 09:54:25 -0700
Subject: [PATCH 1/4] Add to the README

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 44e0723..d0de913 100644
--- a/README.md
+++ b/README.md
@@ -91,3 +91,4 @@ scripts/
 ```
 
 Each round's `data/` directory is generated locally and gitignored.
+This is 's PR

From ded3b43b35873c1236081e8a1fb880a9979bd52c Mon Sep 17 00:00:00 2001
From: Ibrahim Shore
Date: Wed, 13 May 2026 11:32:54 -0700
Subject: [PATCH 2/4] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/README.md b/README.md
index d0de913..44e0723 100644
--- a/README.md
+++ b/README.md
@@ -91,4 +91,3 @@ scripts/
 ```
 
 Each round's `data/` directory is generated locally and gitignored.
-This is 's PR

From d5fb559b5b898a8e2b1c0badb1150e2b07707cd0 Mon Sep 17 00:00:00 2001
From: shoredatalabs
Date: Wed, 13 May 2026 11:45:49 -0700
Subject: [PATCH 3/4] ShoreDataLabs performance improvements

---
 rounds/1_histogram/solution.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index dffbee5..9e5cd7f 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -5,10 +5,20 @@
 own faster implementation.
 """
 
+import numpy as np
+
 
 def compute_histogram(path: str) -> dict[bytes, int]:
     """Frequency of every 2-byte bigram in the file at ``path``."""
-    # TODO: remove this delegation and write your own implementation here.
-    from .baseline import compute_histogram as _baseline
+    with open(path, "rb") as f:
+        data = f.read()
+
+    arr = np.frombuffer(data, dtype=np.uint8)
+    # Encode each bigram as a uint16 index: high_byte * 256 + low_byte
+    indices = arr[:-1].astype(np.uint16) * 256 + arr[1:]
+    counts = np.bincount(indices, minlength=65536)
 
-    return _baseline(path)
+    result: dict[bytes, int] = {}
+    for idx in np.nonzero(counts)[0]:
+        result[bytes([idx >> 8, idx & 0xFF])] = int(counts[idx])
+    return result

From 6a20f62e0376fbff8d49b4c308bfd85e79e012ec Mon Sep 17 00:00:00 2001
From: shoredatalabs
Date: Wed, 13 May 2026 12:23:00 -0700
Subject: [PATCH 4/4] improve performance of corruption

---
 rounds/2_corruption/solution.py | 23 ++++++++++++++++++++---
 1 file changed, 20 insertions(+), 3 deletions(-)

diff --git a/rounds/2_corruption/solution.py b/rounds/2_corruption/solution.py
index a5b752a..8934d23 100644
--- a/rounds/2_corruption/solution.py
+++ b/rounds/2_corruption/solution.py
@@ -5,10 +5,27 @@
 own faster implementation.
 """
 
-from .baseline import find_corruptions as _baseline
+import numpy as np
 
 
 def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
     """Return ``[(offset, length), ...]`` for every differing byte range."""
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(ref_path, cor_path)
+    ref = np.fromfile(ref_path, dtype=np.uint8)
+    cor = np.fromfile(cor_path, dtype=np.uint8)
+
+    if len(ref) != len(cor):
+        raise ValueError("reference and corrupted files differ in length")
+
+    # Single vectorised comparison — runs entirely in C.
+    diff_indices = np.where(ref != cor)[0]
+
+    if len(diff_indices) == 0:
+        return []
+
+    # Find the boundaries between consecutive runs.
+    # A new run starts wherever the gap between adjacent indices exceeds 1.
+    gaps = np.where(np.diff(diff_indices) > 1)[0]
+    starts = diff_indices[np.concatenate(([0], gaps + 1))]
+    ends = diff_indices[np.concatenate((gaps, [len(diff_indices) - 1]))]
+
+    return [(int(s), int(e - s + 1)) for s, e in zip(starts, ends)]