import numpy as np

# Number of distinct 2-byte values (256 * 256); one histogram bin per bigram.
_BIGRAM_SPACE = 1 << 16

# Default read size for compute_histogram. The temporaries built per chunk
# are a small multiple of this, so memory stays bounded for any file size.
_DEFAULT_CHUNK_SIZE = 1 << 24  # 16 MiB


# --- rounds/1_histogram/solution.py ---------------------------------------


def compute_histogram(
    path: str, *, chunk_size: int = _DEFAULT_CHUNK_SIZE
) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Bigrams overlap: a file of ``n`` bytes contributes ``n - 1`` bigrams, so
    an empty or single-byte file yields an empty dict.

    The file is read in binary mode in pieces of at most ``chunk_size``
    bytes, so peak memory is bounded by the chunk size rather than the file
    size. The bigram that straddles two consecutive chunks is counted
    separately, so the result does not depend on ``chunk_size``.

    Args:
        path: Path of the file to scan.
        chunk_size: Maximum number of bytes read per iteration (>= 1).

    Returns:
        Mapping from each bigram that occurs (as a 2-byte ``bytes`` object)
        to its occurrence count, in ascending bigram order. Bigrams that
        never occur are omitted.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
        OSError: If the file cannot be opened or read.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    counts = np.zeros(_BIGRAM_SPACE, dtype=np.int64)
    last_byte = None  # final byte of the previous chunk, if any

    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            if last_byte is not None:
                # Bigram spanning the boundary between two chunks.
                counts[(last_byte << 8) | chunk[0]] += 1

            arr = np.frombuffer(chunk, dtype=np.uint8)
            if arr.size > 1:
                # Encode each bigram as high_byte * 256 + low_byte. The
                # maximum is 255 * 256 + 255 = 65535, which fits in uint16.
                codes = arr[:-1].astype(np.uint16)
                codes <<= 8
                codes |= arr[1:]
                counts += np.bincount(codes, minlength=_BIGRAM_SPACE)

            last_byte = chunk[-1]

    present = np.flatnonzero(counts)
    # .tolist() yields plain Python ints for both the keys and the values.
    return {
        bytes((code >> 8, code & 0xFF)): count
        for code, count in zip(present.tolist(), counts[present].tolist())
    }


# --- rounds/2_corruption/solution.py --------------------------------------


def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
    """Return ``[(offset, length), ...]`` for every differing byte range.

    Both files are loaded whole and compared byte by byte. Each maximal run
    of consecutive differing bytes is reported once, as the offset of its
    first byte and the number of bytes in the run, in ascending offset
    order.

    Args:
        ref_path: Path of the reference file.
        cor_path: Path of the possibly corrupted file.

    Returns:
        List of ``(offset, length)`` tuples of Python ints; empty when the
        two files are identical.

    Raises:
        ValueError: If the two files differ in length.
    """
    ref = np.fromfile(ref_path, dtype=np.uint8)
    cor = np.fromfile(cor_path, dtype=np.uint8)

    if ref.size != cor.size:
        raise ValueError("reference and corrupted files differ in length")

    # Offsets of every differing byte, found with one vectorised comparison.
    diff_offsets = np.flatnonzero(ref != cor)
    if diff_offsets.size == 0:
        return []

    # A run ends wherever the next differing offset is not adjacent;
    # `breaks` holds the positions (within diff_offsets) of those run ends.
    breaks = np.flatnonzero(np.diff(diff_offsets) > 1)
    starts = diff_offsets[np.concatenate(([0], breaks + 1))]
    ends = diff_offsets[np.concatenate((breaks, [diff_offsets.size - 1]))]
    lengths = ends - starts + 1

    return list(zip(starts.tolist(), lengths.tolist()))