diff --git a/README.md b/README.md index 44e0723..133070f 100644 Binary files a/README.md and b/README.md differ diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index dffbee5..d1c878d 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -5,10 +5,29 @@ own faster implementation. """ +from array import array + def compute_histogram(path: str) -> dict[bytes, int]: - """Frequency of every 2-byte bigram in the file at ``path``.""" - # TODO: remove this delegation and write your own implementation here. - from .baseline import compute_histogram as _baseline + with open(path, "rb") as f: + data = f.read() + + n = len(data) + if n < 2: + return {} + + # 65,536 possible 2-byte combinations + counts = array("I", [0]) * 65536 + + prev = data[0] + + for i in range(1, n): + curr = data[i] + counts[(prev << 8) | curr] += 1 + prev = curr - return _baseline(path) + return { + i.to_bytes(2, "big"): count + for i, count in enumerate(counts) + if count + } \ No newline at end of file diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..431d2ef 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -5,7 +5,23 @@ own faster implementation. """ -from .baseline import find_matches as _baseline +from __future__ import annotations + +def _find_positions(sequence: bytes, pattern: bytes) -> list[int]: + positions = [] + start = 0 + find = sequence.find + + while True: + pos = find(pattern, start) + + if pos == -1: + break + + positions.append(pos) + start = pos + 1 + + return positions def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -13,5 +29,39 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # TODO: remove this delegation and write your own implementation here. - return _baseline(fasta_path, pattern) + matches: list[tuple[str, list[int]]] = [] + + with open(fasta_path, "rb") as f: + record_id = None + seq_parts = [] + + for line in f: + + if line.startswith(b">"): + + # process previous record + if record_id is not None: + sequence = b"".join(seq_parts) + + positions = _find_positions(sequence, pattern) + + if positions: + matches.append((record_id, positions)) + + # begin new FASTA record + record_id = line[1:].strip().decode("ascii") + seq_parts = [] + + else: + seq_parts.append(line.strip()) + + # process final record + if record_id is not None: + sequence = b"".join(seq_parts) + + positions = _find_positions(sequence, pattern) + + if positions: + matches.append((record_id, positions)) + + return matches