diff --git a/.gitignore b/.gitignore
index 32c32e0..8ec97df 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,6 @@ __pycache__/
 
 # CodSpeed
 .codspeed/
+
+# Mine
+codes
diff --git a/.python-version b/.python-version
index d5629d4..6324d40 100644
--- a/.python-version
+++ b/.python-version
@@ -1 +1 @@
-3.15t
+3.14
diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py
index dffbee5..e1e67b0 100644
--- a/rounds/1_histogram/solution.py
+++ b/rounds/1_histogram/solution.py
@@ -4,12 +4,27 @@
 passes out of the box.
 
 Replace the body of ``compute_histogram`` with your own faster implementation.
 """
 
 
 def compute_histogram(path: str) -> dict[bytes, int]:
     """Frequency of every 2-byte bigram in the file at ``path``."""
-    # TODO: remove this delegation and write your own implementation here.
-    from .baseline import compute_histogram as _baseline
+    # Step 1: read the whole file into memory as a single bytes object.
+    with open(path, "rb") as f:
+        data = f.read()
 
-    return _baseline(path)
+    # Step 2: count bigrams in a 256x256 table indexed by the two byte
+    # values; indexing a bytes object yields ints, so the hot loop does
+    # no per-window bytes allocation.
+    counts = [[0] * 256 for _ in range(256)]
+    for i in range(len(data) - 1):
+        a, b = data[i], data[i + 1]
+        counts[a][b] += 1
+
+    # Step 3: convert the non-zero buckets into the required mapping.
+    result = {}
+    for i, row in enumerate(counts):
+        for j, count in enumerate(row):
+            if count > 0:
+                result[bytes([i, j])] = count
+    return result
diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py
index 8b917da..f7c4025 100644
--- a/rounds/3_dna/solution.py
+++ b/rounds/3_dna/solution.py
@@ -6,6 +6,7 @@
 """
 
 from .baseline import find_matches as _baseline
+from threading import Thread
 
 
 def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
@@ -13,5 +14,52 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]
 
     Returns ``[(record_id, [positions...]), ...]`` in file order.
     """
-    # TODO: remove this delegation and write your own implementation here.
-    return _baseline(fasta_path, pattern)
+    # Step 1: read the whole FASTA file as raw bytes; the pattern is
+    # matched as bytes too, so nothing is decoded except record ids.
+    with open(fasta_path, "rb") as f:
+        text = f.read()
+
+    # Step 2: split on '>' to peel off one record at a time. The chunk
+    # before the first header is empty for well-formed files and is
+    # skipped by the ``.strip()`` guard.
+    sequences = []
+    for record in text.split(b">"):
+        if not record.strip():
+            continue
+
+        # A record looks like ``id\nseq line\nseq line...``: the id is
+        # the first line; the rest joins into one contiguous sequence.
+        lines = record.split(b"\n")
+        record_id = lines[0].strip().decode("ascii")
+        sequence = b"".join(lines[1:]).replace(b" ", b"").replace(b"\r", b"")
+        sequences.append((record_id, sequence))
+
+    # Step 3: scan records concurrently, but write each result into a
+    # slot reserved per record so the output stays in file order.
+    results: list[tuple[str, list[int]] | None] = [None] * len(sequences)
+    threads = []
+    for idx, (record_id, sequence) in enumerate(sequences):
+        thread = Thread(
+            target=_match_record, args=(idx, record_id, sequence, pattern, results)
+        )
+        thread.start()
+        threads.append(thread)
+    for thread in threads:
+        thread.join()
+
+    return [entry for entry in results if entry is not None]
+
+
+def _match_record(idx, record_id, sequence, pattern, results):
+    """Record every match position of ``pattern`` in ``sequence``."""
+    positions: list[int] = []
+    start = 0
+    while True:
+        pos = sequence.find(pattern, start)
+        if pos == -1:
+            break
+        positions.append(pos)
+        start = pos + 1  # overlapping matches count
+
+    if positions:
+        results[idx] = (record_id, positions)