# Post-change sources of two solution modules, reconstructed from the patch:
#   * rounds/1_histogram/solution.py -> compute_histogram
#   * rounds/3_dna/solution.py       -> find_match, find_matches

from collections import Counter
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor
import os
import re
import string
from typing import Optional

# ---------------------------------------------------------------------------
# rounds/1_histogram/solution.py
# ---------------------------------------------------------------------------

# Number of bytes requested from the file per read.
CHUNK_SIZE = 8 * 1024 * 1024


def compute_histogram(
    path: str, *, chunk_size: int = CHUNK_SIZE
) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Bigrams overlap: a file of ``n`` bytes contributes ``n - 1`` bigrams, so
    an empty or one-byte file yields an empty dict.

    Args:
        path: File to scan; it is opened in binary mode.
        chunk_size: Bytes to read per iteration. The file is streamed in
            chunks of this size instead of being loaded in one piece.

    Returns:
        Mapping from each bigram that occurs (as a 2-byte ``bytes`` object)
        to its number of occurrences, in ascending bigram order.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")

    pair_counts: Counter = Counter()
    last_byte: Optional[int] = None  # final byte of the previous chunk
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            if last_byte is not None:
                # The bigram that straddles two consecutive reads.
                pair_counts[(last_byte, chunk[0])] += 1
            # memoryview gives the one-byte-shifted view without copying.
            pair_counts.update(zip(chunk, memoryview(chunk)[1:]))
            last_byte = chunk[-1]

    # Sorting the (first, second) keys keeps the ascending bigram order.
    return {bytes(pair): count for pair, count in sorted(pair_counts.items())}


# ---------------------------------------------------------------------------
# rounds/3_dna/solution.py
# ---------------------------------------------------------------------------

# Identity translation table; ``bytes.translate`` is used only for deletion.
table = bytes.maketrans(b"", b"")

# Bytes removed from the sequence body so it becomes one contiguous string.
_WHITESPACE = string.whitespace.encode()


def find_match(args: tuple[bytes, bytes]) -> Optional[tuple[str, list[int]]]:
    """Find every occurrence of a pattern inside one FASTA record.

    Args:
        args: ``(pattern, record)``. ``record`` is the text of one record
            with the leading ``>`` already removed, i.e. it looks like
            ``b"<id>\\n<line>\\n<line>\\n..."``. The first line is the id;
            the remaining lines are joined into one sequence by deleting
            all ASCII whitespace.

    Returns:
        ``(record_id, positions)`` where ``positions`` holds the start
        offset of every (possibly overlapping) match in the joined
        sequence, or ``None`` when the pattern does not occur.

    Raises:
        UnicodeDecodeError: If a matching record's id is not ASCII.
    """
    pattern_str, record = args

    # ``partition`` never raises: a header-only record (no newline) simply
    # has an empty sequence instead of triggering an IndexError.
    header, _, sequence_raw = record.partition(b"\n")
    record_id = header.strip()
    sequence = sequence_raw.translate(table, delete=_WHITESPACE)

    positions: list[int] = []
    start = 0
    while (pos := sequence.find(pattern_str, start)) != -1:
        positions.append(pos)
        start = pos + 1  # advance one byte so overlapping hits are found

    if not positions:
        return None
    return record_id.decode("ascii"), positions


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Search every record of a FASTA file for ``pattern``.

    The file is read as bytes and split on ``b">"``; pieces that are empty
    or whitespace-only are skipped, and every other piece is handed to
    :func:`find_match`.

    Args:
        fasta_path: Path of the FASTA file.
        pattern: Byte string to look for in each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, containing
        only the records with at least one match.
    """
    with open(fasta_path, "rb") as f:
        data = f.read()

    records = [piece for piece in data.split(b">") if piece.strip()]

    # NOTE(review): the per-record work is CPU-bound (``translate`` and
    # ``find``), so on a GIL build the threads presumably add little;
    # measure before relying on this pool for speed.
    with ThreadPoolExecutor() as ex:
        # ``map`` yields results in submission order, i.e. file order.
        results = ex.map(find_match, ((pattern, piece) for piece in records))
        return [hit for hit in results if hit is not None]