# README.md hunk (@@ -91,3 +91,4 @@) adds one line after the sentence about
# each round's gitignored `data/` directory:
#     This is Tim Schilling's PR
#
# rounds/1_histogram/solution.py
from collections import Counter
from struct import unpack  # noqa: F401 - no longer used below; import kept as-is


def get_biagrams(data):
    """Yield every overlapping 2-byte window of ``data``, left to right.

    For ``b"ABCD"`` this yields ``b"AB"``, ``b"BC"`` and then ``b"CD"``.
    Inputs shorter than two bytes yield nothing.

    Args:
        data: A bytes-like object (``bytes``, ``bytearray``, ``memoryview``).

    Yields:
        ``bytes`` objects of length 2.
    """
    if not isinstance(data, bytes):
        # Normalise once so every yielded window is an immutable, hashable
        # ``bytes`` object that can be used as a dictionary key.
        data = bytes(data)
    # ``range`` is empty when fewer than two bytes are available, so short
    # inputs finish cleanly instead of raising from inside the generator.
    for start in range(len(data) - 1):
        yield data[start:start + 2]


def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    The whole file is read into memory and a 2-byte window is slid across
    it one byte at a time, so bigrams overlap.

    Args:
        path: Path of the file to read; it is opened in binary mode.

    Returns:
        A ``collections.Counter`` (a ``dict`` subclass) mapping each bigram
        that occurs to its number of occurrences. Files shorter than two
        bytes produce an empty mapping.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        data = f.read()
    return Counter(get_biagrams(data))
""" +import os +from concurrent.futures import ThreadPoolExecutor -from .baseline import find_matches as _baseline + +def _find_record_matches(pattern, sequence): + positions: list[int] = [] + start = 0 + while True: + pos = sequence.find(pattern, start) + if pos == -1: + break + positions.append(pos) + start = pos + 1 + return positions + + +def _search_chunk(fasta_path, chunk_start, chunk_end, pattern): + with open(fasta_path, "rb") as f: + f.seek(chunk_start) + if chunk_end is None: + text = f.read() + else: + # One bulk read for the chunk, then a few readline() calls to + # complete the last record that extends past our boundary. + # Collect parts in a list to avoid O(n²) bytes concatenation. + parts = [f.read(chunk_end - chunk_start)] + while True: + line = f.readline() + if not line or line.startswith(b">"): + break + parts.append(line) + text = b"".join(parts) + + # For chunks that don't start at byte 0, skip the partial-record fragment + # at the front (bytes belonging to the previous chunk's last record). + if chunk_start > 0: + if not text.startswith(b">"): + idx = text.find(b"\n>") + if idx == -1: + return [] + text = text[idx + 1:] # keep the ">" + + results = [] + for record in text.split(b">"): + if not record.strip(): + continue + lines = record.split(b"\n") + record_id = lines[0].strip().decode("ascii") + sequence = b"".join(lines[1:]).replace(b" ", b"") + positions = _find_record_matches(pattern, sequence) + if positions: + results.append((record_id, positions)) + return results def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: @@ -13,5 +64,22 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ - # TODO: remove this delegation and write your own implementation here. 
- return _baseline(fasta_path, pattern) + num_threads = os.cpu_count() or 4 + file_size = os.path.getsize(fasta_path) + chunk_size = max(1, file_size // num_threads) + + chunks = [ + (i * chunk_size, (i + 1) * chunk_size if i < num_threads - 1 else None) + for i in range(num_threads) + ] + + with ThreadPoolExecutor(max_workers=num_threads) as executor: + futures = [ + executor.submit(_search_chunk, fasta_path, start, end, pattern) + for start, end in chunks + ] + + results = [] + for future in futures: + results.extend(future.result()) + return results