"""Round 3 solution — DNA sequence matcher."""

# NOTE(review): recovered from the added ("+") side of a whitespace-collapsed
# PR diff of rounds/3_dna/solution.py. The diff's README.md hunk and the
# commented-out draft code are not Python and are not reproduced here.


def _find_all(sequence: bytes, pattern: bytes) -> list[int]:
    """Return the ascending start offsets of every occurrence of ``pattern``.

    Occurrences may overlap: the search resumes one byte after each hit
    rather than after the end of the hit.
    """
    offsets: list[int] = []
    find = sequence.find
    at = find(pattern)
    while at != -1:
        offsets.append(at)
        at = find(pattern, at + 1)
    return offsets


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read as bytes in a single call. A record starts at a ``>``
    that is the first byte of a line; the remainder of that line is the
    record id, and every following line up to the next record is joined
    into one unwrapped sequence. LF and CRLF line endings are both accepted.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Byte string to search for in each unwrapped sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order. ``positions``
        holds the 0-based offsets, ascending, of every (possibly overlapping)
        occurrence of ``pattern``. Records without an occurrence are omitted.
        An empty ``pattern`` yields ``[]`` without opening the file.

    Raises:
        OSError: If ``fasta_path`` cannot be opened or read.
        UnicodeDecodeError: If the header of a matching record is not ASCII.
    """
    if not pattern:
        return []

    with open(fasta_path, "rb") as handle:
        data = handle.read()

    # Only a ">" at the start of a line opens a record; a ">" inside a header
    # description must not split it. The prepended newline makes the first
    # record look like all the others, and anything before it is discarded.
    chunks = (b"\n" + data).split(b"\n>")[1:]

    matches: list[tuple[str, list[int]]] = []
    for chunk in chunks:
        header, _, wrapped = chunk.partition(b"\n")
        # Drop line breaks (LF or CRLF) in one pass so that offsets refer to
        # the unwrapped sequence.
        sequence = wrapped.translate(None, delete=b"\r\n")
        positions = _find_all(sequence, pattern)
        if positions:
            matches.append((header.rstrip(b"\r").decode("ascii"), positions))

    return matches