Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@ scripts/
```

Each round's `data/` directory is generated locally and gitignored.
This is Fawn Faine's PR
90 changes: 82 additions & 8 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,91 @@
"""Your Round 3 solution — DNA sequence matcher.

**Edit this file.** It currently delegates to ``baseline.py`` so everything
passes out of the box. Replace the body of ``find_matches`` with your
own faster implementation.
"""
#**Edit this file.** It currently delegates to ``baseline.py`` so everything
#passes out of the box. Replace the body of ``find_matches`` with your
#own faster implementation.


#import numpy as np
#import threading

#def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:

# Returns ``[(record_id, [positions...]), ...]`` in file order.
# """
# # Step 1: read the whole FASTA file as text and decode the pattern so the
# # search below can use a single ``str`` API.
# pattern_str = pattern.decode("ascii")


# data = np.loadtxt(fasta_path, dtype=str, delimiter="/n")
# data =

# data = {s.split(delimiter)[0].strip(): s.split('>')[1].strip() for s in sequencet}

# positions: list[int] = []
# data = np.array(final_list)

# mask = (data == pattern)
# count = np.count_nonzero(mask)


#from __future__ import annotations

#"""Fast Round 3 solution: DNA sequence matcher."""



from __future__ import annotations

import numpy as np
import os
from concurrent.futures import ThreadPoolExecutor

_NEWLINE = b"\n"

from .baseline import find_matches as _baseline


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    The file is read in one go as bytes. Each record's wrapped sequence lines
    are joined, and the pattern is compared against every window of the
    sequence with vectorised NumPy byte comparisons, so overlapping
    occurrences are all reported.

    Assumptions about the input: ASCII header lines, sequence lines separated
    by a single line-feed byte, and no other whitespace inside the sequence.

    Args:
        fasta_path: Path of the FASTA file to scan.
        pattern: Raw bytes to search for inside each record's sequence.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, where
        ``record_id`` is the full header line without the leading ``>`` and
        each position is a 0-based offset into the unwrapped sequence.
        Records without a match are omitted. An empty ``pattern`` yields
        an empty list.

    Raises:
        OSError: If ``fasta_path`` cannot be opened or read.
        UnicodeDecodeError: If a matching record's header is not ASCII.
    """
    if not pattern:
        return []

    pattern_values = np.frombuffer(pattern, dtype=np.uint8)
    pattern_len = len(pattern)

    with open(fasta_path, "rb") as file:
        data = file.read()

    # A record starts with ">" at the beginning of a line. Splitting on
    # "\n>" (rather than on every ">") keeps headers whose description
    # happens to contain ">" in one piece.
    records = data.split(b"\n>")
    if records[0].startswith(b">"):
        # The first record has no preceding newline; drop its marker by hand.
        records[0] = records[0][1:]
    else:
        # Anything before the first header line is not a record.
        del records[0]

    matches: list[tuple[str, list[int]]] = []
    for record in records:
        record_id, _, wrapped_sequence = record.partition(b"\n")
        sequence = wrapped_sequence.replace(b"\n", b"")

        # Number of start offsets at which the whole pattern still fits.
        window_count = len(sequence) - pattern_len + 1
        if window_count <= 0:
            continue

        sequence_values = np.frombuffer(sequence, dtype=np.uint8)

        # Start with every offset whose first byte matches, then narrow the
        # mask one pattern byte at a time using shifted views of the sequence.
        positions_mask = sequence_values[:window_count] == pattern_values[0]
        for offset in range(1, pattern_len):
            positions_mask &= (
                sequence_values[offset : offset + window_count]
                == pattern_values[offset]
            )

        positions = np.nonzero(positions_mask)[0]
        if positions.size:
            matches.append((record_id.decode("ascii"), positions.tolist()))

    return matches



Loading