Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified README.md
Binary file not shown.
27 changes: 23 additions & 4 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,29 @@
own faster implementation.
"""

from array import array


def compute_histogram(path: str) -> dict[bytes, int]:
    """Return the frequency of every 2-byte bigram in the file at ``path``.

    The file is read fully into memory in binary mode and every pair of
    adjacent bytes (overlapping, so ``b"abc"`` contributes ``b"ab"`` and
    ``b"bc"``) is counted.

    Args:
        path: Filesystem path of the file to analyse.

    Returns:
        A dict mapping each bigram that occurs at least once (as a 2-byte
        ``bytes`` object) to its number of occurrences. Files shorter than
        two bytes yield an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        data = f.read()

    if len(data) < 2:
        return {}

    # One slot per possible bigram (256 * 256), indexed by the big-endian
    # value of the two bytes. "Q" (64-bit) is used because "I" is only
    # guaranteed to be 2 bytes wide and array assignment raises
    # OverflowError once a count exceeds the item size.
    counts = array("Q", [0]) * 65536

    # Pair each byte with its successor; the memoryview slice avoids copying
    # the whole buffer just to shift it by one position.
    for prev, curr in zip(data, memoryview(data)[1:]):
        counts[(prev << 8) | curr] += 1

    return {
        index.to_bytes(2, "big"): count
        for index, count in enumerate(counts)
        if count
    }
56 changes: 53 additions & 3 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,63 @@
own faster implementation.
"""

from .baseline import find_matches as _baseline
from __future__ import annotations

def _find_positions(sequence: bytes, pattern: bytes) -> list[int]:
    """Return the start offset of every occurrence of ``pattern``.

    Occurrences may overlap: the search resumes one byte after each hit.
    An empty ``pattern`` matches at every offset from 0 through
    ``len(sequence)``, following ``bytes.find`` semantics.
    """
    positions = []
    find = sequence.find  # bound once; this loop runs per match

    pos = find(pattern)
    while pos != -1:
        positions.append(pos)
        pos = find(pattern, pos + 1)

    return positions


def _iter_records(lines):
    """Yield ``(record_id, sequence)`` for each FASTA record in ``lines``.

    ``lines`` is an iterable of ``bytes`` lines. A line starting with ``>``
    opens a new record whose id is the rest of that line, stripped and
    decoded as ASCII. Following lines are stripped and concatenated into the
    record's sequence. Lines before the first header belong to no record and
    are skipped.
    """
    record_id = None
    parts = []

    for line in lines:
        if line.startswith(b">"):
            if record_id is not None:
                yield record_id, b"".join(parts)
            record_id = line[1:].strip().decode("ascii")
            parts = []
        elif record_id is not None:
            parts.append(line.strip())

    # The last record has no following header to trigger its emission.
    if record_id is not None:
        yield record_id, b"".join(parts)


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Sequence lines of a record are joined before searching, so matches that
    span a line break are found.

    Args:
        fasta_path: Path of the FASTA file, opened in binary mode.
        pattern: Byte string to search for.

    Returns:
        ``[(record_id, [positions...]), ...]`` in file order, containing only
        records with at least one match. Positions are 0-based offsets into
        the joined sequence and include overlapping matches.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If a header line is not valid ASCII.
    """
    matches: list[tuple[str, list[int]]] = []

    with open(fasta_path, "rb") as f:
        for record_id, sequence in _iter_records(f):
            positions = _find_positions(sequence, pattern)
            if positions:
                matches.append((record_id, positions))

    return matches
Loading