Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file added myfile
Empty file.
24 changes: 22 additions & 2 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,30 @@
own faster implementation.
"""

from concurrent.futures import ProcessPoolExecutor
import os

# 8 MiB expressed in bytes.
# NOTE(review): nothing visible in this diff reads CHUNK_SIZE — confirm it is
# still needed before relying on it.
CHUNK_SIZE = 8 * 1024 * 1024

def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Every pair of adjacent bytes is counted once, so overlapping pairs are
    included: ``b"aaa"`` contains the bigram ``b"aa"`` twice.

    Args:
        path: File to analyse. It is opened in binary mode and read in full.

    Returns:
        A dict mapping each bigram that occurs at least once to its number
        of occurrences. Keys are the two bytes in file order. A file shorter
        than two bytes yields an empty dict.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "rb") as f:
        data = f.read()

    # One slot per possible bigram, indexed by (first_byte << 8) | second_byte.
    # A flat list is cheaper to update than a dict keyed by 2-byte objects.
    counts = [0] * 65536

    # Pair every byte with its successor. The memoryview slice is a zero-copy
    # view, so the shifted sequence does not duplicate the file in memory.
    # zip stops at the shorter input, which also covers empty and 1-byte files.
    for first, second in zip(data, memoryview(data)[1:]):
        counts[(first << 8) | second] += 1

    # Big-endian conversion puts the first byte of the pair first in the key.
    return {
        index.to_bytes(2, byteorder="big"): count
        for index, count in enumerate(counts)
        if count
    }
52 changes: 50 additions & 2 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,35 @@
own faster implementation.
"""

from .baseline import find_matches as _baseline
import re
import string
from concurrent.futures import ThreadPoolExecutor

# Identity translation table: with empty "from" and "to" arguments,
# bytes.maketrans maps every byte value to itself.
table = bytes.maketrans(b"", b"")

def find_match(args):
    """Locate every occurrence of a pattern inside one FASTA record.

    The function takes a single tuple so it can be handed directly to an
    executor's ``submit``/``map``.

    Args:
        args: A ``(pattern, record)`` pair of ``bytes`` objects. ``record``
            is one record without its leading ``>``: the id line followed
            by the sequence lines.

    Returns:
        ``(record_id, positions)``, where ``record_id`` is the stripped
        first line decoded as ASCII and ``positions`` holds the 0-based
        offsets of every match (overlapping matches included) in the
        sequence with all whitespace removed. Returns ``None`` when the
        pattern does not occur in the record.

    Raises:
        UnicodeDecodeError: If a matching record's id is not pure ASCII.
    """
    pattern, record = args

    # partition (unlike split + indexing) cannot fail: a header-only record
    # with no newline simply yields an empty sequence.
    header, _, body = record.partition(b"\n")
    record_id = header.strip()

    # Join the sequence lines into one contiguous run by deleting all ASCII
    # whitespace. A table of None means "delete only, translate nothing".
    sequence = body.translate(None, delete=string.whitespace.encode())

    positions = []
    start = 0
    while (pos := sequence.find(pattern, start)) != -1:
        positions.append(pos)
        # Advance by one, not len(pattern), so overlapping matches are found.
        start = pos + 1

    if not positions:
        return None
    return (record_id.decode("ascii"), positions)


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every occurrence of ``pattern`` in each record of a FASTA file.

    Args:
        fasta_path: FASTA file to search. It is opened in binary mode and
            read in full.
        pattern: Byte sequence to look for inside each record's sequence.

    Returns ``[(record_id, [positions...]), ...]`` in file order. Records
    in which the pattern does not occur are omitted.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # Read the file as raw bytes; the search works on bytes throughout, so
    # neither the file content nor the pattern is decoded here.
    with open(fasta_path, "rb") as f:
        text = f.read()

    # Every record starts with ">". Splitting on it leaves an empty or
    # whitespace-only piece before the first marker, which is skipped.
    records = [record for record in text.split(b">") if record.strip()]

    with ThreadPoolExecutor() as ex:
        # Executor.map yields results in submission order, which keeps the
        # output in file order regardless of which worker finishes first.
        results = list(ex.map(find_match, [(pattern, record) for record in records]))

    # find_match returns None for records without a match; drop those.
    return [result for result in results if result]
Loading