Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@ scripts/
```

Each round's `data/` directory is generated locally and gitignored.
This is RossK1's PR
19 changes: 10 additions & 9 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,15 @@
"""Your Round 1 solution — byte-pair histogram.
"""Your Round 1 solution — byte-pair histogram."""

**Edit this file.** It currently delegates to ``baseline.py`` so everything
passes out of the box. Replace the body of ``compute_histogram`` with your
own faster implementation.
"""
import numpy as np
import mmap


def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Memory-maps the file, packs each adjacent byte pair into one 16-bit
    key (``first << 8 | second``), and counts all 65536 possible keys in
    a single ``np.bincount`` pass.

    Returns:
        Mapping from each 2-byte sequence that occurs to its count.
        Pairs that never occur are omitted; an empty or 1-byte file
        yields ``{}``.
    """
    with open(path, "rb") as f:
        try:
            with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
                # Copy while the mmap is still open so numpy never holds
                # an exported pointer into a closed mapping.
                data = np.frombuffer(mm, dtype=np.uint8).copy()
        except ValueError:
            # mmap refuses zero-length files; an empty file has no bigrams.
            return {}
    # Each bigram becomes one uint16 key: (first byte << 8) | second byte.
    keys = data[:-1].astype(np.uint16) << 8 | data[1:].astype(np.uint16)
    counts = np.bincount(keys, minlength=65536)
    return {bytes([k >> 8, k & 0xFF]): int(counts[k]) for k in np.nonzero(counts)[0]}
34 changes: 25 additions & 9 deletions rounds/2_corruption/solution.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,30 @@
"""Your Round 2 solution — corruption scanner.
from __future__ import annotations

**Edit this file.** It currently delegates to ``baseline.py`` so everything
passes out of the box. Replace the body of ``find_corruptions`` with your
own faster implementation.
"""

from .baseline import find_corruptions as _baseline
import numpy as np


def find_corruptions(ref_path: str, cor_path: str) -> list[tuple[int, int]]:
    """Return ``[(offset, length), ...]`` for every differing byte range.

    Compares the two files bytewise and coalesces consecutive differing
    positions into maximal runs, reported in ascending offset order.

    Raises:
        ValueError: if the two files are not the same length.
    """
    with open(ref_path, "rb") as f:
        ref = np.frombuffer(f.read(), dtype=np.uint8)
    with open(cor_path, "rb") as f:
        cor = np.frombuffer(f.read(), dtype=np.uint8)

    if len(ref) != len(cor):
        raise ValueError("reference and corrupted files differ in length")

    # Boolean mask of positions where the files disagree.
    mask = ref != cor
    if not mask.any():
        return []

    # Pad with a zero on each side so every run has both a rising (+1)
    # and a falling (-1) edge in the diff, even at the array ends.
    padded = np.empty(len(mask) + 2, dtype=np.int8)
    padded[0] = 0
    padded[1:-1] = mask.view(np.int8)
    padded[-1] = 0
    d = np.diff(padded)
    starts = np.where(d == 1)[0]   # first differing offset of each run
    ends = np.where(d == -1)[0]    # one past the last differing offset

    return [(int(s), int(e - s)) for s, e in zip(starts, ends)]
54 changes: 45 additions & 9 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,53 @@
"""Your Round 3 solution — DNA sequence matcher.
"""Your Round 3 solution — DNA sequence matcher."""

**Edit this file.** It currently delegates to ``baseline.py`` so everything
passes out of the box. Replace the body of ``find_matches`` with your
own faster implementation.
"""

from .baseline import find_matches as _baseline
from __future__ import annotations
from concurrent.futures import ThreadPoolExecutor, as_completed
import mmap
import os


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Positions are offsets into the record's sequence with newlines
    removed; overlapping occurrences are all reported.

    Returns ``[(record_id, [positions...]), ...]`` in file order,
    omitting records without any match.
    """
    # mmap cannot map an empty file, and an empty file has no records.
    if os.path.getsize(fasta_path) == 0:
        return []

    # Read as bytes — no decode overhead, the pattern stays as bytes.
    with open(fasta_path, "rb") as f:
        with mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            data = bytes(mm)
    size = len(data)

    # Record boundaries at each '>' — assumes '>' appears only at the
    # start of a header line in well-formed input (TODO confirm against
    # the corpus; a '>' inside a description would split the record).
    offsets = [0]
    pos = data.find(b">", 1)
    while pos != -1:
        offsets.append(pos)
        pos = data.find(b">", pos + 1)
    offsets.append(size)

    def process_record(
        start: int, end: int, idx: int
    ) -> tuple[int, tuple[str, list[int]]] | None:
        # One record: header line, then sequence lines joined newline-free.
        chunk = data[start:end]
        lines = chunk.split(b"\n")
        record_id = lines[0][1:].rstrip().decode("ascii")
        sequence = b"".join(lines[1:])
        positions = []
        start_pos = 0
        # Advance by one past each hit so overlapping matches are found.
        while (hit := sequence.find(pattern, start_pos)) != -1:
            positions.append(hit)
            start_pos = hit + 1
        if positions:
            return (idx, (record_id, positions))
        return None

    max_workers = min(32, (os.cpu_count() or 1) * 2)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [
            executor.submit(process_record, offsets[i], offsets[i + 1], i)
            for i in range(len(offsets) - 1)
        ]
        results = [r for f in as_completed(futures) if (r := f.result()) is not None]

    # Restore file order — as_completed yields in completion order.
    results.sort(key=lambda x: x[0])
    return [r for _, r in results]
Loading