Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,4 @@ scripts/
```

Each round's `data/` directory is generated locally and gitignored.
This is Tim Schilling's PR
22 changes: 19 additions & 3 deletions rounds/1_histogram/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,27 @@
passes out of the box. Replace the body of ``compute_histogram`` with your
own faster implementation.
"""
from collections import Counter
from struct import unpack


def get_biagrams(data):
    """Yield every overlapping 2-byte bigram of ``data``, in order.

    For ``b"ABCD"`` this yields ``b"AB"``, ``b"BC"``, ``b"CD"``.  Inputs
    shorter than two bytes yield nothing.

    Bug fixed: the previous version primed two values with ``next()``, so a
    0- or 1-byte input let ``StopIteration`` escape the generator, which
    PEP 479 converts to a ``RuntimeError``.
    """
    # Slicing a bytes object returns bytes, so data[i:i + 2] is already the
    # 2-byte bigram -- no struct.unpack round-trip through 1-byte chunks.
    for i in range(len(data) - 1):
        yield data[i:i + 2]


def compute_histogram(path: str) -> dict[bytes, int]:
    """Frequency of every 2-byte bigram in the file at ``path``.

    Slides a 2-byte window across the file's bytes: ``b"ABCD"`` contributes
    ``b"AB"``, ``b"BC"``, ``b"CD"``.  Files shorter than two bytes produce
    an empty mapping.

    Defect fixed: the diff residue left an early ``return _baseline(path)``
    before the real implementation, making it unreachable; that delegation
    is removed here.
    """
    # Read the whole file into memory once -- the window slicing below needs
    # random access to the byte buffer.
    with open(path, "rb") as f:
        data = f.read()

    # Counter is a dict subclass, so it satisfies the declared return type.
    # The generator avoids materialising a throwaway list of bigrams.
    return Counter(data[i:i + 2] for i in range(len(data) - 1))
74 changes: 71 additions & 3 deletions rounds/3_dna/solution.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,82 @@
passes out of the box. Replace the body of ``find_matches`` with your
own faster implementation.
"""
import os
from concurrent.futures import ThreadPoolExecutor

from .baseline import find_matches as _baseline

def _find_record_matches(pattern, sequence):
positions: list[int] = []
start = 0
while True:
pos = sequence.find(pattern, start)
if pos == -1:
break
positions.append(pos)
start = pos + 1
return positions


def _search_chunk(fasta_path, chunk_start, chunk_end, pattern):
    """Search the FASTA records that *begin* inside ``[chunk_start, chunk_end)``.

    Reads the byte range, extends it forward to finish the last record that
    straddles ``chunk_end``, and discards the partial record at the front
    (that record is owned by whichever chunk contains its ``>`` header).
    Returns ``[(record_id, [positions...]), ...]`` in file order within the
    chunk.  ``chunk_end is None`` means "read to EOF".
    """
    with open(fasta_path, "rb") as f:
        f.seek(chunk_start)
        if chunk_end is None:
            text = f.read()
        else:
            # One bulk read for the chunk, then a few readline() calls to
            # complete the last record that extends past our boundary.
            # Collect parts in a list to avoid O(n²) bytes concatenation.
            parts = [f.read(chunk_end - chunk_start)]
            while True:
                line = f.readline()
                # Stop at EOF or at the next record's header line; that
                # header belongs to the following chunk.
                if not line or line.startswith(b">"):
                    break
                parts.append(line)
            text = b"".join(parts)

    # For chunks that don't start at byte 0, skip the partial-record fragment
    # at the front (bytes belonging to the previous chunk's last record).
    if chunk_start > 0:
        if not text.startswith(b">"):
            idx = text.find(b"\n>")
            # No header in this chunk at all: every byte belongs to a record
            # started in an earlier chunk, so there is nothing to report.
            if idx == -1:
                return []
            text = text[idx + 1:]  # keep the ">"

    results = []
    # Each record is "<header line>\n<sequence lines...>"; splitting on ">"
    # works because the leading fragment (if any) was stripped above.
    for record in text.split(b">"):
        if not record.strip():
            continue
        lines = record.split(b"\n")
        # The full header line is used as the record id.
        record_id = lines[0].strip().decode("ascii")
        # Concatenate sequence lines and drop embedded spaces.
        # NOTE(review): b"\r" is not stripped, so CRLF-terminated files would
        # leave \r bytes in the sequence -- assumes \n-only input; confirm.
        sequence = b"".join(lines[1:]).replace(b" ", b"")
        positions = _find_record_matches(pattern, sequence)
        if positions:
            results.append((record_id, positions))
    return results


def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]:
    """Find every FASTA record whose sequence contains ``pattern``.

    Returns ``[(record_id, [positions...]), ...]`` in file order.

    The file is partitioned into one byte-range chunk per CPU and the chunks
    are searched concurrently; ``_search_chunk`` extends each range to a
    record boundary so no record is split or double-counted.

    Defect fixed: the diff residue left an early
    ``return _baseline(fasta_path, pattern)`` before this implementation,
    making it unreachable; that stale delegation is removed here.
    """
    num_threads = os.cpu_count() or 4
    file_size = os.path.getsize(fasta_path)
    # max(1, ...) keeps chunk_size positive for files smaller than the
    # thread count; surplus chunks past EOF simply read nothing.
    chunk_size = max(1, file_size // num_threads)

    # The last chunk gets end=None ("read to EOF") so the remainder of the
    # integer division is not dropped.
    chunks = [
        (i * chunk_size, (i + 1) * chunk_size if i < num_threads - 1 else None)
        for i in range(num_threads)
    ]

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            executor.submit(_search_chunk, fasta_path, start, end, pattern)
            for start, end in chunks
        ]

    # Gathering in submission order preserves file order across chunks.
    results = []
    for future in futures:
        results.extend(future.result())
    return results
Loading