From 18426787a99b19429e03b8de646cec02c956af5a Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 12:26:43 -0400 Subject: [PATCH 1/7] initial for making my pr happen, lolol --- myfile | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 myfile diff --git a/myfile b/myfile new file mode 100644 index 0000000..e69de29 From 5dc46554edcc5f74b7e9e0c02348aa58ab91c81d Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 13:03:44 -0400 Subject: [PATCH 2/7] iteration one Signed-off-by: Drew Wock --- rounds/1_histogram/solution.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/rounds/1_histogram/solution.py b/rounds/1_histogram/solution.py index dffbee5..24a8136 100644 --- a/rounds/1_histogram/solution.py +++ b/rounds/1_histogram/solution.py @@ -5,10 +5,30 @@ own faster implementation. """ +from concurrent.futures import ProcessPoolExecutor +import os + +CHUNK_SIZE = 8 * 1024 * 1024 def compute_histogram(path: str) -> dict[bytes, int]: """Frequency of every 2-byte bigram in the file at ``path``.""" # TODO: remove this delegation and write your own implementation here. - from .baseline import compute_histogram as _baseline + with open(path, "rb") as f: + data = f.read() - return _baseline(path) + counts: list[int] = [0] * 65536 + if len(data) == 0: + return {} + data_iter = iter(data) + window_idx = next(data_iter) + for b in data_iter: + window_idx <<= 8 + window_idx &= 0xff00 + window_idx |= b + counts[window_idx] += 1 + d = {} + for i,cnt in enumerate(counts): + if counts[i] != 0: + b = i.to_bytes(2, byteorder="big") + d[b] = cnt + return d From 8af615dee4dd174b49ed969be587011cf6e18df8 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 14:33:05 -0400 Subject: [PATCH 3/7] dna round Signed-off-by: Drew Wock --- rounds/3_dna/solution.py | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b917da..deb9cbd 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -5,8 +5,7 @@ own faster implementation. """ -from .baseline import find_matches as _baseline - +import re def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: """Find every FASTA record whose sequence contains ``pattern``. @@ -14,4 +13,34 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] Returns ``[(record_id, [positions...]), ...]`` in file order. """ # TODO: remove this delegation and write your own implementation here. - return _baseline(fasta_path, pattern) + # Step 1: read the whole FASTA file as text and decode the pattern so the + # search below can use a single ``str`` API. + pattern_str = pattern.decode("ascii") + with open(fasta_path, "r") as f: + text = f.read() + + matches: list[tuple[str, list[int]]] = [] + pattern_str = pattern.decode('ascii') + regex = re.compile(pattern_str) + + # Step 2: split the file on '>' to peel off one record at a time. The + # first element is the chunk before any header (empty for well-formed + # files) and is skipped by the ``.strip()`` guard below. + for record in text.split(">"): + if not record.strip(): + continue + + # Step 3: a record looks like ``"\n\n\n..."``. + # The id is the first line; the remaining lines are joined back into a + # single contiguous sequence string. + lines = record.split("\n") + record_id = lines[0].strip() + sequence = "".join(lines[1:]).replace(" ", "") + + positions: list[int] = [] + print(sequence) + for m in regex.finditer(sequence): + positions.append(m.start()) + if positions: + matches.append((record_id, positions)) + return matches From d2b310dd6caa72499dc7f2af27fb85cbc68b82ee Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 14:37:30 -0400 Subject: [PATCH 4/7] dna 02 Signed-off-by: Drew Wock --- rounds/3_dna/solution.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index deb9cbd..9d057ec 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -15,7 +15,6 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] # TODO: remove this delegation and write your own implementation here. # Step 1: read the whole FASTA file as text and decode the pattern so the # search below can use a single ``str`` API. - pattern_str = pattern.decode("ascii") with open(fasta_path, "r") as f: text = f.read() @@ -38,9 +37,7 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] sequence = "".join(lines[1:]).replace(" ", "") positions: list[int] = [] - print(sequence) - for m in regex.finditer(sequence): - positions.append(m.start()) + positions = [m.start() for m in regex.finditer(sequence)] if positions: matches.append((record_id, positions)) return matches From be73d3432dee526740a577eadec8d101d8054510 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 14:48:26 -0400 Subject: [PATCH 5/7] dna threads Signed-off-by: Drew Wock --- rounds/3_dna/solution.py | 49 +++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 18 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 9d057ec..2ff5d5a 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -6,6 +6,24 @@ """ import re +from concurrent.futures import ThreadPoolExecutor + +def find_match(args): + regex,record = args + # Step 3: a record looks like ``"\n\n\n..."``. + # The id is the first line; the remaining lines are joined back into a + # single contiguous sequence string. + lines = record.split("\n") + record_id = lines[0].strip() + sequence = "".join(lines[1:]).replace(" ", "") + + positions: list[int] = [] + positions = [m.start() for m in regex.finditer(sequence)] + if positions: + return (record_id, positions) + else: + return None + def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]]: """Find every FASTA record whose sequence contains ``pattern``. @@ -22,22 +40,17 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] pattern_str = pattern.decode('ascii') regex = re.compile(pattern_str) - # Step 2: split the file on '>' to peel off one record at a time. The - # first element is the chunk before any header (empty for well-formed - # files) and is skipped by the ``.strip()`` guard below. - for record in text.split(">"): - if not record.strip(): - continue - - # Step 3: a record looks like ``"\n\n\n..."``. - # The id is the first line; the remaining lines are joined back into a - # single contiguous sequence string. - lines = record.split("\n") - record_id = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "") - - positions: list[int] = [] - positions = [m.start() for m in regex.finditer(sequence)] - if positions: - matches.append((record_id, positions)) + with ThreadPoolExecutor() as ex: + futures = [] + for record in text.split(">"): + if not record.strip(): + continue + + t = ex.submit(find_match, args=(regex,record)) + futures.append(t) + + for t in futures: + result = t.result() + if result: + matches.append(result) return matches From a1d83132743dbf1b21d39bac0a364b3d0f7dff36 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 14:55:12 -0400 Subject: [PATCH 6/7] Undid my regression that used regex, lol Signed-off-by: Drew Wock --- rounds/3_dna/solution.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 2ff5d5a..8b5b934 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -9,7 +9,7 @@ from concurrent.futures import ThreadPoolExecutor def find_match(args): - regex,record = args + pattern_str,record = args # Step 3: a record looks like ``"\n\n\n..."``. # The id is the first line; the remaining lines are joined back into a # single contiguous sequence string. @@ -18,7 +18,14 @@ def find_match(args): sequence = "".join(lines[1:]).replace(" ", "") positions: list[int] = [] - positions = [m.start() for m in regex.finditer(sequence)] + start = 0 + while True: + pos = sequence.find(pattern_str, start) + if pos == -1: + break + positions.append(pos) + start = pos + 1 + if positions: return (record_id, positions) else: @@ -38,7 +45,6 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] matches: list[tuple[str, list[int]]] = [] pattern_str = pattern.decode('ascii') - regex = re.compile(pattern_str) with ThreadPoolExecutor() as ex: futures = [] @@ -46,7 +52,7 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] if not record.strip(): continue - t = ex.submit(find_match, args=(regex,record)) + t = ex.submit(find_match, args=(pattern_str,record)) futures.append(t) for t in futures: From 687935298ca1ef5e484e9a7f15a1fafb6a5a1a27 Mon Sep 17 00:00:00 2001 From: Drew Wock Date: Wed, 13 May 2026 15:20:48 -0400 Subject: [PATCH 7/7] Bytes mode Signed-off-by: Drew Wock --- rounds/3_dna/solution.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/rounds/3_dna/solution.py b/rounds/3_dna/solution.py index 8b5b934..8e61013 100644 --- a/rounds/3_dna/solution.py +++ b/rounds/3_dna/solution.py @@ -6,16 +6,20 @@ """ import re +import string from concurrent.futures import ThreadPoolExecutor +table = bytes.maketrans(b"", b"") + def find_match(args): pattern_str,record = args # Step 3: a record looks like ``"\n\n\n..."``. # The id is the first line; the remaining lines are joined back into a # single contiguous sequence string. - lines = record.split("\n") + lines = record.split(b'\n', 1) record_id = lines[0].strip() - sequence = "".join(lines[1:]).replace(" ", "") + sequence_raw = lines[1] + sequence = sequence_raw.translate(table, delete=string.whitespace.encode()) positions: list[int] = [] start = 0 @@ -27,7 +31,7 @@ def find_match(args): start = pos + 1 if positions: - return (record_id, positions) + return (record_id.decode('ascii'), positions) else: return None @@ -40,19 +44,18 @@ def find_matches(fasta_path: str, pattern: bytes) -> list[tuple[str, list[int]]] # TODO: remove this delegation and write your own implementation here. # Step 1: read the whole FASTA file as text and decode the pattern so the # search below can use a single ``str`` API. - with open(fasta_path, "r") as f: + with open(fasta_path, "rb") as f: text = f.read() matches: list[tuple[str, list[int]]] = [] - pattern_str = pattern.decode('ascii') with ThreadPoolExecutor() as ex: futures = [] - for record in text.split(">"): + for record in text.split(b">"): if not record.strip(): continue - t = ex.submit(find_match, args=(pattern_str,record)) + t = ex.submit(find_match, args=(pattern,record)) futures.append(t) for t in futures: