From e0fc536cacec5473bf1daa2b4dfba591073d0684 Mon Sep 17 00:00:00 2001
From: Sebastian Benjamin <sebastiancbenjamin@gmail.com>
Date: Wed, 21 Jan 2026 13:13:37 -0800
Subject: [PATCH] Add majority voting for UMI lengths

---
 nimble/__main__.py                |  4 +-
 nimble/fastq_barcode_processor.py | 96 ++++++++++++++++++++-----------
 2 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/nimble/__main__.py b/nimble/__main__.py
index 5e5bf30..4fb4a9a 100755
--- a/nimble/__main__.py
+++ b/nimble/__main__.py
@@ -468,9 +468,9 @@ def sort_input_bam(bam, cores, tmp_dir):
     )
     fastq_to_bam_parser.add_argument(
         '--infer-prefix-pairs',
-        help='Number of read pairs to buffer for UMI inference (default: 2000).',
+        help='Number of read pairs to buffer for UMI inference (default: 200).',
         type=int,
-        default=2000
+        default=200
     )
     fastq_to_bam_parser.add_argument(
         '--min-records-with-tso',
diff --git a/nimble/fastq_barcode_processor.py b/nimble/fastq_barcode_processor.py
index a711b38..a8c0dfa 100644
--- a/nimble/fastq_barcode_processor.py
+++ b/nimble/fastq_barcode_processor.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 import gzip
 import sys
-from collections import defaultdict
+from collections import defaultdict, Counter
 from concurrent.futures import ThreadPoolExecutor, as_completed
 from itertools import islice, chain
 import pysam
@@ -103,7 +103,6 @@ def correct_cell_barcode(raw_cb, quality_scores, whitelist, hamming_index, corre
     correction_cache[raw_cb] = best_candidate
     return best_candidate
 
-
 def infer_umi_and_tso_lengths_from_r1_records(
     r1_records,
     search_string="TTTCTTATATGGG",
@@ -113,43 +112,54 @@ def infer_umi_and_tso_lengths_from_r1_records(
     """
     Infer UMI length by locating the TSO motif in R1 reads.
     Assumes structure: [CB][UMI][TSO...][cDNA...]
-    umi_length = tso_start - cb_length
-    tso_length = len(search_string) (the removed prefix length)
+
+    Resolution strategy:
+      - Ignore reads with no TSO hit (pos == -1).
+      - Allow multiple observed TSO positions; pick the *mode* (majority) position.
+      - If fewer than min_records hits are available (e.g. EOF), use as many as we have.
     """
+
     tso_starts = []
     for rec in r1_records:
         seq = str(rec.seq)
         pos = seq.find(search_string)
-        if pos != -1:
-            tso_starts.append(pos)
-            if len(tso_starts) >= min_records:
-                break
+        if pos == -1:
+            continue  # skip non-hits for inference
+        tso_starts.append(pos)
 
-    if len(tso_starts) < min_records:
+    if len(tso_starts) == 0:
         raise ValueError(
-            f"Unable to infer UMI length: fewer than {min_records} reads contained TSO motif {search_string}"
+            f"Unable to infer UMI length: no reads contained TSO motif {search_string}"
         )
 
-    unique = set(tso_starts)
-    if len(unique) != 1:
-        raise ValueError(
-            f"Unable to infer UMI length: TSO motif found at inconsistent positions in R1: {sorted(unique)}"
+    counts = Counter(tso_starts)
+    # Choose the most common; break ties by choosing the smallest position
+    mode_pos, mode_count = min(
+        (pos, cnt) for pos, cnt in counts.items() if cnt == max(counts.values())
+    )
+    total_hits = len(tso_starts)
+    top = counts.most_common(10)
+    top_str = ", ".join([f"{p}:{c}" for p, c in top])
+    print(f"TSO hit count used for inference: {total_hits}")
+    print(f"TSO positions (top): {top_str}")
+    if total_hits < min_records:
+        print(
+            f"WARNING: only {total_hits} reads contained the TSO motif; "
+            f"min_records={min_records}. Proceeding with majority position."
         )
 
-    tso_start = next(iter(unique))
-    umi_length = tso_start - cb_length
+    umi_length = mode_pos - cb_length
     if umi_length <= 0:
         raise ValueError(
-            f"Inferred UMI length <= 0 (tso_start={tso_start}, cb_length={cb_length}). Check motif/structure."
+            f"Inferred UMI length <= 0 (mode_tso_start={mode_pos}, cb_length={cb_length}). "
+            f"Check motif/structure."
         )
 
     tso_length = len(search_string)
-
-    print(f"Inferred UMI length: {umi_length}")
+    print(f"Inferred UMI length: {umi_length} (from majority TSO start {mode_pos})")
     print(f"Using TSO length: {tso_length} (len of search_string)")
     return umi_length, tso_length
 
-
 def parse_10x_barcode_from_r1(sequence, cb_length=16, umi_length=12, tso_length=0):
     """
     Parse CB and UMI from R1 sequence, then drop optional TSO.
@@ -216,7 +226,6 @@ def process_pair(
     r1_bam = pysam.AlignedSegment()
     r1_bam.query_name = r1_name
     r1_bam.query_sequence = remaining_r1_seq
-    # IMPORTANT: drop the same prefix from qualities as from sequence
     r1_bam.query_qualities = r1_record.letter_annotations["phred_quality"][prefix_len:]
     r1_bam.flag = 77  # paired, first in pair, unmapped, mate unmapped
     r1_bam.reference_id = -1
@@ -249,22 +258,24 @@ def fastq_to_bam_with_barcodes(
     umi_length=None,
     infer_umi=True,
     tso_search_string="TTTCTTATATGGG",
-    infer_prefix_pairs=2000,
+    infer_prefix_pairs=200,
     min_records_with_tso=10,
 ):
     """
     Convert paired FASTQ files to unaligned BAM with CB/UB tags using multiple threads.
     - Corrects CB using whitelist-based Hamming distance 0/1 correction with CB qualities.
     - Optionally infers UMI length by locating a TSO motif in R1 and removes the TSO from R1.
-      (If infer_umi=False, uses umi_length as provided and does not remove TSO unless you set tso_search_string
-       to match and set infer_umi=True.)
-
+    
     Args:
         r1_fastq, r2_fastq: FASTQ(.gz) paths
         cb_whitelist_file: whitelist file
         output_bam: output BAM path
         num_cores: threads
         cb_length: cell barcode length
+        cb_whitelist_file: whitelist file
+        output_bam: output BAM path
+        num_cores: threads
+        cb_length: cell barcode length
         umi_length: if infer_umi=False, required
         infer_umi: infer umi_length and remove tso motif
         tso_search_string: TSO motif used for inference; tso_length = len(tso_search_string)
@@ -304,15 +315,39 @@ def fastq_to_bam_with_barcodes(
             r1_iter = SeqIO.parse(r1_handle, "fastq")
             r2_iter = SeqIO.parse(r2_handle, "fastq")
 
-            # Buffer a small matched prefix for inference so we don't desync iterators.
-            prefix_pairs = list(islice(zip(r1_iter, r2_iter), infer_prefix_pairs))
+            prefix_pairs = []
+            tso_hit_r1 = []
+
+            target_hits = infer_prefix_pairs if infer_umi else 0
+
+            while True:
+                try:
+                    r1_record = next(r1_iter)
+                    r2_record = next(r2_iter)
+                except StopIteration:
+                    break
+
+                prefix_pairs.append((r1_record, r2_record))
+
+                if infer_umi:
+                    pos = str(r1_record.seq).find(tso_search_string)
+                    if pos != -1:
+                        tso_hit_r1.append(r1_record)
+
+                    # Stop once we have enough usable reads for inference
+                    if len(tso_hit_r1) >= target_hits:
+                        break
+
+                # If not inferring, we only need a minimal buffer to avoid empty input
+                if not infer_umi and len(prefix_pairs) >= 1:
+                    break
+
             if not prefix_pairs:
                 raise ValueError("Input FASTQs are empty or could not be read")
 
             if infer_umi:
-                prefix_r1 = [p[0] for p in prefix_pairs]
                 inferred_umi_length, tso_length = infer_umi_and_tso_lengths_from_r1_records(
-                    prefix_r1,
+                    tso_hit_r1,
                     search_string=tso_search_string,
                     cb_length=cb_length,
                     min_records=min_records_with_tso,
@@ -324,7 +359,6 @@ def fastq_to_bam_with_barcodes(
                 umi_length_use = int(umi_length)
                 tso_length = 0
 
-            # Reconstruct full streams
             r1_full = chain((p[0] for p in prefix_pairs), r1_iter)
             r2_full = chain((p[1] for p in prefix_pairs), r2_iter)
 
@@ -347,7 +381,6 @@ def fastq_to_bam_with_barcodes(
                     )
                     futures[fut] = True
 
-                    # Throttle in-flight futures to limit memory use
                     if len(futures) >= num_cores * 100:
                         done_any = 0
                         for done in as_completed(list(futures)[: num_cores * 10]):
@@ -365,7 +398,6 @@ def fastq_to_bam_with_barcodes(
                     if stats["total_pairs"] % 1_000_000 == 0:
                         print(f"Processed {stats['total_pairs']} read pairs...")
 
-                # Drain remaining futures
                 for done in as_completed(list(futures.keys())):
                     res = done.result()
                     if res: