pachterlab · Yenaled · Mar 8, 2026 · Jul 25, 2023 · Jul 25, 2023 · Jul 25, 2023
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,7 +12,7 @@ jobs:
       - name: Setup python
         uses: actions/setup-python@v1
         with:
-          python-version: '3.9.22'
+          python-version: '3.10.19'
           architecture: x64
       - name: Install dependencies
         run: pip install -r dev-requirements.txt
@@ -22,7 +22,7 @@ jobs:
     runs-on: ubuntu-latest
     strategy:
       matrix:
-        python: [3.9.22, 3.10.17 ]
+        python: [3.10.19 ]
         os: [ubuntu-20.04]
     name: Test on Python ${{ matrix.python }}
     steps:

diff --git a/.gitignore b/.gitignore
@@ -65,6 +65,7 @@ instance/
 
 # Mac stuff:
 .DS_Store
+*.swp
 
 # Sphinx documentation
 docs/_build/
@@ -114,4 +115,4 @@ venv.bak/
 /.idea/
 
 # Temp files
-/scratch/
+/scratch/
diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -1,5 +1,5 @@
 bumpversion==0.6.0
-coverage==5.2.1
+coverage==7.6.1
 flake8==3.8.2
 pytest==8.2.2
 pytest-cov==5.0.0
@@ -8,5 +8,5 @@ sphinx>=3.3.1
 sphinx-autoapi>=1.5.1
 sphinx_rtd_theme>=0.5.0
 twine>=2.0.0
-wheel==0.38.1
+wheel==0.46.2
 yapf==0.30.0
diff --git a/kb_python/bins/linux/kallisto/kallisto b/kb_python/bins/linux/kallisto/kallisto
diff --git a/kb_python/bins/linux/kallisto/kallisto_k64 b/kb_python/bins/linux/kallisto/kallisto_k64
diff --git a/kb_python/bins/linux/kallisto/kallisto_optoff b/kb_python/bins/linux/kallisto/kallisto_optoff
diff --git a/kb_python/bins/linux/kallisto/kallisto_optoff_k64 b/kb_python/bins/linux/kallisto/kallisto_optoff_k64
diff --git a/kb_python/bins/linux/kallisto/license.txt b/kb_python/bins/linux/kallisto/license.txt
diff --git a/kb_python/count.py b/kb_python/count.py
@@ -658,8 +658,12 @@ def bustools_whitelist(
 
 
 def matrix_to_cellranger(
-    matrix_path: str, barcodes_path: str, genes_path: str, t2g_path: str,
-    out_dir: str, gzip: bool = False
+    matrix_path: str,
+    barcodes_path: str,
+    genes_path: str,
+    t2g_path: str,
+    out_dir: str,
+    gzip: bool = False
 ) -> Dict[str, str]:
     """Convert bustools count matrix to cellranger-format matrix.
 
@@ -1065,8 +1069,10 @@ def filter_with_bustools(
         if cellranger:
             if not tcc:
                 cr_result = matrix_to_cellranger(
-                    count_result['mtx'], count_result['barcodes'],
-                    count_result['genes'], t2g_path,
+                    count_result['mtx'],
+                    count_result['barcodes'],
+                    count_result['genes'],
+                    t2g_path,
                     os.path.join(counts_dir, CELLRANGER_DIR),
                     gzip=gzip
                 )
@@ -1290,7 +1296,7 @@ def count(
         by_name: Aggregate counts by name instead of ID.
         cellranger: Whether to convert the final count matrix into a
             cellranger-compatible matrix, defaults to `False`
-        gzip: Whether to gzip compress cellranger output matrices, 
+        gzip: Whether to gzip compress cellranger output matrices,
             defaults to `False`
         delete_bus: Whether to delete intermediate BUS files after successful count,
             defaults to `False`
@@ -1649,8 +1655,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
             final_result = quant_result if quant else count_result
             if cellranger:
                 cr_result = matrix_to_cellranger(
-                    count_result['mtx'], count_result['barcodes'],
-                    count_result['genes'], t2g_path,
+                    count_result['mtx'],
+                    count_result['barcodes'],
+                    count_result['genes'],
+                    t2g_path,
                     os.path.join(counts_dir, f'{CELLRANGER_DIR}{suffix}'),
                     gzip=gzip
                 )
@@ -1760,24 +1768,26 @@ def update_results_with_suffix(current_results, new_results, suffix):
     if delete_bus:
         logger.info('Deleting intermediate BUS files to save disk space')
         bus_files_to_delete = []
-        
+
         # Collect all .bus files from results
         if 'bus' in unfiltered_results:
             bus_files_to_delete.append(unfiltered_results['bus'])
         if 'bus_scs' in unfiltered_results:
             bus_files_to_delete.append(unfiltered_results['bus_scs'])
-        
+
         # For smartseq3, delete suffix versions too
         for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
             if f'bus{suffix}' in unfiltered_results:
                 bus_files_to_delete.append(unfiltered_results[f'bus{suffix}'])
             if f'bus_scs{suffix}' in unfiltered_results:
-                bus_files_to_delete.append(unfiltered_results[f'bus_scs{suffix}'])
-
+                bus_files_to_delete.append(
+                    unfiltered_results[f'bus_scs{suffix}']
+                )
+
         # Delete filtered bus if exists
         if 'filtered' in results and 'bus_scs' in results['filtered']:
             bus_files_to_delete.append(results['filtered']['bus_scs'])
-        
+
         # Delete each BUS file
         for bus_file in bus_files_to_delete:
             if bus_file and os.path.exists(bus_file):
@@ -1875,7 +1885,7 @@ def count_nac(
         by_name: Aggregate counts by name instead of ID.
         cellranger: Whether to convert the final count matrix into a
             cellranger-compatible matrix, defaults to `False`
-        gzip: Whether to gzip compress cellranger output matrices, 
+        gzip: Whether to gzip compress cellranger output matrices,
             defaults to `False`
         cellranger_style: Whether to organize output in CellRanger-style directories
             (spliced/ and unspliced/ subdirectories), defaults to `False`
@@ -2181,13 +2191,19 @@ def update_results_with_suffix(current_results, new_results, suffix):
                         elif i == 1:  # unprocessed/unspliced
                             cr_dir = os.path.join(counts_dir, 'unspliced')
                         else:  # ambiguous
-                            cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')
+                            cr_dir = os.path.join(
+                                counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
+                            )
                     else:
-                        cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')
-
+                        cr_dir = os.path.join(
+                            counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
+                        )
+
                     cr_result = matrix_to_cellranger(
-                        count_result[i]['mtx'], count_result[i]['barcodes'],
-                        count_result[i]['genes'], t2g_path,
+                        count_result[i]['mtx'],
+                        count_result[i]['barcodes'],
+                        count_result[i]['genes'],
+                        t2g_path,
                         cr_dir,
                         gzip=gzip
                     )
@@ -2225,7 +2241,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
                     update_results_with_suffix(prefix_results, res, suffix)
                     if cellranger:
                         cr_result = matrix_to_cellranger(
-                            res['mtx'], res['barcodes'], res['genes'], t2g_path,
+                            res['mtx'],
+                            res['barcodes'],
+                            res['genes'],
+                            t2g_path,
                             os.path.join(
                                 counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
                             ),
@@ -2352,17 +2371,28 @@ def update_results_with_suffix(current_results, new_results, suffix):
                     if cellranger_style:
                         # Create spliced/unspliced subdirectories for CellRanger style
                         if i == 0:  # processed/spliced
-                            cr_dir = os.path.join(filtered_counts_dir, 'spliced')
+                            cr_dir = os.path.join(
+                                filtered_counts_dir, 'spliced'
+                            )
                         elif i == 1:  # unprocessed/unspliced
-                            cr_dir = os.path.join(filtered_counts_dir, 'unspliced')
+                            cr_dir = os.path.join(
+                                filtered_counts_dir, 'unspliced'
+                            )
                         else:  # ambiguous
-                            cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')
+                            cr_dir = os.path.join(
+                                filtered_counts_dir,
+                                f'{CELLRANGER_DIR}_{prefix}'
+                            )
                     else:
-                        cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')
-
+                        cr_dir = os.path.join(
+                            filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}'
+                        )
+
                     cr_result = matrix_to_cellranger(
-                        count_result[i]['mtx'], count_result[i]['barcodes'],
-                        count_result[i]['genes'], t2g_path,
+                        count_result[i]['mtx'],
+                        count_result[i]['barcodes'],
+                        count_result[i]['genes'],
+                        t2g_path,
                         cr_dir,
                         gzip=gzip
                     )
@@ -2396,7 +2426,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
                     filtered_results[prefix] = {}
                     if cellranger:
                         cr_result = matrix_to_cellranger(
-                            res['mtx'], res['barcodes'], res['genes'], t2g_path,
+                            res['mtx'],
+                            res['barcodes'],
+                            res['genes'],
+                            t2g_path,
                             os.path.join(
                                 filtered_counts_dir,
                                 f'{CELLRANGER_DIR}_{prefix}'
@@ -2488,19 +2521,21 @@ def update_results_with_suffix(current_results, new_results, suffix):
     if delete_bus:
         logger.info('Deleting intermediate BUS files to save disk space')
         bus_files_to_delete = []
-        
+
         # Collect all .bus files from results
         prefixes = ['processed', 'unprocessed', 'ambiguous']
         for prefix in prefixes:
             if prefix in unfiltered_results:
                 for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
                     if f'bus{suffix}' in unfiltered_results[prefix]:
-                        bus_files_to_delete.append(unfiltered_results[prefix][f'bus{suffix}'])
-
+                        bus_files_to_delete.append(
+                            unfiltered_results[prefix][f'bus{suffix}']
+                        )
+
         # Delete filtered bus files if they exist
         if 'filtered' in results and 'bus_scs' in results['filtered']:
             bus_files_to_delete.append(results['filtered']['bus_scs'])
-        
+
         # Delete each BUS file
         for bus_file in bus_files_to_delete:
             if bus_file and os.path.exists(bus_file):

diff --git a/kb_python/main.py b/kb_python/main.py
@@ -408,7 +408,7 @@ def parse_count(
             'Plots for TCC matrices have not yet been implemented. '
             'The HTML report will not contain any plots.'
         )
-    # Note: We are currently not supporting --genomebam
+
     if args.genomebam:
         parser.error('--genomebam is not currently supported')
     if args.genomebam and not args.gtf:
@@ -591,11 +591,11 @@ def parse_count(
             parser.error(
                 f'Option `--aa` cannot be used with workflow {args.workflow}.'
             )
-        
+
         # Auto-enable gzip and cellranger-style when --cellranger is used
         use_gzip = args.cellranger and not args.no_gzip or args.gzip
         use_cellranger_style = args.cellranger
-        
+
         from .count import count_nac
         count_nac(
             args.i,
@@ -1462,7 +1462,10 @@ def setup_count_args(
     )
     parser_count.add_argument(
         '--gzip',
-        help='Gzip compress output matrices (matrix.mtx.gz, barcodes.tsv.gz, genes.tsv.gz). Automatically enabled with --cellranger',
+        help=(
+            'Gzip compress output matrices (matrix.mtx.gz, barcodes.tsv.gz, genes.tsv.gz). '
+            'Automatically enabled with --cellranger. '
+        ),
         action='store_true'
     )
     parser_count.add_argument(
@@ -1472,7 +1475,9 @@ def setup_count_args(
     )
     parser_count.add_argument(
         '--delete-bus',
-        help='Delete intermediate BUS files after successful count to save disk space',
+        help=(
+            'Delete intermediate BUS files after successful count to save disk space'
+        ),
         action='store_true'
     )
     parser_count.add_argument(

diff --git a/kb_python/ref.py b/kb_python/ref.py
@@ -1,7 +1,7 @@
 import glob
-import itertools
 import os
 import tarfile
+from collections import defaultdict
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import ngs_tools as ngs
@@ -71,6 +71,10 @@ def generate_mismatches(name, sequence):
     lengths = set()
     features = {}
     variants = {}
+
+    # Store all original sequences to check for collisions with variants
+    original_sequences = set()
+
     # Generate all feature barcode variations before saving to check for collisions.
     for i, row in df_features.iterrows():
         # Check that the first column contains the sequence
@@ -83,6 +87,8 @@ def generate_mismatches(name, sequence):
 
         lengths.add(len(row.sequence))
         features[row['name']] = row.sequence
+        original_sequences.add(row.sequence)
+
         variants[row['name']] = {
             name: seq
             for name, seq in generate_mismatches(row['name'], row.sequence)
@@ -103,45 +109,36 @@ def generate_mismatches(name, sequence):
                 ','.join(str(l) for l in lengths)  # noqa
             )
         )
-    # Find & remove collisions between barcode and variants
-    for feature in variants.keys():
-        _variants = variants[feature]
-        collisions = set(_variants.values()) & set(features.values())
-        if collisions:
-            # Remove collisions
+
+    # Invert variants: sequence -> list of (feature_name, variant_name)
+    seq_to_variants = defaultdict(list)
+    for feature_name, feature_variants in variants.items():
+        for variant_name, seq in feature_variants.items():
+            seq_to_variants[seq].append((feature_name, variant_name))
+
+    # Drop variants colliding with a barcode or another feature's variant
+    for seq, variant_list in seq_to_variants.items():
+        # 1. Check collision with original barcodes
+        if seq in original_sequences:
             logger.warning(
-                f'Colision detected between variants of feature barcode {feature} '
-                'and feature barcode(s). These variants will be removed.'
+                f'Collision detected between variants of feature barcode(s) {",".join(set(v[0] for v in variant_list))}'
+                f' and original feature barcode {seq}. These variants will be removed.'
             )
-            variants[feature] = {
-                name: seq
-                for name, seq in _variants.items()
-                if seq not in collisions
-            }
-
-    # Find & remove collisions between variants
-    for f1, f2 in itertools.combinations(variants.keys(), 2):
-        v1 = variants[f1]
-        v2 = variants[f2]
-
-        collisions = set(v1.values()) & set(v2.values())
-        if collisions:
+            for feature_name, variant_name in variant_list:
+                if variant_name in variants[feature_name]:
+                    del variants[feature_name][variant_name]
+            continue
+
+        # 2. Check collision between variants of DIFFERENT features
+        features_involved = set(v[0] for v in variant_list)
+        if len(features_involved) > 1:
             logger.warning(
-                f'Collision(s) detected between variants of feature barcodes {f1} and {f2}: '
-                f'{",".join(collisions)}. These variants will be removed.'
+                f'Collision(s) detected between variants of feature barcodes {",".join(features_involved)}: '
+                f'{seq}. These variants will be removed.'
             )
-
-            # Remove collisions
-            variants[f1] = {
-                name: seq
-                for name, seq in v1.items()
-                if seq not in collisions
-            }
-            variants[f2] = {
-                name: seq
-                for name, seq in v2.items()
-                if seq not in collisions
-            }
+            for feature_name, variant_name in variant_list:
+                if variant_name in variants[feature_name]:
+                    del variants[feature_name][variant_name]
 
     # Write FASTA
     with ngs.fasta.Fasta(out_path, 'w') as f: