Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ jobs:
- name: Setup python
uses: actions/setup-python@v1
with:
python-version: '3.9.22'
python-version: '3.10.19'
architecture: x64
- name: Install dependencies
run: pip install -r dev-requirements.txt
Expand All @@ -22,7 +22,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
python: [3.9.22, 3.10.17 ]
python: [3.10.19 ]
os: [ubuntu-20.04]
name: Test on Python ${{ matrix.python }}
steps:
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ instance/

# Mac stuff:
.DS_Store
*.swp

# Sphinx documentation
docs/_build/
Expand Down Expand Up @@ -114,4 +115,4 @@ venv.bak/
/.idea/

# Temp files
/scratch/
/scratch/
4 changes: 2 additions & 2 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
bumpversion==0.6.0
coverage==5.2.1
coverage==7.6.1
flake8==3.8.2
pytest==8.2.2
pytest-cov==5.0.0
Expand All @@ -8,5 +8,5 @@ sphinx>=3.3.1
sphinx-autoapi>=1.5.1
sphinx_rtd_theme>=0.5.0
twine>=2.0.0
wheel==0.38.1
wheel==0.46.2
yapf==0.30.0
Binary file modified kb_python/bins/linux/kallisto/kallisto
Binary file not shown.
Binary file modified kb_python/bins/linux/kallisto/kallisto_k64
Binary file not shown.
Binary file modified kb_python/bins/linux/kallisto/kallisto_optoff
Binary file not shown.
Binary file modified kb_python/bins/linux/kallisto/kallisto_optoff_k64
Binary file not shown.
Empty file modified kb_python/bins/linux/kallisto/license.txt
100755 → 100644
Empty file.
97 changes: 66 additions & 31 deletions kb_python/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,8 +658,12 @@ def bustools_whitelist(


def matrix_to_cellranger(
matrix_path: str, barcodes_path: str, genes_path: str, t2g_path: str,
out_dir: str, gzip: bool = False
matrix_path: str,
barcodes_path: str,
genes_path: str,
t2g_path: str,
out_dir: str,
gzip: bool = False
) -> Dict[str, str]:
"""Convert bustools count matrix to cellranger-format matrix.

Expand Down Expand Up @@ -1065,8 +1069,10 @@ def filter_with_bustools(
if cellranger:
if not tcc:
cr_result = matrix_to_cellranger(
count_result['mtx'], count_result['barcodes'],
count_result['genes'], t2g_path,
count_result['mtx'],
count_result['barcodes'],
count_result['genes'],
t2g_path,
os.path.join(counts_dir, CELLRANGER_DIR),
gzip=gzip
)
Expand Down Expand Up @@ -1290,7 +1296,7 @@ def count(
by_name: Aggregate counts by name instead of ID.
cellranger: Whether to convert the final count matrix into a
cellranger-compatible matrix, defaults to `False`
gzip: Whether to gzip compress cellranger output matrices,
gzip: Whether to gzip compress cellranger output matrices,
defaults to `False`
delete_bus: Whether to delete intermediate BUS files after successful count,
defaults to `False`
Expand Down Expand Up @@ -1649,8 +1655,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
final_result = quant_result if quant else count_result
if cellranger:
cr_result = matrix_to_cellranger(
count_result['mtx'], count_result['barcodes'],
count_result['genes'], t2g_path,
count_result['mtx'],
count_result['barcodes'],
count_result['genes'],
t2g_path,
os.path.join(counts_dir, f'{CELLRANGER_DIR}{suffix}'),
gzip=gzip
)
Expand Down Expand Up @@ -1760,24 +1768,26 @@ def update_results_with_suffix(current_results, new_results, suffix):
if delete_bus:
logger.info('Deleting intermediate BUS files to save disk space')
bus_files_to_delete = []

# Collect all .bus files from results
if 'bus' in unfiltered_results:
bus_files_to_delete.append(unfiltered_results['bus'])
if 'bus_scs' in unfiltered_results:
bus_files_to_delete.append(unfiltered_results['bus_scs'])

# For smartseq3, delete suffix versions too
for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
if f'bus{suffix}' in unfiltered_results:
bus_files_to_delete.append(unfiltered_results[f'bus{suffix}'])
if f'bus_scs{suffix}' in unfiltered_results:
bus_files_to_delete.append(unfiltered_results[f'bus_scs{suffix}'])

bus_files_to_delete.append(
unfiltered_results[f'bus_scs{suffix}']
)

# Delete filtered bus if exists
if 'filtered' in results and 'bus_scs' in results['filtered']:
bus_files_to_delete.append(results['filtered']['bus_scs'])

# Delete each BUS file
for bus_file in bus_files_to_delete:
if bus_file and os.path.exists(bus_file):
Expand Down Expand Up @@ -1875,7 +1885,7 @@ def count_nac(
by_name: Aggregate counts by name instead of ID.
cellranger: Whether to convert the final count matrix into a
cellranger-compatible matrix, defaults to `False`
gzip: Whether to gzip compress cellranger output matrices,
gzip: Whether to gzip compress cellranger output matrices,
defaults to `False`
cellranger_style: Whether to organize output in CellRanger-style directories
(spliced/ and unspliced/ subdirectories), defaults to `False`
Expand Down Expand Up @@ -2181,13 +2191,19 @@ def update_results_with_suffix(current_results, new_results, suffix):
elif i == 1: # unprocessed/unspliced
cr_dir = os.path.join(counts_dir, 'unspliced')
else: # ambiguous
cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')
cr_dir = os.path.join(
counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
)
else:
cr_dir = os.path.join(counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}')

cr_dir = os.path.join(
counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
)

cr_result = matrix_to_cellranger(
count_result[i]['mtx'], count_result[i]['barcodes'],
count_result[i]['genes'], t2g_path,
count_result[i]['mtx'],
count_result[i]['barcodes'],
count_result[i]['genes'],
t2g_path,
cr_dir,
gzip=gzip
)
Expand Down Expand Up @@ -2225,7 +2241,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
update_results_with_suffix(prefix_results, res, suffix)
if cellranger:
cr_result = matrix_to_cellranger(
res['mtx'], res['barcodes'], res['genes'], t2g_path,
res['mtx'],
res['barcodes'],
res['genes'],
t2g_path,
os.path.join(
counts_dir, f'{CELLRANGER_DIR}_{prefix}{suffix}'
),
Expand Down Expand Up @@ -2352,17 +2371,28 @@ def update_results_with_suffix(current_results, new_results, suffix):
if cellranger_style:
# Create spliced/unspliced subdirectories for CellRanger style
if i == 0: # processed/spliced
cr_dir = os.path.join(filtered_counts_dir, 'spliced')
cr_dir = os.path.join(
filtered_counts_dir, 'spliced'
)
elif i == 1: # unprocessed/unspliced
cr_dir = os.path.join(filtered_counts_dir, 'unspliced')
cr_dir = os.path.join(
filtered_counts_dir, 'unspliced'
)
else: # ambiguous
cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')
cr_dir = os.path.join(
filtered_counts_dir,
f'{CELLRANGER_DIR}_{prefix}'
)
else:
cr_dir = os.path.join(filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}')

cr_dir = os.path.join(
filtered_counts_dir, f'{CELLRANGER_DIR}_{prefix}'
)

cr_result = matrix_to_cellranger(
count_result[i]['mtx'], count_result[i]['barcodes'],
count_result[i]['genes'], t2g_path,
count_result[i]['mtx'],
count_result[i]['barcodes'],
count_result[i]['genes'],
t2g_path,
cr_dir,
gzip=gzip
)
Expand Down Expand Up @@ -2396,7 +2426,10 @@ def update_results_with_suffix(current_results, new_results, suffix):
filtered_results[prefix] = {}
if cellranger:
cr_result = matrix_to_cellranger(
res['mtx'], res['barcodes'], res['genes'], t2g_path,
res['mtx'],
res['barcodes'],
res['genes'],
t2g_path,
os.path.join(
filtered_counts_dir,
f'{CELLRANGER_DIR}_{prefix}'
Expand Down Expand Up @@ -2488,19 +2521,21 @@ def update_results_with_suffix(current_results, new_results, suffix):
if delete_bus:
logger.info('Deleting intermediate BUS files to save disk space')
bus_files_to_delete = []

# Collect all .bus files from results
prefixes = ['processed', 'unprocessed', 'ambiguous']
for prefix in prefixes:
if prefix in unfiltered_results:
for suffix in ['', INTERNAL_SUFFIX, UMI_SUFFIX]:
if f'bus{suffix}' in unfiltered_results[prefix]:
bus_files_to_delete.append(unfiltered_results[prefix][f'bus{suffix}'])

bus_files_to_delete.append(
unfiltered_results[prefix][f'bus{suffix}']
)

# Delete filtered bus files if they exist
if 'filtered' in results and 'bus_scs' in results['filtered']:
bus_files_to_delete.append(results['filtered']['bus_scs'])

# Delete each BUS file
for bus_file in bus_files_to_delete:
if bus_file and os.path.exists(bus_file):
Expand Down
15 changes: 10 additions & 5 deletions kb_python/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ def parse_count(
'Plots for TCC matrices have not yet been implemented. '
'The HTML report will not contain any plots.'
)
# Note: We are currently not supporting --genomebam

if args.genomebam:
parser.error('--genomebam is not currently supported')
if args.genomebam and not args.gtf:
Expand Down Expand Up @@ -591,11 +591,11 @@ def parse_count(
parser.error(
f'Option `--aa` cannot be used with workflow {args.workflow}.'
)

# Auto-enable gzip and cellranger-style when --cellranger is used
use_gzip = args.cellranger and not args.no_gzip or args.gzip
use_cellranger_style = args.cellranger

from .count import count_nac
count_nac(
args.i,
Expand Down Expand Up @@ -1462,7 +1462,10 @@ def setup_count_args(
)
parser_count.add_argument(
'--gzip',
help='Gzip compress output matrices (matrix.mtx.gz, barcodes.tsv.gz, genes.tsv.gz). Automatically enabled with --cellranger',
help=(
'Gzip compress output matrices (matrix.mtx.gz, barcodes.tsv.gz, genes.tsv.gz). '
'Automatically enabled with --cellranger. '
),
action='store_true'
)
parser_count.add_argument(
Expand All @@ -1472,7 +1475,9 @@ def setup_count_args(
)
parser_count.add_argument(
'--delete-bus',
help='Delete intermediate BUS files after successful count to save disk space',
help=(
'Delete intermediate BUS files after successful count to save disk space'
),
action='store_true'
)
parser_count.add_argument(
Expand Down
69 changes: 33 additions & 36 deletions kb_python/ref.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import glob
import itertools
import os
import tarfile
from collections import defaultdict
from typing import Callable, Dict, List, Optional, Tuple, Union

import ngs_tools as ngs
Expand Down Expand Up @@ -71,6 +71,10 @@ def generate_mismatches(name, sequence):
lengths = set()
features = {}
variants = {}

# Store all original sequences to check for collisions with variants
original_sequences = set()

# Generate all feature barcode variations before saving to check for collisions.
for i, row in df_features.iterrows():
# Check that the first column contains the sequence
Expand All @@ -83,6 +87,8 @@ def generate_mismatches(name, sequence):

lengths.add(len(row.sequence))
features[row['name']] = row.sequence
original_sequences.add(row.sequence)

variants[row['name']] = {
name: seq
for name, seq in generate_mismatches(row['name'], row.sequence)
Expand All @@ -103,45 +109,36 @@ def generate_mismatches(name, sequence):
','.join(str(l) for l in lengths) # noqa
)
)
# Find & remove collisions between barcode and variants
for feature in variants.keys():
_variants = variants[feature]
collisions = set(_variants.values()) & set(features.values())
if collisions:
# Remove collisions

# Invert variants: sequence -> list of (feature_name, variant_name)
seq_to_variants = defaultdict(list)
for feature_name, feature_variants in variants.items():
for variant_name, seq in feature_variants.items():
seq_to_variants[seq].append((feature_name, variant_name))

# Process collisions
for seq, variant_list in seq_to_variants.items():
# 1. Check collision with original barcodes
if seq in original_sequences:
logger.warning(
f'Colision detected between variants of feature barcode {feature} '
'and feature barcode(s). These variants will be removed.'
f'Collision detected between variants of feature barcode(s) {",".join(set(v[0] for v in variant_list))}'
f' and original feature barcode {seq}. These variants will be removed.'
)
variants[feature] = {
name: seq
for name, seq in _variants.items()
if seq not in collisions
}

# Find & remove collisions between variants
for f1, f2 in itertools.combinations(variants.keys(), 2):
v1 = variants[f1]
v2 = variants[f2]

collisions = set(v1.values()) & set(v2.values())
if collisions:
for feature_name, variant_name in variant_list:
if variant_name in variants[feature_name]:
del variants[feature_name][variant_name]
continue

# 2. Check collision between variants of DIFFERENT features
features_involved = set(v[0] for v in variant_list)
if len(features_involved) > 1:
logger.warning(
f'Collision(s) detected between variants of feature barcodes {f1} and {f2}: '
f'{",".join(collisions)}. These variants will be removed.'
f'Collision(s) detected between variants of feature barcodes {",".join(features_involved)}: '
f'{seq}. These variants will be removed.'
)

# Remove collisions
variants[f1] = {
name: seq
for name, seq in v1.items()
if seq not in collisions
}
variants[f2] = {
name: seq
for name, seq in v2.items()
if seq not in collisions
}
for feature_name, variant_name in variant_list:
if variant_name in variants[feature_name]:
del variants[feature_name][variant_name]

# Write FASTA
with ngs.fasta.Fasta(out_path, 'w') as f:
Expand Down
Loading