Hi there! I am running the below Snakemake workflow to bin some short read assemblies. I'm testing my workflow on two samples before I run it on my entire sample set, but I am encountering the error described in the title for one of those samples. Below, I've written out the workflow, in addition to the error. I haven't seen this issue brought up so far, so I'm hoping to get some feedback on where I could be producing the corrupted file in question. I'm happy to provide the preceding steps in the workflow if it would be of any aid. Thank you!
rule semibin2_generate_concatenated_db_spades:
"""
Generate concatenated FASTA file necessary for SembiBin's multi-sample binning pipeline
"""
input:
contigs = expand("results/{genera}/3_dedup_contigs/SPAdes/individual_metagenome_assembly/{sample}/{sample}_DEDUP95.fasta", genera=config["genera"], sample=SAMPLES)
output:
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db/concatenated.fa"
params:
outdir = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db",
threads = 4
log:
stdout = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db/concatenate_fa.out",
stderr = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db/concatenate_fa.err"
shell:
"""
module unload miniconda
source activate /vast/palmer/pi/turner/flg9/conda_envs/semibin
SemiBin2 concatenate_fasta \
--input-fasta {input.contigs} \
--output {params.outdir} --compression=none
"""
rule sembin2_align_to_concatenated_db_spades:
"""
Align reads from each sample to our concatenated FASTA db, necessary for SemiBin pipeline
"""
input:
db = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db/concatenated.fa",
r1 = "results/{genera}/1_pre_processing/dedup_reads/{sample}/{sample}_host_removed_dedup_R1.fastq",
r2 = "results/{genera}/1_pre_processing/dedup_reads/{sample}/{sample}_host_removed_dedup_R2.fastq"
output:
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.1.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.2.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.3.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.4.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.rev.1.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}/{sample}_indexed_contig.rev.2.bt2",
"results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}_aligned_sorted.bam"
params:
genera=config["genera"],
outdir = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}"
log:
stdout = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}_aln.out",
stderr = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}_aln.err"
shell:
"""
module unload miniconda
module load Bowtie2/2.5.1-GCC-12.2.0
module load SAMtools/1.21-GCC-12.2.0
# 1. Build db index
bowtie2-build \
-f {input.db} {params.outdir}/{wildcards.sample}_indexed_contig \
1>> {log.stdout} 2>> {log.stderr}
# 2. Align reads back to SemiBin db index
bowtie2 \
-x {params.outdir}/{wildcards.sample}_indexed_contig -1 {input.r1} -2 {input.r2} | samtools view -b -F 4 -F 2048 | samtools sort -o {output[6]} \
1>> {log.stdout} 2>> {log.stderr}
"""
rule semibin2_features_and_model_spades:
"""
Generate sequence features and train model for SemiBin2
"""
input:
cat_fa = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/generate_concatenated_db/concatenated.fa",
bams = expand("results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/align_to_concatenated_db/{sample}_aligned_sorted.bam", genera=config["genera"], sample=SAMPLES)
output:
split = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/data_split.csv",
csv = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/data.csv"
params:
outdir = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model",
outdir2 = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}",
threads = 4
log:
stdout = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/sequence_features.out",
stderr = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/sequence_features.err"
shell:
"""
module unload miniconda
source activate /vast/palmer/pi/turner/flg9/conda_envs/semibin
# Generate sequence features data.csv & data_split.csv files
SemiBin2 generate_sequence_features_multi \
-i {input.cat_fa} \
-b {input.bams} \
-o {params.outdir} \
-t {params.threads} \
1>> {log.stdout} 2>> {log.stderr}
mv {params.outdir}/samples/{wildcards.sample}_DEDUP95/data.csv {params.outdir2}
mv {params.outdir}/samples/{wildcards.sample}_DEDUP95/data_split.csv {params.outdir2}
"""
rule semibin2_train_model_spades:
"""
Train ML model on previously curated SemiBin2 feature data
"""
input:
split = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/data_split.csv",
csv = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/data.csv"
output:
model = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/train_model/{sample}/model.pt"
params:
outdir = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/train_model/{sample}",
threads = 4
log:
stdout = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/train_model/{sample}/ML_train.out",
stderr = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/train_model/{sample}/ML_train.err"
shell:
"""
module unload miniconda
source activate /vast/palmer/pi/turner/flg9/conda_envs/semibin
# Train model
SemiBin2 train_self \
--data {input.csv} \
--data-split {input.split} \
-o {params.outdir} \
-t {params.threads} \
1>> {log.stdout} 2>> {log.stderr}
"""
checkpoint semibin2_bin_spades:
"""
Bin contigs using SemiBin's multi-sample binning model for individual binning of samples.
This method often returns the most complete bins and is most optimized for complex samples.
"""
input:
contigs = "results/{genera}/3_dedup_contigs/SPAdes/individual_metagenome_assembly/{sample}/{sample}_DEDUP95.fasta",
csv = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/features_and_model/{sample}/data.csv",
model = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/train_model/{sample}/model.pt"
output:
outdir = directory("results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}/output_bins"),
check = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}/contig_bins.tsv"
params:
base = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}",
outdir = "results/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}/output_bins",
minlen = "1500",
threads = 4
log:
stdout = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}/bin.out",
stderr = "logs/{genera}/6_binning/semibin2/SPAdes_individual_assembly/binning/{sample}/bin.err"
shell:
"""
module unload miniconda
source activate /vast/palmer/pi/turner/flg9/conda_envs/semibin
# Bin
SemiBin2 bin_short \
-i {input.contigs} \
--model {input.model} \
--data {input.csv} \
-o {params.base} \
-t {params.threads} \
--min-len={params.minlen} \
1>> {log.stdout} 2>> {log.stderr}
"""
def get_semibin2_bin_spades(wc):
    """Return the sorted list of gzipped bin FASTAs produced by the
    semibin2_bin_spades checkpoint for the given wildcards."""
    import glob
    import os

    checkpoint_job = checkpoints.semibin2_bin_spades.get(
        genera=wc.genera, sample=wc.sample
    )
    pattern = os.path.join(checkpoint_job.output.outdir, "*.fa.gz")
    return sorted(glob.glob(pattern))
2025-10-09 13:20:22 login2.mccleary.ycrc.yale.edu SemiBin2[1617592] INFO Running SemiBin2 version 2.2.0
2025-10-09 13:20:24 login2.mccleary.ycrc.yale.edu SemiBin2[1617592] WARNING Did not detect GPU or CUDA was not installed/supported, using CPU.
2025-10-09 13:20:24 login2.mccleary.ycrc.yale.edu SemiBin2[1617592] INFO Start binning.
multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
~~~~^^^^^^^^^^^^^^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/cluster.py", line 14, in run_infomap1
return g.community_infomap(edge_weights=edge_weights, vertex_weights=vertex_weights, trials=trials)
~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/igraph/community.py", line 56, in _community_infomap
membership, codelength = GraphBase.community_infomap(
~~~~~~~~~~~~~~~~~~~~~~~~~~~^
graph, edge_weights, vertex_weights, trials
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
)
^
igraph._igraph.InternalError: Error at src/community/infomap/infomap.cc:263: Edge weights must not be NaN values. -- Invalid value
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/bin/SemiBin2", line 10, in <module>
sys.exit(main2())
~~~~~^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/main.py", line 1605, in main2
binning_short(logger, args.data, args.minfasta_kb * 1000, binned_length,
~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
environment=args.environment, contig_dict=contig_dict,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
model_path=args.model_path, output=args.output,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
device=device, args=args)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/main.py", line 1222, in binning_short
cluster(
~~~~~~~^
logger,
^^^^^^^
...<8 lines>...
binned_length=binned_length,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
minfasta=minfasta)
^^^^^^^^^^^^^^^^^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/cluster.py", line 289, in cluster
embedding, contig_labels = run_embed_infomap(logger, model, data,
~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^
device=device, max_edges=args.max_edges, max_node=args.max_node,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
is_combined=is_combined, n_sample=n_sample,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
contig_dict=contig_dict, num_process=args.num_process,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
random_seed=args.random_seed)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/cluster.py", line 182, in run_embed_infomap
result = run_infomap(g,
edge_weights=edge_weights,
vertex_weights=length_weight,
num_process=num_process)
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/site-packages/SemiBin/cluster.py", line 26, in run_infomap
rs = [r.get() for r in rs]
~~~~~^^
File "/vast/palmer/pi/turner/flg9/conda_envs/semibin/lib/python3.13/multiprocessing/pool.py", line 774, in get
raise self._value
igraph._igraph.InternalError: Error at src/community/infomap/infomap.cc:263: Edge weights must not be NaN values. -- Invalid value
Hi there! I am running the below Snakemake workflow to bin some short read assemblies. I'm testing my workflow on two samples before I run it on my entire sample set, but I am encountering the error described in the title for one of those samples. Below, I've written out the workflow, in addition to the error. I haven't seen this issue brought up so far, so I'm hoping to get some feedback on where I could be producing the corrupted file in question. I'm happy to provide the preceding steps in the workflow if it would be of any aid. Thank you!
Workflow -
Error -