From 4bc3f20385100d7aed98831941adb69362de5140 Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Sun, 22 Sep 2024 21:26:54 +0000 Subject: [PATCH 1/5] switch to subprocess.run in run.py --- run.py | 270 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 187 insertions(+), 83 deletions(-) diff --git a/run.py b/run.py index 688d8cf..b6673fb 100644 --- a/run.py +++ b/run.py @@ -1,15 +1,13 @@ -import os +from pathlib import Path import argparse +import csv +import os +import shutil +import subprocess import sys import time -import subprocess -from subprocess import Popen, PIPE -def cmd_exists(cmd): - return subprocess.call("type " + cmd, shell=True, - stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0 - def main(): cwd=os.path.dirname(os.path.abspath(__file__)) @@ -36,11 +34,11 @@ def main(): # if first != 1: # print(time.strftime("%c")+': Networkx should be 1.11 or earlier.. Terminating...\n', file=sys.stderr) # sys.exit(1) - if not cmd_exists('samtools'): + if not shutil.which('samtools'): print(time.strftime("%c")+': Samtools does not exist in PATH. Terminating....\n', file=sys.stderr) sys.exit(1) - if not cmd_exists('bamToBed'): + if not shutil.which('bamToBed'): print(time.strftime("%c")+': Bedtools does not exist in PATH. Terminating....\n', file=sys.stderr) sys.exit(1) @@ -49,23 +47,36 @@ def main(): print(time.strftime("%c")+':Starting scaffolding..', file=sys.stderr) - if os.path.exists(args.dir+'/alignment.bed') == False: + alignment_bed = Path(args.dir) / "alignment.bed" + if not alignment_bed.exists(): print("converting bam file to bed file", file=sys.stderr) - #os.system('bamToBed -i ' + args.mapping + " > " + args.dir+'/alignment.bed') try: - p = subprocess.check_output('bamToBed -i ' + args.mapping + " > " + args.dir+'/alignment.bed', shell=True) + with open(alignment_bed, "w") as alignment_bed_f: + subprocess.run([ + "bamToBed", + "-i", + args.mapping, + ], stdout=alignment_bed_f, check=True) print('finished conversion', file=sys.stderr) except subprocess.CalledProcessError as err: - os.system("rm " + args.dir+'/alignment.bed') + alignment_bed.unlink() print(time.strftime("%c")+': Failed in coverting bam file to bed format, terminating scaffolding....\n' + str(err.output), file=sys.stderr) sys.exit(1) try: - #os.system('samtools faidx '+args.assembly) - p = subprocess.check_output('samtools faidx '+args.assembly,shell=True) + subprocess.run([ + "samtools", + "faidx", + args.assembly, + ], check=True) except subprocess.CalledProcessError as err: print(str(err.output), file=sys.stderr) - sys.exit() - os.system('cut -f 1,2 '+ args.assembly+'.fai > '+args.dir+'/contig_length') + sys.exit(1) + + with open(args.assembly+'.fai', "r", newline='') as f_in, open(args.dir+'/contig_length', "w", newline='') as f_out: + reader = csv.DictReader(f_in, fieldnames=["name", "length", "offset", "linebases", "linewidth"], delimiter="\t") + writer = csv.DictWriter(f_out, fieldnames=["name", "length"], delimiter="\t") + for row in reader: + writer.writerow({"name": row["name"], "length": row["length"]}) print(time.strftime("%c")+':Finished conversion', file=sys.stderr) @@ -73,115 +84,208 @@ def main(): final_mapping = args.mapping print(time.strftime("%c") + ':Started generating links between contigs', file=sys.stderr) + contig_links = Path(args.dir) / "contig_links" + contig_length = Path(args.dir) / "contig_length" + contig_coverage = Path(args.dir) / "contig_coverage" if os.path.exists(args.dir+'/contig_links') == False: #print './libcorrect -l' + args.lib + ' -a' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links' try: - #os.system('./libcorrect -l ' + args.lib + ' -a ' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links -x '+args.dir+'/contig_coverage') - p = subprocess.check_output(cwd+'/libcorrect -a ' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links -x '+args.dir+'/contig_coverage -c '+str(args.length),shell=True) - print(time.strftime("%c") +':Finished generating links between contigs', file=sys.stderr) + subprocess.run([ + cwd+"/libcorrect", + "-a", + args.dir+"/alignment.bed", + "-d", + str(contig_length), + "-o", + str(contig_links), + "-x", + str(contig_coverage), + "-c", + str(args.length), + ], check=True) + print(time.strftime("%c") +':Finished generating links between contigs', file=sys.stderr) except subprocess.CalledProcessError as err: - os.system('rm '+args.dir+'/contig_links') - print(time.strftime("%c")+': Failed in generate links from bed file, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + contig_links.unlink() + print(time.strftime("%c")+': Failed in generate links from bed file, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + sys.exit(1) print(time.strftime("%c")+':Started bulding of links between contigs', file=sys.stderr) + bundled_links = Path(args.dir) / "bundled_links" + bundled_graph = Path(args.dir) / "bundled_graph.gml" if os.path.exists(args.dir+'/bundled_links') == False: try: - #os.system('./bundler -l '+ args.dir+'/contig_links -o ' + args.dir+'/bundled_links + -b '+args.dir+'/bundled_graph.gml') - p = subprocess.check_output(cwd+'/bundler -l '+ args.dir+'/contig_links -o ' + args.dir+'/bundled_links + -b '+args.dir+'/bundled_graph.gml -c '+str(args.bsize), shell=True) + subprocess.run([ + cwd+"/bundler", + "-l", + str(contig_links), + "-o", + str(bundled_links), + "-b", + str(bundled_graph), + "-c", + str(args.bsize), + ], check=True) print(time.strftime("%c")+':Finished bundling of links between contigs', file=sys.stderr) except subprocess.CalledProcessError as err: - os.system('rm '+args.dir+'/bundled_links') - os.system('rm '+args.dir+'/bundled_graph.gml') + bundled_links.unlink() + bundled_graph.unlink() print(time.strftime("%c")+': Failed to bundle links, terminating scaffolding....\n' + str(err.output), file=sys.stderr) sys.exit(1) + invalidated_counts = Path(args.dir) / "invalidated_counts" + high_centrality = Path(args.dir) / "high_centrality.txt" + bundled_links_filtered = Path(args.dir) / "bundled_links_filtered" + oriented_gml = Path(args.dir) / "oriented.gml" + oriented_links = Path(args.dir) / "oriented_links" + repeats = Path(args.dir) / "repeats" if args.repeats == "true": print(time.strftime("%c")+':Started finding and removing repeats', file=sys.stderr) try: - p = subprocess.check_output(cwd+'/orientcontigs -l '+args.dir+'/bundled_links -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links -i '+args.dir+'/invalidated_counts',shell=True) - + subprocess.run([ + cwd+"/orientcontigs", + "-l", + str(bundled_links), + "-c", + str(contig_length), + "--bsize", + "-o", + str(oriented_gml), + "-p", + str(oriented_links), + "-i", + str(invalidated_counts), + ], check=True) except subprocess.CalledProcessError as err: print(time.strftime("%c") + ': Failed to find repeats, terminating scaffolding...\n' + str(err.output), file=sys.stderr) try: - p = subprocess.check_output('python '+cwd+'/centrality.py -g '+args.dir+'/bundled_links -l ' + args.dir+ '/contig_length -o '+args.dir+'/high_centrality.txt' ,shell=True) + subprocess.run([ + "python", + cwd+"/centrality.py", + "-g", + str(bundled_links), + "-l", + str(contig_length), + "-o", + str(high_centrality), + ], check=True) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + sys.exit(1) try: - p = subprocess.check_output('python '+cwd+'/repeat_filter.py '+args.dir+'/contig_coverage ' + args.dir+ '/bundled_links ' + args.dir+'//invalidated_counts ' + args.dir+'/high_centrality.txt ' + args.dir+ '/contig_length '+ args.dir+'/repeats > ' + args.dir+'//bundled_links_filtered',shell=True) + with open(bundled_links_filtered, "w") as bundled_links_filtered_f: + subprocess.run([ + "python", + cwd+"/repeat_filter.py", + str(contig_coverage), + str(bundled_links), + str(invalidated_counts), + str(high_centrality), + str(contig_length), + str(repeats), + ], stdout=bundled_links_filtered_f, check=True) except subprocess.CalledProcessError as err: print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) sys.exit(1) print(time.strftime("%c")+':Finished repeat finding and removal', file=sys.stderr) else: - os.system('mv '+args.dir+'/bundled_links ' + args.dir+'/bundled_links_filtered') + bundled_links.rename(bundled_links_filtered) print(time.strftime("%c")+':Started orienting the contigs', file=sys.stderr) - # if os.path.exists(args.dir+'/oriented_links') == False: - #os.system('./orientcontigs -l '+args.dir+'/bundled_links_filtered -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links' ) try: - p = subprocess.check_output(cwd+'/orientcontigs -l '+args.dir+'/bundled_links_filtered -c '+ args.dir+'/contig_length --bsize -o ' +args.dir+'/oriented.gml -p ' + args.dir+'/oriented_links -i '+args.dir+'/invalidated_counts',shell=True) - print(time.strftime("%c")+':Finished orienting the contigs', file=sys.stderr) + subprocess.run([ + cwd+"/orientcontigs", + "-l", + str(bundled_links_filtered), + "-c", + str(contig_length), + "--bsize", + "-o", + str(oriented_gml), + "-p", + str(oriented_links), + "-i", + str(invalidated_counts), + ], check=True) + print(time.strftime("%c")+':Finished orienting the contigs', file=sys.stderr) except subprocess.CalledProcessError: - print(time.strftime("%c")+': Failed to Orient contigs, terminating scaffolding....', file=sys.stderr) + print(time.strftime("%c")+': Failed to Orient contigs, terminating scaffolding....', file=sys.stderr) print(time.strftime("%c")+':Started finding separation pairs', file=sys.stderr) - #if os.path.exists(args.dir+'/seppairs') == False: - #os.system('./spqr -l ' + args.dir+'/oriented_links -o ' + args.dir+'/seppairs') + seppairs = Path(args.dir) / "seppairs" try: - p = subprocess.check_output(cwd+'/spqr -l ' + args.dir+'/oriented_links -o ' + args.dir+'/seppairs',shell=True) - print(time.strftime("%c")+':Finished finding spearation pairs', file=sys.stderr) + subprocess.run([ + cwd+"/spqr", + "-l", + str(oriented_links), + "-o", + str(seppairs), + ], check=True) + print(time.strftime("%c")+':Finished finding spearation pairs', file=sys.stderr) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to decompose graph, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + print(time.strftime("%c")+': Failed to decompose graph, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + sys.exit(1) print(time.strftime("%c")+':Finding the layout of contigs', file=sys.stderr) + bubbles = Path(args.dir) / "bubbles.txt" if os.path.exists(args.dir+'/scaffolds.fasta') == False: - try: - p = subprocess.check_output('python '+cwd+'/layout.py -a '+ args.assembly +' -b '+args.dir+'/bubbles.txt' +' -g ' + args.dir+'/oriented.gml -s '+args.dir+'/seppairs -o '+args.dir+'/scaffolds.fa -f '+args.dir+'/scaffolds.agp -e '+args.dir+'/scaffold_graph.gfa',shell=True) - print(time.strftime("%c")+':Final scaffolds written, Done!', file=sys.stderr) - except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to generate scaffold sequences, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + try: + subprocess.run([ + "python", + cwd+"/layout.py", + "-a", + str(args.assembly), + "-b", + str(bubbles), + "-g", + str(oriented_gml), + "-s", + str(seppairs), + "-o", + args.dir+"/scaffolds.fa", + "-f", + args.dir+"/scaffolds.agp", + "-e", + args.dir+"/scaffold_graph.gfa", + ], check=True) + print(time.strftime("%c")+':Final scaffolds written, Done!', file=sys.stderr) + except subprocess.CalledProcessError as err: + print(time.strftime("%c")+': Failed to generate scaffold sequences, terminating scaffolding....\n' + str(err.output), file=sys.stderr) if args.visualization == "true": - #try: - graphpath = os.path.abspath(args.dir+'/oriented.gml') - bubblepath = os.path.abspath(args.dir+'/bubbles.txt') # Output the MetagenomeScope .db file directly to args.dir. The only file # created by collate.py here is the mgsc.db file. - os.system('python '+cwd+'/MetagenomeScope/graph_collator/collate.py -i ' - + graphpath + ' -w -ub ' + bubblepath + ' -ubl -d ' + args.dir - + ' -o mgsc') - #p = subprocess.check_output('python '+cwd+'/MetagenomeScope/graph_collator/collate.py -i ' + graphpath + ' -w -ub ' + bubblepath + ' -ubl -d ' + args.dir + ' -o mgsc') - #except subprocess.CalledProcessError as err: - #print >> sys.stderr, time.strftime("%c")+": Failed to run MetagenomeScope \n" + str(err.output) + subprocess.run([ + "python", + cwd+"/MetagenomeScope/graph_collator/collate.py", + "-i", + str(oriented_gml), + "-w", + "-ub", + str(bubbles), + "-ubl", + "-d", + args.dir, + "-o", + "mgsc", + ], check=True) if not args.keep == "true": - if os.path.exists(args.dir+'/contig_length'): - os.system("rm "+args.dir+'/contig_length') - if os.path.exists(args.dir+'/contig_links'): - os.system("rm "+args.dir+'/contig_links') - if os.path.exists(args.dir+'/contig_coverage'): - os.system("rm "+args.dir+'/contig_coverage') - if os.path.exists(args.dir+'/bundled_links'): - os.system("rm "+args.dir+'/bundled_links') - if os.path.exists(args.dir+'/bundled_links_filtered'): - os.system("rm "+args.dir+'/bundled_links_filtered') - if os.path.exists(args.dir+'/bundled_graph.gml'): - os.system("rm "+args.dir+'/bundled_graph.gml') - if os.path.exists(args.dir+'/invalidated_counts'): - os.system("rm "+args.dir+'/invalidated_counts') - if os.path.exists(args.dir+'/repeats'): - os.system("rm "+args.dir+'/repeats') - if os.path.exists(args.dir+'/oriented_links'): - os.system("rm "+args.dir+'/oriented_links') - if os.path.exists(args.dir+'/oriented.gml'): - os.system("rm "+args.dir+'/oriented.gml') - if os.path.exists(args.dir+'/seppairs'): - os.system("rm "+args.dir+'/seppairs') - if os.path.exists(args.dir+'/alignment.bed'): - os.system("rm "+args.dir+'/alignment.bed') + for gen_f in [ + contig_length, + contig_links, + contig_coverage, + bundled_links, + bundled_links_filtered, + bundled_graph, + invalidated_counts, + repeats, + oriented_links, + oriented_gml, + seppairs, + alignment_bed, + ]: + gen_f.unlink(missing_ok=True) + if __name__ == '__main__': main() From 9094b9d70352e8c0ee1de418154ccb2eeb4014ca Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Sun, 22 Sep 2024 21:41:45 +0000 Subject: [PATCH 2/5] move cwd to pathlib --- run.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/run.py b/run.py index b6673fb..2727203 100644 --- a/run.py +++ b/run.py @@ -9,7 +9,7 @@ def main(): - cwd=os.path.dirname(os.path.abspath(__file__)) + cwd=Path(__file__).resolve().parent.absolute() parser = argparse.ArgumentParser(description="MetaCarvel: A scaffolding tool for metagenomic assemblies") parser.add_argument("-a","--assembly",help="assembled contigs",required=True) @@ -91,7 +91,7 @@ def main(): #print './libcorrect -l' + args.lib + ' -a' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links' try: subprocess.run([ - cwd+"/libcorrect", + str(cwd / "libcorrect"), "-a", args.dir+"/alignment.bed", "-d", @@ -115,7 +115,7 @@ def main(): if os.path.exists(args.dir+'/bundled_links') == False: try: subprocess.run([ - cwd+"/bundler", + str(cwd / "bundler"), "-l", str(contig_links), "-o", @@ -142,7 +142,7 @@ def main(): print(time.strftime("%c")+':Started finding and removing repeats', file=sys.stderr) try: subprocess.run([ - cwd+"/orientcontigs", + str(cwd / "orientcontigs"), "-l", str(bundled_links), "-c", @@ -161,7 +161,7 @@ def main(): try: subprocess.run([ "python", - cwd+"/centrality.py", + str(cwd / "centrality.py"), "-g", str(bundled_links), "-l", @@ -177,7 +177,7 @@ def main(): with open(bundled_links_filtered, "w") as bundled_links_filtered_f: subprocess.run([ "python", - cwd+"/repeat_filter.py", + str(cwd / "repeat_filter.py"), str(contig_coverage), str(bundled_links), str(invalidated_counts), @@ -194,7 +194,7 @@ def main(): print(time.strftime("%c")+':Started orienting the contigs', file=sys.stderr) try: subprocess.run([ - cwd+"/orientcontigs", + str(cwd / "orientcontigs"), "-l", str(bundled_links_filtered), "-c", @@ -215,7 +215,7 @@ def main(): seppairs = Path(args.dir) / "seppairs" try: subprocess.run([ - cwd+"/spqr", + str(cwd / "spqr"), "-l", str(oriented_links), "-o", @@ -232,7 +232,7 @@ def main(): try: subprocess.run([ "python", - cwd+"/layout.py", + str(cwd / "layout.py"), "-a", str(args.assembly), "-b", @@ -257,7 +257,7 @@ def main(): # created by collate.py here is the mgsc.db file. subprocess.run([ "python", - cwd+"/MetagenomeScope/graph_collator/collate.py", + str(cwd / "MetagenomeScope/graph_collator/collate.py"), "-i", str(oriented_gml), "-w", From da28a86432ac0e08bc8b902ea49c98d4032b6939 Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Sun, 22 Sep 2024 21:58:15 +0000 Subject: [PATCH 3/5] use path and bool types for args --- run.py | 81 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/run.py b/run.py index 2727203..6a70243 100644 --- a/run.py +++ b/run.py @@ -12,14 +12,14 @@ def main(): cwd=Path(__file__).resolve().parent.absolute() parser = argparse.ArgumentParser(description="MetaCarvel: A scaffolding tool for metagenomic assemblies") - parser.add_argument("-a","--assembly",help="assembled contigs",required=True) - parser.add_argument("-m","--mapping", help="mapping of read to contigs in bam format",required=True) - parser.add_argument("-d","--dir",help="output directory for results",default='out',required=True) - parser.add_argument("-r",'--repeats',help="To turn repeat detection on",default="true") - parser.add_argument("-k","--keep", help="Set this to keep temporary files in output directory",default=False) - parser.add_argument("-l","--length",help="Minimum length of contigs to consider for scaffolding in base pairs (bp)",default=500) - parser.add_argument("-b","--bsize",help="Minimum mate pair support between contigs to consider for scaffolding",default=3) - parser.add_argument("-v",'--visualization',help="Generate a .db file for the MetagenomeScope visualization tool",default=False) + parser.add_argument("-a","--assembly",type=Path,help="assembled contigs",required=True) + parser.add_argument("-m","--mapping", type=Path, help="mapping of read to contigs in bam format",required=True) + parser.add_argument("-d","--dir",type=Path,help="output directory for results",default='out',required=True) + parser.add_argument("-r",'--repeats',action=argparse.BooleanOptionalAction,help="To turn repeat detection on", default=True) + parser.add_argument("-k","--keep", action=argparse.BooleanOptionalAction, help="Set this to keep temporary files in output directory",default=False) + parser.add_argument("-l","--length", type=int, help="Minimum length of contigs to consider for scaffolding in base pairs (bp)",default=500) + parser.add_argument("-b","--bsize", type=int, help="Minimum mate pair support between contigs to consider for scaffolding",default=3) + parser.add_argument("-v",'--visualization', action=argparse.BooleanOptionalAction, help="Generate a .db file for the MetagenomeScope visualization tool", default=False) args = parser.parse_args() try: @@ -42,12 +42,11 @@ def main(): print(time.strftime("%c")+': Bedtools does not exist in PATH. Terminating....\n', file=sys.stderr) sys.exit(1) - if not os.path.exists(args.dir): - os.makedirs(args.dir) + args.dir.mkdir(parents=True, exist_ok=True) print(time.strftime("%c")+':Starting scaffolding..', file=sys.stderr) - alignment_bed = Path(args.dir) / "alignment.bed" + alignment_bed = args.dir / "alignment.bed" if not alignment_bed.exists(): print("converting bam file to bed file", file=sys.stderr) try: @@ -55,24 +54,29 @@ def main(): subprocess.run([ "bamToBed", "-i", - args.mapping, + str(args.mapping), ], stdout=alignment_bed_f, check=True) print('finished conversion', file=sys.stderr) except subprocess.CalledProcessError as err: alignment_bed.unlink() print(time.strftime("%c")+': Failed in coverting bam file to bed format, terminating scaffolding....\n' + str(err.output), file=sys.stderr) sys.exit(1) + + assembly_idx = args.assembly.parent / (args.assembly.name + '.fai') try: subprocess.run([ "samtools", "faidx", - args.assembly, + str(args.assembly), + "--fai-idx", + str(assembly_idx), ], check=True) except subprocess.CalledProcessError as err: print(str(err.output), file=sys.stderr) sys.exit(1) - with open(args.assembly+'.fai', "r", newline='') as f_in, open(args.dir+'/contig_length', "w", newline='') as f_out: + contig_length = args.dir / "contig_length" + with open(assembly_idx, "r", newline='') as f_in, open(contig_length, "w", newline='') as f_out: reader = csv.DictReader(f_in, fieldnames=["name", "length", "offset", "linebases", "linewidth"], delimiter="\t") writer = csv.DictWriter(f_out, fieldnames=["name", "length"], delimiter="\t") for row in reader: @@ -84,16 +88,14 @@ def main(): final_mapping = args.mapping print(time.strftime("%c") + ':Started generating links between contigs', file=sys.stderr) - contig_links = Path(args.dir) / "contig_links" - contig_length = Path(args.dir) / "contig_length" - contig_coverage = Path(args.dir) / "contig_coverage" - if os.path.exists(args.dir+'/contig_links') == False: - #print './libcorrect -l' + args.lib + ' -a' + args.dir+'/alignment.bed -d ' +args.dir+'/contig_length -o '+ args.dir+'/contig_links' + contig_links = args.dir / "contig_links" + contig_coverage = args.dir / "contig_coverage" + if not contig_links.exists(): try: subprocess.run([ str(cwd / "libcorrect"), "-a", - args.dir+"/alignment.bed", + str(alignment_bed), "-d", str(contig_length), "-o", @@ -110,9 +112,9 @@ def main(): sys.exit(1) print(time.strftime("%c")+':Started bulding of links between contigs', file=sys.stderr) - bundled_links = Path(args.dir) / "bundled_links" - bundled_graph = Path(args.dir) / "bundled_graph.gml" - if os.path.exists(args.dir+'/bundled_links') == False: + bundled_links = args.dir / "bundled_links" + bundled_graph = args.dir / "bundled_graph.gml" + if not bundled_links.exists(): try: subprocess.run([ str(cwd / "bundler"), @@ -132,13 +134,13 @@ def main(): print(time.strftime("%c")+': Failed to bundle links, terminating scaffolding....\n' + str(err.output), file=sys.stderr) sys.exit(1) - invalidated_counts = Path(args.dir) / "invalidated_counts" - high_centrality = Path(args.dir) / "high_centrality.txt" - bundled_links_filtered = Path(args.dir) / "bundled_links_filtered" - oriented_gml = Path(args.dir) / "oriented.gml" - oriented_links = Path(args.dir) / "oriented_links" - repeats = Path(args.dir) / "repeats" - if args.repeats == "true": + invalidated_counts = args.dir / "invalidated_counts" + high_centrality = args.dir / "high_centrality.txt" + bundled_links_filtered = args.dir / "bundled_links_filtered" + oriented_gml = args.dir / "oriented.gml" + oriented_links = args.dir / "oriented_links" + repeats = args.dir / "repeats" + if args.repeats: print(time.strftime("%c")+':Started finding and removing repeats', file=sys.stderr) try: subprocess.run([ @@ -212,7 +214,7 @@ def main(): print(time.strftime("%c")+': Failed to Orient contigs, terminating scaffolding....', file=sys.stderr) print(time.strftime("%c")+':Started finding separation pairs', file=sys.stderr) - seppairs = Path(args.dir) / "seppairs" + seppairs = args.dir / "seppairs" try: subprocess.run([ str(cwd / "spqr"), @@ -227,8 +229,9 @@ def main(): sys.exit(1) print(time.strftime("%c")+':Finding the layout of contigs', file=sys.stderr) - bubbles = Path(args.dir) / "bubbles.txt" - if os.path.exists(args.dir+'/scaffolds.fasta') == False: + bubbles = args.dir / "bubbles.txt" + scaffolds_fasta = args.dir / "scaffolds.fa" + if not scaffolds_fasta.exists(): try: subprocess.run([ "python", @@ -242,17 +245,17 @@ def main(): "-s", str(seppairs), "-o", - args.dir+"/scaffolds.fa", + str(scaffolds_fasta), "-f", - args.dir+"/scaffolds.agp", + str(args.dir / "scaffolds.agp"), "-e", - args.dir+"/scaffold_graph.gfa", + str(args.dir / "scaffold_graph.gfa"), ], check=True) print(time.strftime("%c")+':Final scaffolds written, Done!', file=sys.stderr) except subprocess.CalledProcessError as err: print(time.strftime("%c")+': Failed to generate scaffold sequences, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - if args.visualization == "true": + if args.visualization: # Output the MetagenomeScope .db file directly to args.dir. The only file # created by collate.py here is the mgsc.db file. subprocess.run([ @@ -265,12 +268,12 @@ def main(): str(bubbles), "-ubl", "-d", - args.dir, + str(args.dir), "-o", "mgsc", ], check=True) - if not args.keep == "true": + if not args.keep: for gen_f in [ contig_length, contig_links, From 2d649334271d1b6d6b928e24218afde0903e6f02 Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Sun, 22 Sep 2024 21:59:29 +0000 Subject: [PATCH 4/5] remove unnecessary comment --- run.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/run.py b/run.py index 6a70243..f54f179 100644 --- a/run.py +++ b/run.py @@ -31,9 +31,6 @@ def main(): version_id = int(version.split('.')[1]) first = int(version.split('.')[0]) print('Networkx ' + version + ' found', file=sys.stderr) - # if first != 1: - # print(time.strftime("%c")+': Networkx should be 1.11 or earlier.. Terminating...\n', file=sys.stderr) - # sys.exit(1) if not shutil.which('samtools'): print(time.strftime("%c")+': Samtools does not exist in PATH. Terminating....\n', file=sys.stderr) sys.exit(1) From 7b9a7b129aa9b6ba2a25118b09cb7552a57ee9fb Mon Sep 17 00:00:00 2001 From: Josh Chorlton Date: Sun, 22 Sep 2024 22:11:18 +0000 Subject: [PATCH 5/5] run ruff and get passing --- run.py | 586 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 372 insertions(+), 214 deletions(-) diff --git a/run.py b/run.py index f54f179..56cfe33 100644 --- a/run.py +++ b/run.py @@ -1,7 +1,6 @@ from pathlib import Path import argparse import csv -import os import shutil import subprocess import sys @@ -9,127 +8,224 @@ def main(): - cwd=Path(__file__).resolve().parent.absolute() + cwd = Path(__file__).resolve().parent.absolute() - parser = argparse.ArgumentParser(description="MetaCarvel: A scaffolding tool for metagenomic assemblies") - parser.add_argument("-a","--assembly",type=Path,help="assembled contigs",required=True) - parser.add_argument("-m","--mapping", type=Path, help="mapping of read to contigs in bam format",required=True) - parser.add_argument("-d","--dir",type=Path,help="output directory for results",default='out',required=True) - parser.add_argument("-r",'--repeats',action=argparse.BooleanOptionalAction,help="To turn repeat detection on", default=True) - parser.add_argument("-k","--keep", action=argparse.BooleanOptionalAction, help="Set this to keep temporary files in output directory",default=False) - parser.add_argument("-l","--length", type=int, help="Minimum length of contigs to consider for scaffolding in base pairs (bp)",default=500) - parser.add_argument("-b","--bsize", type=int, help="Minimum mate pair support between contigs to consider for scaffolding",default=3) - parser.add_argument("-v",'--visualization', action=argparse.BooleanOptionalAction, help="Generate a .db file for the MetagenomeScope visualization tool", default=False) + parser = argparse.ArgumentParser( + description="MetaCarvel: A scaffolding tool for metagenomic assemblies" + ) + parser.add_argument( + "-a", "--assembly", type=Path, help="assembled contigs", required=True + ) + parser.add_argument( + "-m", + "--mapping", + type=Path, + help="mapping of read to contigs in bam format", + required=True, + ) + parser.add_argument( + "-d", + "--dir", + type=Path, + help="output directory for results", + default="out", + required=True, + ) + parser.add_argument( + "-r", + "--repeats", + action=argparse.BooleanOptionalAction, + help="To turn repeat detection on", + default=True, + ) + parser.add_argument( + "-k", + "--keep", + action=argparse.BooleanOptionalAction, + help="Set this to keep temporary files in output directory", + default=False, + ) + parser.add_argument( + "-l", + "--length", + type=int, + help="Minimum length of contigs to consider for scaffolding in base pairs (bp)", + default=500, + ) + parser.add_argument( + "-b", + "--bsize", + type=int, + help="Minimum mate pair support between contigs to consider for scaffolding", + default=3, + ) + parser.add_argument( + "-v", + "--visualization", + action=argparse.BooleanOptionalAction, + help="Generate a .db file for the MetagenomeScope visualization tool", + default=False, + ) args = parser.parse_args() try: - import networkx + import networkx except ImportError: - raise ImportError('Looks like you do not have networkx. Please rerun with networkx module installed.') - sys.exit(1) + raise ImportError( + "Looks like you do not have networkx. Please rerun with networkx module installed." + ) + sys.exit(1) version = networkx.__version__ - version_id = int(version.split('.')[1]) - first = int(version.split('.')[0]) - print('Networkx ' + version + ' found', file=sys.stderr) - if not shutil.which('samtools'): - print(time.strftime("%c")+': Samtools does not exist in PATH. Terminating....\n', file=sys.stderr) - sys.exit(1) + print("Networkx " + version + " found", file=sys.stderr) + if not shutil.which("samtools"): + print( + time.strftime("%c") + + ": Samtools does not exist in PATH. Terminating....\n", + file=sys.stderr, + ) + sys.exit(1) - if not shutil.which('bamToBed'): - print(time.strftime("%c")+': Bedtools does not exist in PATH. Terminating....\n', file=sys.stderr) - sys.exit(1) + if not shutil.which("bamToBed"): + print( + time.strftime("%c") + + ": Bedtools does not exist in PATH. Terminating....\n", + file=sys.stderr, + ) + sys.exit(1) args.dir.mkdir(parents=True, exist_ok=True) - print(time.strftime("%c")+':Starting scaffolding..', file=sys.stderr) - + print(time.strftime("%c") + ":Starting scaffolding..", file=sys.stderr) alignment_bed = args.dir / "alignment.bed" if not alignment_bed.exists(): print("converting bam file to bed file", file=sys.stderr) try: - with open(alignment_bed, "w") as alignment_bed_f: - subprocess.run([ - "bamToBed", - "-i", - str(args.mapping), - ], stdout=alignment_bed_f, check=True) - print('finished conversion', file=sys.stderr) + with open(alignment_bed, "w") as alignment_bed_f: + subprocess.run( + [ + "bamToBed", + "-i", + str(args.mapping), + ], + stdout=alignment_bed_f, + check=True, + ) + print("finished conversion", file=sys.stderr) except subprocess.CalledProcessError as err: - alignment_bed.unlink() - print(time.strftime("%c")+': Failed in coverting bam file to bed format, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + alignment_bed.unlink() + print( + time.strftime("%c") + + ": Failed in coverting bam file to bed format, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) + sys.exit(1) - assembly_idx = args.assembly.parent / (args.assembly.name + '.fai') + assembly_idx = args.assembly.parent / (args.assembly.name + ".fai") try: - subprocess.run([ - "samtools", - "faidx", - str(args.assembly), - "--fai-idx", - str(assembly_idx), - ], check=True) + subprocess.run( + [ + "samtools", + "faidx", + str(args.assembly), + "--fai-idx", + str(assembly_idx), + ], + check=True, + ) except subprocess.CalledProcessError as err: - print(str(err.output), file=sys.stderr) - sys.exit(1) + print(str(err.output), file=sys.stderr) + sys.exit(1) contig_length = args.dir / "contig_length" - with open(assembly_idx, "r", newline='') as f_in, open(contig_length, "w", newline='') as f_out: - reader = csv.DictReader(f_in, fieldnames=["name", "length", "offset", "linebases", "linewidth"], delimiter="\t") - writer = csv.DictWriter(f_out, fieldnames=["name", "length"], delimiter="\t") - for row in reader: - writer.writerow({"name": row["name"], "length": row["length"]}) - - print(time.strftime("%c")+':Finished conversion', file=sys.stderr) + with open(assembly_idx, "r", newline="") as f_in, open( + contig_length, "w", newline="" + ) as f_out: + reader = csv.DictReader( + f_in, + fieldnames=["name", "length", "offset", "linebases", "linewidth"], + delimiter="\t", + ) + writer = csv.DictWriter(f_out, fieldnames=["name", "length"], delimiter="\t") + for row in reader: + writer.writerow({"name": row["name"], "length": row["length"]}) - final_assembly = args.assembly - final_mapping = args.mapping + print(time.strftime("%c") + ":Finished conversion", file=sys.stderr) - print(time.strftime("%c") + ':Started generating links between contigs', file=sys.stderr) + print( + time.strftime("%c") + ":Started generating links between contigs", + file=sys.stderr, + ) contig_links = args.dir / "contig_links" contig_coverage = args.dir / "contig_coverage" if not contig_links.exists(): try: - subprocess.run([ - str(cwd / "libcorrect"), - "-a", - str(alignment_bed), - "-d", - str(contig_length), - "-o", - str(contig_links), - "-x", - str(contig_coverage), - "-c", - str(args.length), - ], check=True) - print(time.strftime("%c") +':Finished generating links between contigs', file=sys.stderr) + subprocess.run( + [ + str(cwd / "libcorrect"), + "-a", + str(alignment_bed), + "-d", + str(contig_length), + "-o", + str(contig_links), + "-x", + str(contig_coverage), + "-c", + str(args.length), + ], + check=True, + ) + print( + time.strftime("%c") + ":Finished generating links between contigs", + file=sys.stderr, + ) except subprocess.CalledProcessError as err: - contig_links.unlink() - print(time.strftime("%c")+': Failed in generate links from bed file, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + contig_links.unlink() + print( + time.strftime("%c") + + ": Failed in generate links from bed file, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) + sys.exit(1) - print(time.strftime("%c")+':Started bulding of links between contigs', file=sys.stderr) + print( + time.strftime("%c") + ":Started bulding of links between contigs", + file=sys.stderr, + ) bundled_links = args.dir / "bundled_links" bundled_graph = args.dir / "bundled_graph.gml" if not bundled_links.exists(): try: - subprocess.run([ - str(cwd / "bundler"), - "-l", - str(contig_links), - "-o", - str(bundled_links), - "-b", - str(bundled_graph), - "-c", - str(args.bsize), - ], check=True) - print(time.strftime("%c")+':Finished bundling of links between contigs', file=sys.stderr) + subprocess.run( + [ + str(cwd / "bundler"), + "-l", + str(contig_links), + "-o", + str(bundled_links), + "-b", + str(bundled_graph), + "-c", + str(args.bsize), + ], + check=True, + ) + print( + time.strftime("%c") + ":Finished bundling of links between contigs", + file=sys.stderr, + ) except subprocess.CalledProcessError as err: - bundled_links.unlink() - bundled_graph.unlink() - print(time.strftime("%c")+': Failed to bundle links, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + bundled_links.unlink() + bundled_graph.unlink() + print( + time.strftime("%c") + + ": Failed to bundle links, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) + sys.exit(1) invalidated_counts = args.dir / "invalidated_counts" high_centrality = args.dir / "high_centrality.txt" @@ -138,154 +234,216 @@ def main(): oriented_links = args.dir / "oriented_links" repeats = args.dir / "repeats" if args.repeats: - print(time.strftime("%c")+':Started finding and removing repeats', file=sys.stderr) + print( + time.strftime("%c") + ":Started finding and removing repeats", + file=sys.stderr, + ) try: - subprocess.run([ - str(cwd / "orientcontigs"), - "-l", - str(bundled_links), - "-c", - str(contig_length), - "--bsize", - "-o", - str(oriented_gml), - "-p", - str(oriented_links), - "-i", - str(invalidated_counts), - ], check=True) + subprocess.run( + [ + str(cwd / "orientcontigs"), + "-l", + str(bundled_links), + "-c", + str(contig_length), + "--bsize", + "-o", + str(oriented_gml), + "-p", + str(oriented_links), + "-i", + str(invalidated_counts), + ], + check=True, + ) except subprocess.CalledProcessError as err: - print(time.strftime("%c") + ': Failed to find repeats, terminating scaffolding...\n' + str(err.output), file=sys.stderr) + print( + time.strftime("%c") + + ": Failed to find repeats, terminating scaffolding...\n" + + str(err.output), + file=sys.stderr, + ) try: - subprocess.run([ - "python", - str(cwd / "centrality.py"), - "-g", - str(bundled_links), - "-l", - str(contig_length), - "-o", - str(high_centrality), - ], check=True) + subprocess.run( + [ + "python", + str(cwd / "centrality.py"), + "-g", + str(bundled_links), + "-l", + str(contig_length), + "-o", + str(high_centrality), + ], + check=True, + ) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + print( + time.strftime("%c") + + ": Failed to find repeats, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) + sys.exit(1) try: - with open(bundled_links_filtered, "w") as bundled_links_filtered_f: - subprocess.run([ - "python", - str(cwd / "repeat_filter.py"), - str(contig_coverage), - str(bundled_links), - str(invalidated_counts), - str(high_centrality), - str(contig_length), - str(repeats), - ], stdout=bundled_links_filtered_f, check=True) + with open(bundled_links_filtered, "w") as bundled_links_filtered_f: + subprocess.run( + [ + "python", + str(cwd / "repeat_filter.py"), + str(contig_coverage), + str(bundled_links), + str(invalidated_counts), + str(high_centrality), + str(contig_length), + str(repeats), + ], + stdout=bundled_links_filtered_f, + check=True, + ) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to find repeats, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + print( + time.strftime("%c") + + ": Failed to find repeats, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) sys.exit(1) - print(time.strftime("%c")+':Finished repeat finding and removal', file=sys.stderr) + print( + time.strftime("%c") + ":Finished repeat finding and removal", + file=sys.stderr, + ) else: - bundled_links.rename(bundled_links_filtered) - print(time.strftime("%c")+':Started orienting the contigs', file=sys.stderr) + bundled_links.rename(bundled_links_filtered) + print(time.strftime("%c") + ":Started orienting the contigs", file=sys.stderr) try: - subprocess.run([ - str(cwd / "orientcontigs"), - "-l", - str(bundled_links_filtered), - "-c", - str(contig_length), - "--bsize", - "-o", - str(oriented_gml), - "-p", - str(oriented_links), - "-i", - str(invalidated_counts), - ], check=True) - print(time.strftime("%c")+':Finished orienting the contigs', file=sys.stderr) + subprocess.run( + [ + str(cwd / "orientcontigs"), + "-l", + str(bundled_links_filtered), + "-c", + str(contig_length), + "--bsize", + "-o", + str(oriented_gml), + "-p", + str(oriented_links), + "-i", + str(invalidated_counts), + ], + check=True, + ) + print(time.strftime("%c") + ":Finished orienting the contigs", file=sys.stderr) except subprocess.CalledProcessError: - print(time.strftime("%c")+': Failed to Orient contigs, terminating scaffolding....', file=sys.stderr) + print( + time.strftime("%c") + + ": Failed to Orient contigs, terminating scaffolding....", + file=sys.stderr, + ) - print(time.strftime("%c")+':Started finding separation pairs', file=sys.stderr) + print(time.strftime("%c") + ":Started finding separation pairs", file=sys.stderr) seppairs = args.dir / "seppairs" try: - subprocess.run([ - str(cwd / "spqr"), - "-l", - str(oriented_links), - "-o", - str(seppairs), - ], check=True) - print(time.strftime("%c")+':Finished finding spearation pairs', file=sys.stderr) + subprocess.run( + [ + str(cwd / "spqr"), + "-l", + str(oriented_links), + "-o", + str(seppairs), + ], + check=True, + ) + print( + time.strftime("%c") + ":Finished finding spearation pairs", file=sys.stderr + ) except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to decompose graph, terminating scaffolding....\n' + str(err.output), file=sys.stderr) - sys.exit(1) + print( + time.strftime("%c") + + ": Failed to decompose graph, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) + sys.exit(1) - print(time.strftime("%c")+':Finding the layout of contigs', file=sys.stderr) + print(time.strftime("%c") + ":Finding the layout of contigs", file=sys.stderr) bubbles = args.dir / "bubbles.txt" scaffolds_fasta = args.dir / "scaffolds.fa" if not scaffolds_fasta.exists(): - try: - subprocess.run([ - "python", - str(cwd / "layout.py"), - "-a", - str(args.assembly), - "-b", - str(bubbles), - "-g", - str(oriented_gml), - "-s", - str(seppairs), - "-o", - str(scaffolds_fasta), - "-f", - str(args.dir / "scaffolds.agp"), - "-e", - str(args.dir / "scaffold_graph.gfa"), - ], check=True) - print(time.strftime("%c")+':Final scaffolds written, Done!', file=sys.stderr) - except subprocess.CalledProcessError as err: - print(time.strftime("%c")+': Failed to generate scaffold sequences, terminating scaffolding....\n' + str(err.output), file=sys.stderr) + try: + subprocess.run( + [ + "python", + str(cwd / "layout.py"), + "-a", + str(args.assembly), + "-b", + str(bubbles), + "-g", + str(oriented_gml), + "-s", + str(seppairs), + "-o", + str(scaffolds_fasta), + "-f", + str(args.dir / "scaffolds.agp"), + "-e", + str(args.dir / "scaffold_graph.gfa"), + ], + check=True, + ) + print( + time.strftime("%c") + ":Final scaffolds written, Done!", file=sys.stderr + ) + except subprocess.CalledProcessError as err: + print( + time.strftime("%c") + + ": Failed to generate scaffold sequences, terminating scaffolding....\n" + + str(err.output), + file=sys.stderr, + ) if args.visualization: - # Output the MetagenomeScope .db file directly to args.dir. The only file - # created by collate.py here is the mgsc.db file. - subprocess.run([ - "python", - str(cwd / "MetagenomeScope/graph_collator/collate.py"), - "-i", - str(oriented_gml), - "-w", - "-ub", - str(bubbles), - "-ubl", - "-d", - str(args.dir), - "-o", - "mgsc", - ], check=True) + # Output the MetagenomeScope .db file directly to args.dir. The only file + # created by collate.py here is the mgsc.db file. + subprocess.run( + [ + "python", + str(cwd / "MetagenomeScope/graph_collator/collate.py"), + "-i", + str(oriented_gml), + "-w", + "-ub", + str(bubbles), + "-ubl", + "-d", + str(args.dir), + "-o", + "mgsc", + ], + check=True, + ) if not args.keep: - for gen_f in [ - contig_length, - contig_links, - contig_coverage, - bundled_links, - bundled_links_filtered, - bundled_graph, - invalidated_counts, - repeats, - oriented_links, - oriented_gml, - seppairs, - alignment_bed, - ]: - gen_f.unlink(missing_ok=True) + for gen_f in [ + contig_length, + contig_links, + contig_coverage, + bundled_links, + bundled_links_filtered, + bundled_graph, + invalidated_counts, + repeats, + oriented_links, + oriented_gml, + seppairs, + alignment_bed, + ]: + gen_f.unlink(missing_ok=True) + -if __name__ == '__main__': +if __name__ == "__main__": main()