-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathmetacompass2.nf
More file actions
executable file
·221 lines (175 loc) · 7.1 KB
/
metacompass2.nf
File metadata and controls
executable file
·221 lines (175 loc) · 7.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env nextflow
// setting variables
//nextflow.preview.output = 1
params.forward = "" // input forward reads
params.reverse = "" // input reverse reads
forward_gz = "" // uncompressed forward reads
reverse_gz = "" // uncompressed reverse reads
params.unpaired = "" // input unpaired reads
unpaired_gz = "" // uncompressed unpaired reads
reads = "" // string containing command line parameters for passing reads along
params.reference_db = "" // location of reference database
//params.output = "" // location for output files
params.skip_rs = false // skip reference selection
params.skip_rc = false // skip reference culling
params.clean_uf = false // clean up as we go
params.de_novo = 1 // execute de novo assembly
gzip_flag = false // input files are gziped
paired = false // input files are paired
unpaired = false // input files are unpaired
// some modularization
include {filter_reads} from './pipeline/ref_selection.nf'
include {map_to_gene} from './pipeline/ref_selection.nf'
include {select_genomes} from './pipeline/ref_selection.nf'
include {collect_refs} from './pipeline/ref_culling.nf'
include {SkaniTriangle} from './pipeline/ref_culling.nf'
include {Cluster} from './pipeline/ref_culling.nf'
include {ConcatFasta} from './pipeline/ref_culling.nf'
include {IndexReads} from './pipeline/ref_culling.nf'
include {ClusterIndex} from './pipeline/ref_culling.nf'
include {interleaveReads} from './pipeline/ref_assembly.nf'
include {reduceClusters} from './pipeline/ref_assembly.nf'
include {refAssembly} from './pipeline/ref_assembly.nf'
include {deNovoAssembly} from './pipeline/denovo_assembly.nf'
include {createOutputs} from './pipeline/finalize.nf'
//include {createDeNovoOutputs} from './pipeline/finalize.nf'
// Usage information
def usage(status) {
log.info "Usage: \nnextflow run metacompass.nf \n\
[[--forward /path/to/forwardReads --reverse /path/to/reverseReads --unpaired /path/to/unpairedReads] | \n\
[--forward /path/to/forwardReads --reverse /path/to/reverseReads] | \n\
[--unpaired /path/to/unpairedReads]] \n\
--output /path/to/outputDir"
log.info " --reference_db Path to reference marker gene database."
log.info ""
log.info "Required:"
log.info ""
log.info " --forward Path to forward paired-end read."
log.info " --reverse Path to reverse paried-end read."
log.info " --unpaired Path to unpaired read fasta file(s)."
log.info " --output Path to output folder."
log.info ""
log.info "Optional:"
log.info ""
log.info " --ref_sel Reference selection method. Default set to 'tax'."
log.info " --ref_pick Pick reference selection method. Default set to 'breadth'."
log.info " --readlen Read length for filtering. Default set to 200."
log.info " --mincov Minimum coverage. Default set to 1."
log.info " --minctglen Minimum contig length. Default set to 1."
log.info " --run_valet Whether or not run VALET during reference-guided assembly."
log.info " --de_novo Set to 0 to skip de_novo assembly. Default set to 1."
log.info " --skip_rs Set to true to skip ref selection process. Default set to false."
log.info " --skip_rc Set to true to skip ref culling process. Default set to false."
log.info " --tracks Tracks. Default set to false."
log.info " --threads Number of threads to use. Default set to 12."
log.info " --memory Amount of memory to use."
log.info " --clean_uf Remove unnecessary files while running the job. Default is false."
log.info " --executor Executor to use."
log.info " --help Print help message."
exit status
}
if (params.help){
usage(0)
}
// if the path is invalid, throw an error
//if (ams.output == "" || ! file(params.output).mkdirs() ){
// println "ERROR: Need proper output directory path!"
// usage(1)
//}
// check if reference file exists
if (params.reference_db != "" && !file(params.reference_db).isDirectory() ){
println "ERROR: Reference genome files not found!"
usage(1)
}
// check input reads
if (params.forward != "" && params.reverse != ""){
if(!file(params.forward).isFile() ||
!file(params.reverse).isFile()){
println "ERROR: Incorrect filepath to paired reads!"
usage(1)
}
paired = true
}
// Dealing with unpaired reads
if (params.unpaired != ""){
if(!file(params.unpaired).isFile()){
println "ERROR: Incorrect filepath to unpaired reads!"
usage(1)
}
unpaired = true
}
if (paired == false && unpaired == false){
println "ERROR: Incorrect combination of reads given!"
usage(1)
}
// Here we define the actual workflow
workflow {
main:
println "Output dir is $workflow.outputDir"
// filter the reads aligned to marker genes
if (paired == true) {
reads = file(params.forward) + " " + file(params.reverse)
} else {
reads = file(params.unpaired)
}
// keep only reads aligned to marker genes
mapped_reads = filter_reads(reads)
def toProcess = []
file("${params.reference_db}/marker_index").eachFile {item ->
if (item.isDirectory()) {
// println "DEBUG: got ${item.getName()}"
toProcess = toProcess + item.getName()
} //else {
//println "not dir ${item.getName()}"
//}
}
// mapped_reads.view() // for debugging
// map reads to each gene and collect info
marker_covs = map_to_gene(mapped_reads, Channel.fromList(toProcess))
// generate list of genomes selected
select_genomes(marker_covs.collect())
// download all the genomes needed
collect_refs(select_genomes.out.candidates)
// cluster the genomes based on sequence similarity
SkaniTriangle(collect_refs.out, select_genomes.out.candidates.countLines())
Cluster(SkaniTriangle.out)
// create per-cluster FASTA files
ConcatFasta(Cluster.out.clusters)
// build k-mer index for reads
if (paired == true) {
toindex = file(params.forward) + "\n" + file(params.reverse)
} else {
toindex = file(params.unpaired)
}
IndexReads(toindex)
// build k-mer index from clusters
ClusterIndex(ConcatFasta.out.cluster_files.flatten())
// run alignment/assembly
// first interleave
if (paired == true) {
toassemble = "-in1=" + file(params.forward) + " -in2=" + file(params.reverse)
} else {
toassemble = "-in=" + file(params.unpaired)
}
interleaveReads(toassemble)
// combine all clusters in a single fasta file
// and identify just the reads that map to some of the clusters
reduceClusters(Cluster.out.clusters, interleaveReads.out)
// run the cluster-by-cluster assembly
refAssembly(interleaveReads.out, Cluster.out.clusters, reduceClusters.out, IndexReads.out.collect(), ClusterIndex.out.collect())
// de novo assembly (if needed) and stage results
if (params.de_novo > 0) {
deNovoAssembly(refAssembly.out.unmapped_reads)
createOutputs(deNovoAssembly.out.flag, refAssembly.out.genomes)
} else {
createOutputs(0, refAssembly.out.genomes)
}
// publish:
// contigs = createOutputs.out
}
//output {
// contigs {
// mode 'move'
// path "."
// }
//}