-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcalculate_conversions.py
More file actions
executable file
·94 lines (75 loc) · 3.01 KB
/
calculate_conversions.py
File metadata and controls
executable file
·94 lines (75 loc) · 3.01 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
from argparse import ArgumentParser
from pathlib import Path
import subprocess
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import io
import json
import shutil
def parse_args():
parser = ArgumentParser()
parser.add_argument('--new_ref', type=Path, help="path to single-sequence fasta to convert to.")
parser.add_argument('--old_ref', type=Path, help="path to fasta containing sequence(s) to convert from. Each sequence must be named with the RNAME value of the reads from the BAM file(s) that will be converted later.")
parser.add_argument('--offsets', type=Path, help="path to the output json file containing conversion offsets.")
return parser.parse_args()
def parse_mafft(output):
"""parse output from mafft
Args:
output (str): output from mafft
"""
file = io.StringIO(output)
file.seek(0)
records = list(SeqIO.parse(file, 'fasta'))
return records[0], records[1]
def mafft(fasta):
"""align old_ref fasta sequences to new_ref sequence
Args:
fasta (str): path to fasta with 2 sequences: wuhan and other
"""
output = subprocess.check_output(["mafft", "--quiet", fasta], shell=False, text=True)#, errors="ignore")
return parse_mafft(output)
def align_seqs(ref_fasta, other_fasta):
"""Aligns `other_fasta` sequence to `ref_fasta` sequence
Args:
ref_fasta (str): path to wuhan reference fasta
other_fasta (str): path to multi-reference fasta
"""
temp_fasta = "tmp.fasta"
with open(temp_fasta, "w") as temp:
SeqIO.write([ref_fasta, other_fasta], temp, "fasta")
wuhan_aln, other_aln = mafft(temp_fasta)
Path(temp_fasta).unlink()
return wuhan_aln, other_aln
def map_offsets(ref_aln:SeqRecord, other_aln:SeqRecord):
"""Map multi-reference positions to wuhan reference positions
Args:
ref_aln (SeqRecord): wuhan reference alignment
other_aln (SeqRecord): multi-reference alignment
"""
ref_gaps, other_gaps, last_offset = 0, 0, None
offsets = [] # (position, offset)
for i in range(max(len(ref_aln), len(other_aln))):
# determine offset for current position (0-based)
ref_gaps += ref_aln.seq[i] == "-"
other_gaps += other_aln.seq[i] == "-"
offset = other_gaps - ref_gaps
# add in offset if it has changed
if offset != last_offset:
offsets.append((i, offset))
last_offset = offset
return offsets
def main():
args = parse_args()
offset_dict = {}
reference = SeqIO.read(args.new_ref, "fasta")
for record in SeqIO.parse(args.old_ref, "fasta"):
# align multi-reference fasta sequences to wuhan reference
wuhan_aln, other_aln = align_seqs(reference, record)
# convert positions in each sequence from multi-reference to positions in wuhan reference
offset_dict[record.id] = map_offsets(wuhan_aln, other_aln)
# write output
with open(args.offsets, "w") as out:
json.dump(offset_dict, out)
if __name__ == "__main__":
main()