Skip to content
This repository was archived by the owner on Sep 12, 2022. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added analysis/plots/mapq_mod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/mapq_nomod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/signal_length_mod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/signal_length_nomod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/start_raw_index_mod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added analysis/plots/start_raw_index_nomod.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1,001 changes: 1,001 additions & 0 deletions analysis/results/CG_mod.csv

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions analysis/results/CG_nomod.csv

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions analysis/results/mappy_mod.csv

Large diffs are not rendered by default.

1,001 changes: 1,001 additions & 0 deletions analysis/results/mappy_nomod.csv

Large diffs are not rendered by default.

60 changes: 60 additions & 0 deletions analysis/scripts/align_mappy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
from pathlib import Path
import mappy as mp
from tqdm import tqdm
import h5py
from Bio import SeqIO
import re
import pandas as pd

from util import get_files, read_fastq


def align_mappy(dir_in, file_out, file_fasta):
a = mp.Aligner(file_fasta, preset='map-ont') # Load or build index
if not a:
raise Exception("ERROR: failed to load/build index")

reads = get_files(dir_in)
files_fastq = {}
data = []

for read in tqdm(reads):
with h5py.File(read, 'r', libver='latest') as fd:
no_alignment = True
fastq = read_fastq(fd)
files_fastq[fastq.id] = len(fastq.seq)

for hit in a.map(fastq.seq): # Traverse alignments
if hit.is_primary: # Check if the alignment is primary
# Reference
for seq_record in SeqIO.parse(file_fasta, 'fasta'):
ref = seq_record.seq[hit.r_st: hit.r_en]
r_CG_num = len(re.findall(r'(CG)', str(ref)))

# Query
query = fastq.seq[hit.q_st: hit.q_en]
if hit.strand == -1:
query = mp.revcomp(query)
q_CG_num = len(re.findall(r'(CG)', str(query)))

no_alignment = False
data.append([fastq.id, hit.r_st, hit.r_en, hit.q_st, hit.q_en, r_CG_num, q_CG_num, hit.cigar_str])
break

if no_alignment:
data.append([fastq.id, '', '', '', '', 0, 0, ''])

data = pd.DataFrame(data, columns=['read_id', 'r_st', 'r_en', 'q_st', 'q_en', 'r_CG_num', 'q_CG_num', 'cigar_str'])
data.sort_values('read_id', inplace=True)
data.to_csv(file_out, index=False)

print("Average length of fastq files:", sum(files_fastq.values()) / len(files_fastq.values()))


if __name__ == '__main__':
modification = 'nomod' # 'mod' or 'nomod'
dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
file_out = f'/home/sdeur/tmp-mappy/mappy_{modification}.csv'
file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'

align_mappy(dir_in, file_out, file_fasta)
147 changes: 147 additions & 0 deletions analysis/scripts/analyse_CG.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
from pathlib import Path
import mappy as mp
from tqdm import tqdm
import h5py
from Bio import SeqIO
import pandas as pd

from util import get_files, read_fastq


def count_matches(cs):
matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
mode, value = 'M', 0

for c in cs:
if c == ':':
matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
mode, value = 'M', ''
continue
elif c == '*':
matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
mode, value = 'X', ''
continue
elif c == '-':
matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
mode, value = 'D', ''
continue
elif c == '+':
matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
mode, value = 'I', ''
continue

if mode == 'X':
value = 1
else:
value += c

matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
return matches


def normalize(ref, query, cigar):
r = list(ref)
q = list(query)
i = 0
num = ''

for c in cigar:
if c.isnumeric():
num += c
continue

elif c == 'M':
i += int(num)

elif c == 'D':
for _ in range(int(num)):
q.insert(i, '-')
i += 1

elif c == 'I':
for _ in range(int(num)):
r.insert(i, '-')
i += 1

num = ''

return ''.join(r), ''.join(q)


def count_CG(ref, query):
CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}

for i in range(len(ref) - 1):
if ref[i] == 'C' and ref[i + 1] == 'G':
if query[i] == 'C' and query[i + 1] == 'G':
CG_cnt['M'] += 1

elif query[i] == '-' and query[i + 1] == 'G':
CG_cnt['D'] += 1

elif query[i] in {'A', 'G', 'T'} and query[i + 1] == 'G':
CG_cnt['X'] += 1

elif ref[i] == '-' and ref[i + 1] == 'G':
if query[i] == 'C' and query[i + 1] == 'G':
CG_cnt['I'] += 1

return CG_cnt


def analyse_CG(dir_in, file_out, file_fasta):
a = mp.Aligner(file_fasta, preset='map-ont') # Load or build index
if not a:
raise Exception("ERROR: failed to load/build index")

reads = get_files(dir_in)
data = []

for read in tqdm(reads):
with h5py.File(read, 'r', libver='latest') as fd:
matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}

fastq = read_fastq(fd)
ref = ''
mapq = 0

for hit in a.map(fastq.seq, cs=True): # Traverse alignments
if hit.is_primary: # Check if the alignment is primary
# Alignment
matches = count_matches(hit.cs)

# Reference
for seq_record in SeqIO.parse(file_fasta, 'fasta'):
ref = seq_record.seq[hit.r_st: hit.r_en]

# Query
query = fastq.seq[hit.q_st: hit.q_en]
if hit.strand == -1:
query = mp.revcomp(query)

# Normalize
ref, query = normalize(ref, query, hit.cigar_str)

# Analyse CG motif
CG_cnt = count_CG(ref, query)

mapq = hit.mapq
break

data.append([fastq.id, len(ref), matches['M'], matches['X'], matches['D'], matches['I'],
CG_cnt['M'], CG_cnt['X'], CG_cnt['D'], CG_cnt['I'], mapq])

data = pd.DataFrame(data, columns=['read_id', 'alignment_len', 'M', 'X', 'D', 'I',
'M_CG', 'X_CG', 'D_CG', 'I_CG', 'mapq'])
data.sort_values('read_id', inplace=True)
data.to_csv(file_out, index=False)


if __name__ == '__main__':
modification = 'nomod' # 'mod' or 'nomod'
dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
file_out = f'/home/sdeur/tmp-mappy/CG_{modification}.csv'
file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'

analyse_CG(dir_in, file_out, file_fasta)
50 changes: 50 additions & 0 deletions analysis/scripts/plot_mapq.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from pathlib import Path
import mappy as mp
from tqdm import tqdm
import h5py
import matplotlib.pyplot as plt
import numpy as np

from util import get_files, read_fastq


def plot_distribution(data, modification, bins=15):
""" Distribution of mapping quality """
plt.hist(data, bins=bins, color='green' if modification == 'nomod' else 'red')
plt.title(f'Distribution of mapping quality ({modification})')
plt.xlabel('mapping quality')
plt.ylabel('count')
plt.yticks(np.arange(0, 1001, 100))
plt.savefig(f'../plots/mapq_{modification}.png')


def get_mapqs(dir_in, file_fasta):
a = mp.Aligner(file_fasta, preset='map-ont') # Load or build index
if not a:
raise Exception("ERROR: failed to load/build index")

reads = get_files(dir_in)
mapqs = []

for read in tqdm(reads):
with h5py.File(read, 'r', libver='latest') as fd:
fastq = read_fastq(fd)
mapq = 0

for hit in a.map(fastq.seq): # Traverse alignments
if hit.is_primary: # Check if the alignment is primary
mapq = hit.mapq
break

mapqs.append(mapq)

return mapqs


if __name__ == '__main__':
modification = 'nomod' # 'mod' or 'nomod'
dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'

mapqs = get_mapqs(dir_in, file_fasta)
plot_distribution(mapqs, modification)
39 changes: 39 additions & 0 deletions analysis/scripts/plot_signal_length.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import os
from ont_fast5_api.fast5_interface import get_fast5_file
import numpy as np
import matplotlib.pyplot as plt


def get_signal_length(fast5_filepath):
"""
:param fast5_filepath: can be a single- or multi-read file
:return: raw signal length
"""
with get_fast5_file(fast5_filepath, mode="r") as f5:
for read in f5.get_reads():
raw_data = read.get_raw_data()
# print(read.read_id, raw_data, len(raw_data))

return len(raw_data)


def plot_signal_length(dir_in, bins=50):
""" Distribution of raw signal length """
data = []
for file in os.listdir(dir_in):
data.append(get_signal_length(os.path.join(dir_in, file)))

plt.hist(data, bins=bins, color='green' if modification == 'nomod' else 'red')
plt.title(f'Distribution of raw signal length ({modification})')
plt.xlabel('raw signal length')
plt.ylabel('count')
plt.xticks(np.arange(0, 600_001, 100_000)) # Change if needed
plt.yticks(np.arange(0, 301, 50)) # Change if needed
plt.savefig(f'../plots/signal_length_{modification}.png')


if __name__ == '__main__':
modification = 'nomod' # 'mod' or 'nomod'
dir_in = f'/home/sdeur/tmp-guppy/{modification}/workspace'

plot_signal_length(dir_in)
Loading