lbcb-sci · sanjadeur · May 2, 2021 · May 9, 2021 · May 11, 2021 · May 11, 2021
diff --git a/analysis/plots/mapq_mod.png b/analysis/plots/mapq_mod.png
diff --git a/analysis/plots/mapq_nomod.png b/analysis/plots/mapq_nomod.png
diff --git a/analysis/plots/signal_length_mod.png b/analysis/plots/signal_length_mod.png
diff --git a/analysis/plots/signal_length_nomod.png b/analysis/plots/signal_length_nomod.png
diff --git a/analysis/plots/start_raw_index_mod.png b/analysis/plots/start_raw_index_mod.png
diff --git a/analysis/plots/start_raw_index_nomod.png b/analysis/plots/start_raw_index_nomod.png
diff --git a/analysis/results/CG_mod.csv b/analysis/results/CG_mod.csv
diff --git a/analysis/results/CG_nomod.csv b/analysis/results/CG_nomod.csv
diff --git a/analysis/results/mappy_mod.csv b/analysis/results/mappy_mod.csv
diff --git a/analysis/results/mappy_nomod.csv b/analysis/results/mappy_nomod.csv
diff --git a/analysis/scripts/align_mappy.py b/analysis/scripts/align_mappy.py
@@ -0,0 +1,60 @@
+from pathlib import Path
+import mappy as mp
+from tqdm import tqdm
+import h5py
+from Bio import SeqIO
+import re
+import pandas as pd
+
+from util import get_files, read_fastq
+
+
+def align_mappy(dir_in, file_out, file_fasta):
+    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
+    if not a:
+        raise Exception("ERROR: failed to load/build index")
+
+    reads = get_files(dir_in)
+    files_fastq = {}
+    data = []
+
+    for read in tqdm(reads):
+        with h5py.File(read, 'r', libver='latest') as fd:
+            no_alignment = True
+            fastq = read_fastq(fd)
+            files_fastq[fastq.id] = len(fastq.seq)
+
+            for hit in a.map(fastq.seq):  # Traverse alignments
+                if hit.is_primary:  # Check if the alignment is primary
+                    # Reference
+                    for seq_record in SeqIO.parse(file_fasta, 'fasta'):
+                        ref = seq_record.seq[hit.r_st: hit.r_en]
+                    r_CG_num = len(re.findall(r'(CG)', str(ref)))
+
+                    # Query
+                    query = fastq.seq[hit.q_st: hit.q_en]
+                    if hit.strand == -1:
+                        query = mp.revcomp(query)
+                    q_CG_num = len(re.findall(r'(CG)', str(query)))
+
+                    no_alignment = False
+                    data.append([fastq.id, hit.r_st, hit.r_en, hit.q_st, hit.q_en, r_CG_num, q_CG_num, hit.cigar_str])
+                    break
+
+        if no_alignment:
+            data.append([fastq.id, '', '', '', '', 0, 0, ''])
+
+    data = pd.DataFrame(data, columns=['read_id', 'r_st', 'r_en', 'q_st', 'q_en', 'r_CG_num', 'q_CG_num', 'cigar_str'])
+    data.sort_values('read_id', inplace=True)
+    data.to_csv(file_out, index=False)
+
+    print("Average length of fastq files:", sum(files_fastq.values()) / len(files_fastq.values()))
+
+
+if __name__ == '__main__':
+    modification = 'nomod'  # 'mod' or 'nomod'
+    dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
+    file_out = f'/home/sdeur/tmp-mappy/mappy_{modification}.csv'
+    file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'
+
+    align_mappy(dir_in, file_out, file_fasta)
diff --git a/analysis/scripts/analyse_CG.py b/analysis/scripts/analyse_CG.py
@@ -0,0 +1,147 @@
+from pathlib import Path
+import mappy as mp
+from tqdm import tqdm
+import h5py
+from Bio import SeqIO
+import pandas as pd
+
+from util import get_files, read_fastq
+
+
+def count_matches(cs):
+    matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
+    mode, value = 'M', 0
+
+    for c in cs:
+        if c == ':':
+            matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
+            mode, value = 'M', ''
+            continue
+        elif c == '*':
+            matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
+            mode, value = 'X', ''
+            continue
+        elif c == '-':
+            matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
+            mode, value = 'D', ''
+            continue
+        elif c == '+':
+            matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
+            mode, value = 'I', ''
+            continue
+
+        if mode == 'X':
+            value = 1
+        else:
+            value += c
+
+    matches[mode] += int(value) if mode == 'M' or mode == 'X' else len(value)
+    return matches
+
+
+def normalize(ref, query, cigar):
+    r = list(ref)
+    q = list(query)
+    i = 0
+    num = ''
+
+    for c in cigar:
+        if c.isnumeric():
+            num += c
+            continue
+
+        elif c == 'M':
+            i += int(num)
+
+        elif c == 'D':
+            for _ in range(int(num)):
+                q.insert(i, '-')
+                i += 1
+
+        elif c == 'I':
+            for _ in range(int(num)):
+                r.insert(i, '-')
+                i += 1
+
+        num = ''
+
+    return ''.join(r), ''.join(q)
+
+
+def count_CG(ref, query):
+    CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
+
+    for i in range(len(ref) - 1):
+        if ref[i] == 'C' and ref[i + 1] == 'G':
+            if query[i] == 'C' and query[i + 1] == 'G':
+                CG_cnt['M'] += 1
+
+            elif query[i] == '-' and query[i + 1] == 'G':
+                CG_cnt['D'] += 1
+
+            elif query[i] in {'A', 'G', 'T'} and query[i + 1] == 'G':
+                CG_cnt['X'] += 1
+
+        elif ref[i] == '-' and ref[i + 1] == 'G':
+            if query[i] == 'C' and query[i + 1] == 'G':
+                CG_cnt['I'] += 1
+
+    return CG_cnt
+
+
+def analyse_CG(dir_in, file_out, file_fasta):
+    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
+    if not a:
+        raise Exception("ERROR: failed to load/build index")
+
+    reads = get_files(dir_in)
+    data = []
+
+    for read in tqdm(reads):
+        with h5py.File(read, 'r', libver='latest') as fd:
+            matches = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
+            CG_cnt = {'M': 0, 'X': 0, 'D': 0, 'I': 0}
+
+            fastq = read_fastq(fd)
+            ref = ''
+            mapq = 0
+
+            for hit in a.map(fastq.seq, cs=True):  # Traverse alignments
+                if hit.is_primary:  # Check if the alignment is primary
+                    # Alignment
+                    matches = count_matches(hit.cs)
+
+                    # Reference
+                    for seq_record in SeqIO.parse(file_fasta, 'fasta'):
+                        ref = seq_record.seq[hit.r_st: hit.r_en]
+
+                    # Query
+                    query = fastq.seq[hit.q_st: hit.q_en]
+                    if hit.strand == -1:
+                        query = mp.revcomp(query)
+
+                    # Normalize
+                    ref, query = normalize(ref, query, hit.cigar_str)
+
+                    # Analyse CG motif
+                    CG_cnt = count_CG(ref, query)
+
+                    mapq = hit.mapq
+                    break
+
+            data.append([fastq.id, len(ref), matches['M'], matches['X'], matches['D'], matches['I'],
+                         CG_cnt['M'], CG_cnt['X'], CG_cnt['D'], CG_cnt['I'], mapq])
+
+    data = pd.DataFrame(data, columns=['read_id', 'alignment_len', 'M', 'X', 'D', 'I',
+                                       'M_CG', 'X_CG', 'D_CG', 'I_CG', 'mapq'])
+    data.sort_values('read_id', inplace=True)
+    data.to_csv(file_out, index=False)
+
+
+if __name__ == '__main__':
+    modification = 'nomod'  # 'mod' or 'nomod'
+    dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
+    file_out = f'/home/sdeur/tmp-mappy/CG_{modification}.csv'
+    file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'
+
+    analyse_CG(dir_in, file_out, file_fasta)
diff --git a/analysis/scripts/plot_mapq.py b/analysis/scripts/plot_mapq.py
@@ -0,0 +1,50 @@
+from pathlib import Path
+import mappy as mp
+from tqdm import tqdm
+import h5py
+import matplotlib.pyplot as plt
+import numpy as np
+
+from util import get_files, read_fastq
+
+
+def plot_distribution(data, modification, bins=15):
+    """ Distribution of mapping quality """
+    plt.hist(data, bins=bins, color='green' if modification == 'nomod' else 'red')
+    plt.title(f'Distribution of mapping quality ({modification})')
+    plt.xlabel('mapping quality')
+    plt.ylabel('count')
+    plt.yticks(np.arange(0, 1001, 100))
+    plt.savefig(f'../plots/mapq_{modification}.png')
+
+
+def get_mapqs(dir_in, file_fasta):
+    a = mp.Aligner(file_fasta, preset='map-ont')  # Load or build index
+    if not a:
+        raise Exception("ERROR: failed to load/build index")
+
+    reads = get_files(dir_in)
+    mapqs = []
+
+    for read in tqdm(reads):
+        with h5py.File(read, 'r', libver='latest') as fd:
+            fastq = read_fastq(fd)
+            mapq = 0
+
+            for hit in a.map(fastq.seq):  # Traverse alignments
+                if hit.is_primary:  # Check if the alignment is primary
+                    mapq = hit.mapq
+                    break
+
+            mapqs.append(mapq)
+
+    return mapqs
+
+
+if __name__ == '__main__':
+    modification = 'nomod'  # 'mod' or 'nomod'
+    dir_in = Path(f'/home/sdeur/tmp-guppy/{modification}/workspace')
+    file_fasta = '/home/sdeur/data/ecoli_k12_mg1655.fasta'
+
+    mapqs = get_mapqs(dir_in, file_fasta)
+    plot_distribution(mapqs, modification)
diff --git a/analysis/scripts/plot_signal_length.py b/analysis/scripts/plot_signal_length.py
@@ -0,0 +1,39 @@
+import os
+from ont_fast5_api.fast5_interface import get_fast5_file
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def get_signal_length(fast5_filepath):
+    """
+    :param fast5_filepath: can be a single- or multi-read file
+    :return: raw signal length
+    """
+    with get_fast5_file(fast5_filepath, mode="r") as f5:
+        for read in f5.get_reads():
+            raw_data = read.get_raw_data()
+            # print(read.read_id, raw_data, len(raw_data))
+
+    return len(raw_data)
+
+
+def plot_signal_length(dir_in, bins=50):
+    """ Distribution of raw signal length """
+    data = []
+    for file in os.listdir(dir_in):
+        data.append(get_signal_length(os.path.join(dir_in, file)))
+
+    plt.hist(data, bins=bins, color='green' if modification == 'nomod' else 'red')
+    plt.title(f'Distribution of raw signal length ({modification})')
+    plt.xlabel('raw signal length')
+    plt.ylabel('count')
+    plt.xticks(np.arange(0, 600_001, 100_000))  # Change if needed
+    plt.yticks(np.arange(0, 301, 50))  # Change if needed
+    plt.savefig(f'../plots/signal_length_{modification}.png')
+
+
+if __name__ == '__main__':
+    modification = 'nomod'  # 'mod' or 'nomod'
+    dir_in = f'/home/sdeur/tmp-guppy/{modification}/workspace'
+
+    plot_signal_length(dir_in)