-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateSpreadsheet.py
More file actions
executable file
·224 lines (189 loc) · 8.46 KB
/
createSpreadsheet.py
File metadata and controls
executable file
·224 lines (189 loc) · 8.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python
## Creates spreadsheets combining input metadata and noteworthy results
import sys
import pandas as pd
from pathlib import Path
import json
reference_length = 29903  # length (bp) of the SARS-CoV-2 reference genome (Wuhan-Hu-1); used as the fill value for fully-missing samples
def getMetaDF(meta_file):
    """Read the sample metadata CSV and return it as a DataFrame.

    Renames several columns to their final report names, coerces 'Zip code'
    to a nullable integer, and parses any column whose name contains 'date'
    as a datetime so dates are consistent across inputs.

    Args:
        meta_file: path (or file-like) for the metadata CSV; must contain
            a 'Seq ID' column.

    Returns:
        pd.DataFrame with finalized column names.

    Raises:
        ValueError: if the required 'Seq ID' column is missing.
    """
    df = pd.read_csv(meta_file)
    if "Seq ID" not in df.columns:
        raise ValueError(f"Missing required field 'Seq ID' from metadata file '{meta_file}'")
    # finalize some column names
    newCols = {"Seq ID":"Sample #","Ct N1":"Ct N gene","Qubit reading":"post-ARTIC Qubit (ng/ul)","Zip Code":"Zip code","Sample date":"Test date"}
    df = df.rename(columns=newCols)
    cols = df.columns
    if "Zip code" in cols:
        # nullable Int64 keeps missing zips as <NA> instead of forcing floats
        df["Zip code"] = df["Zip code"].astype('Int64')
    # make dates dates (to ensure they're consistent)
    for col in cols:
        if "date" in col.lower():
            df[col] = pd.to_datetime(df[col])
    print(f"Metadata\t{len(df)}")
    return df
def get_ambig(x):
    """Extract the ambiguity fraction from a pangolin qc_notes value.

    Converts strings like 'Ambiguous_content:0.02' to the float 0.02.
    The sentinel 'failed to map' is treated as fully ambiguous (1.0).

    Exits the program (status 1) with a readable message on any value it
    cannot parse, rather than letting a traceback escape.
    """
    if x == 'failed to map':
        return 1.0
    try:
        # the numeric part follows the last ':' (e.g. 'Ambiguous_content:0.02')
        return float(x.split(":")[-1])
    except (ValueError, AttributeError):
        # AttributeError covers non-string values (e.g. NaN read by pandas),
        # which previously crashed with a raw traceback instead of this message
        print(f"'Ambiguity_content' column contains unaccounted for string '{x}'")
        exit(1)
def getPangoDF(pango_file):
    """Read the pangolin lineage CSV and return the report-relevant columns.

    Renames columns to their report names, derives 'Sample #' from the
    leading path component of `taxon`, and converts qc_notes values like
    'Ambiguous_content:0.02' to the bare float 0.02 via get_ambig().

    Args:
        pango_file: path to pangolin's lineage report CSV; must contain the
            columns listed in `pangolin_cols` plus 'qc_status'.

    Returns:
        pd.DataFrame keyed by 'Sample #'.
    """
    df = pd.read_csv(pango_file)
    # pangolin reports 'pass'; downstream reports expect the wording 'passed_qc'
    df["status"] = df["qc_status"].apply(lambda x: x.replace("pass","passed_qc"))
    pangolin_cols = ["taxon","lineage","scorpio_call","ambiguity_score","is_designated","status","qc_notes","note","pangolin_version"]
    # .copy() so the column assignments below act on a real frame, not a view
    # of the original df (avoids pandas SettingWithCopyWarning)
    df = df[pangolin_cols].copy()
    df["Sample #"] = df.taxon.apply(lambda x: x.split("/")[0])
    df = df.rename(columns={
        "scorpio_call":"Scorpio lineage",
        "lineage":"Pango lineage",
        "status":"Pangolin QC",
        "ambiguity_score":"Scorp Score",
        "note":"pango_note",
        "qc_notes":"Ambiguous_content"})
    df["Ambiguous_content"] = df["Ambiguous_content"].apply(lambda x: get_ambig(x)) # convert from "Ambiguous_content:0.02" --> 0.02
    print(f"Pangolin\t{len(df)}")
    return df
def getNextcladeDF(nextclade_file,nextclade_version):
    """Load the nextclade results file and keep only the columns of interest.

    Args:
        nextclade_file: semicolon-delimited nextclade output file.
        nextclade_version: version string stored in a 'nextclade_version' column.

    Returns:
        De-duplicated pd.DataFrame with report-ready column names and a
        'Sample #' column derived from the sequence name.
    """
    keep = ["seqName","clade","qc.overallScore","qc.overallStatus","totalMissing","aaSubstitutions","substitutions"]
    report_names = {
        "clade":"Nextstrain Clade",
        "qc.overallScore":"Nextstrain QC",
        "qc.overallStatus":"Nextstrain Status",
        "totalMissing":"Total Missing",
        "aaSubstitutions":"AA Substitutions",
        "substitutions":"Nucleotide Substitutions"}
    df = pd.read_csv(nextclade_file,sep=";")[keep]
    # sample id is the leading path component of the sequence name
    df["Sample #"] = df["seqName"].apply(lambda seq_name: seq_name.split("/")[0])
    df = df.rename(columns=report_names).drop_duplicates()
    df["nextclade_version"] = nextclade_version
    print(f"Nextclade\t{len(df)}")
    return df
def getReadsCountsDF(reads_file,counts,title):
    """Load a headerless two-column (sample, count) CSV of read counts.

    Args:
        reads_file: CSV of sample/count pairs with no header row.
        counts: name to give the count column.
        title: label used in the progress printout.

    Returns:
        pd.DataFrame with columns 'Sample #' and `counts`.
    """
    column_names = ["Sample #", counts]
    df = pd.read_csv(reads_file, header=None, names=column_names)
    print(f"{title}\t{len(df)}")
    return df
def checkDfs(dfs):
    """Verify each df is non-empty and carries the 'Sample #' merge key.

    Args:
        dfs: mapping of human-readable name -> DataFrame.

    Raises:
        ValueError: if any df is empty or lacks a 'Sample #' column.
    """
    for name, df in dfs.items():
        if df.empty:
            raise ValueError(f"No data found in {name} df.")
        if "Sample #" not in df.columns:
            raise ValueError(f"'Sample #' field not present in {name} df")
def writeSpreadsheet(df,outdir,plate,which_report="All"):
    """Write one report CSV for a single source lab (or the combined set).

    Skips writing (with a notice) when `df` has no rows.

    Args:
        df: rows to write out.
        outdir: pathlib.Path output directory.
        plate: plate identifier used in the filename.
        which_report: report label used in the filename (default "All").
    """
    # guard clause: nothing to write for this source lab
    if df.empty:
        print(f"{which_report}\n\tSkipping. No samples found in this batch for Source Lab: '{which_report}'")
        return
    print("outdir:",outdir)
    outfile = outdir / f"Sequencing-report-{plate}-{which_report}.csv"
    print(f"Writing out {which_report} report\n\tOutfile: {outfile}")
    df.to_csv(outfile,index=False)
def writeSequencingReport(combo:pd.DataFrame,outdir,plate,reportMap):
    """Write per-source-lab report CSVs plus the combined 'All' report.

    `reportMap` arrives from the CLI as a raw string and is dispatched on
    its literal value:
      * '{}' -> skip per-lab reports entirely
      * ''   -> one report for each unique 'Source Lab' value
      * anything else -> parsed as a dict mapping report name ->
        {"source_lab": <regex for 'Source Lab'>, "limit_to": <regex for 'Sample #'>}
    A 'Wastewater' report is added when a boolean 'From Wastewater' column
    exists, and the full combined report is always written last.
    """
    # output file to csvs (split by source)
    print(f"\nWriting out spreadsheet CSVs:")
    # this stuff vvv is mirrored in Prepare-Corvaseq.py -- ensure it stays updated together
    # reportMap = {"Campus":["Campus",None],"Mecklenburg":["Starmed|StarMed","MCPH"],"StarMed":["Starmed|StarMed",None],"Mission":["Mission",None]} # EDIT if more source labs added
    # if there are multiple source labs, write a report file for each
    if "Source Lab" in combo.columns and len(combo["Source Lab"].unique()) > 1:
        print("reportMap:",reportMap)
        if reportMap == '{}':
            # skip individual reports
            pass
        elif reportMap == "":
            # create reports for each source lab
            for source_lab in combo["Source Lab"].dropna().unique():
                print("source_lab:",source_lab)
                # str.contains: the lab value doubles as a regex (e.g. "Starmed|StarMed")
                df = combo[combo["Source Lab"].str.contains(source_lab, na=False)]
                writeSpreadsheet(df,outdir,plate,source_lab)
        else:
            # create reports for each group specified in reportMap
            # reportMap is a Python-style dict string; swap single quotes for
            # double quotes so json can parse it (breaks if values contain apostrophes)
            reportMap = json.loads(reportMap.replace("'",'"'))
            for which_report, data in reportMap.items():
                source_lab = data.get("source_lab",which_report)
                limit_to = data.get("limit_to",None)
                df = combo[combo["Source Lab"].str.contains(source_lab, na=False)]
                if limit_to:
                    # NOTE(review): this filters the full `combo`, not the
                    # source-lab-filtered `df` above -- confirm limit_to is
                    # meant to override (not narrow) the source-lab filter
                    df = combo[combo["Sample #"].str.contains(limit_to, na=False)]
                writeSpreadsheet(df,outdir,plate,which_report)
    if "From Wastewater" in combo.columns:
        ww = combo[combo["From Wastewater"]]
        if not ww.empty:
            writeSpreadsheet(ww,outdir,plate,"Wastewater")
    # write out cumulative report (All)
    # outfile = outdir / f"Sequencing-report-{plate}-All.csv"
    # print(f"Writing out combined report\n\tOutfile: {outfile}")
    # combo.to_csv(outfile, index=False)
    writeSpreadsheet(combo,outdir,plate,"All")
def setDefaults(combo):
    """Fill NA defaults for samples that never reached nextclade/pangolin.

    Missing 'Total Missing' becomes the full reference length (nothing was
    covered), missing 'Nextstrain Status' becomes 'bad', and missing
    'Pangolin QC' becomes 'fail'.

    Args:
        combo: merged results DataFrame (modified in place).

    Returns:
        The same DataFrame with defaults applied.
    """
    if "Total Missing" in combo.columns:
        print("Filling NA for Total Missing")
        # a sample with no fasta is missing the entire reference genome
        combo["Total Missing"] = combo["Total Missing"].fillna(reference_length).astype('Int64')
    # simple string defaults: column name -> fallback value
    string_defaults = {"Nextstrain Status": "bad", "Pangolin QC": "fail"}
    for col, fallback in string_defaults.items():
        if col in combo.columns:
            print(f"Filling NA for {col}")
            combo[col] = combo[col].fillna(fallback)
    return combo
def main():
    """CLI entry point: merge metadata, pangolin, nextclade, and read-count
    CSVs on 'Sample #' and write the sequencing-report spreadsheets.

    Positional argv (1-9): meta_file, pangolin_file, nextclade_file,
    nextclade_version, read_counts_file, raw_read_counts_file, plate,
    outdir, reportMap (a dict-like string -- see writeSequencingReport).
    """
    (meta_file, pangolin_file, nextclade_file, nextclade_version,
     read_counts_file, raw_read_counts_file) = (Path(arg) for arg in sys.argv[1:7])
    plate = sys.argv[7]
    outdir = Path(sys.argv[8])
    reportMap = sys.argv[9]
    # echo all inputs for the pipeline log
    print("Inputs:")
    for label, value in (
            ("meta_file", meta_file),
            ("pangolin_file", pangolin_file),
            ("nextclade_file", nextclade_file),
            ("nextclade_version", nextclade_version),
            ("read_counts_file", read_counts_file),
            ("raw_read_counts_file", raw_read_counts_file),
            ("plate", plate),
            ("outdir", outdir),
            ("reportMap", reportMap)):
        print(f"{label}:", value)
    print()
    # read in all csvs to dataframes
    print("Reading in CSVs\ndf\t\tlength")
    meta_df = getMetaDF(meta_file)
    pangolin_df = getPangoDF(pangolin_file)
    nextclade_df = getNextcladeDF(nextclade_file,nextclade_version)
    reads_df = getReadsCountsDF(read_counts_file,"Reads on Barcode","Reads filt")
    raw_reads_df = getReadsCountsDF(raw_read_counts_file,"read_counts_raw","Reads raw")
    # ensure all dfs have data before attempting the merge
    dfs = {"Metadata":meta_df,"Nextclade":nextclade_df,"Pangolin":pangolin_df,"read counts":reads_df,"raw read counts":raw_reads_df}
    checkDfs(dfs)
    for name, df in dfs.items():
        print(name)
        print(df.columns)
        print(df)
    # merge everything on the sample id; outer join keeps samples that
    # appear in any one of the inputs
    combo: pd.DataFrame = meta_df
    for other_df in (pangolin_df, nextclade_df, reads_df, raw_reads_df):
        combo = combo.merge(other_df, on="Sample #", how="outer")
    print(len(combo))
    # set defaults for samples that lacked fastas to go to nextclade/pangolin
    combo = setDefaults(combo)
    print(len(combo))
    print(combo[["Sample #","Pango lineage","Total Missing"]])
    # write out results
    writeSequencingReport(combo,outdir,plate,reportMap)
    print(f"\ndone\n")


if __name__ == "__main__":
    main()