-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreateSpreadsheet.py
More file actions
executable file
·224 lines (189 loc) · 8.46 KB
/
createSpreadsheet.py
File metadata and controls
executable file
·224 lines (189 loc) · 8.46 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
#!/usr/bin/env python
## Creates spreadsheets combining input metadata and noteworthy results
import sys
import pandas as pd
from pathlib import Path
import json
reference_length = 29903  # length (bp) of the SARS-CoV-2 reference genome (Wuhan-Hu-1); used as the fill value for fully-missing samples
def getMetaDF(meta_file):
    """Read the sample metadata CSV and return it as a DataFrame.

    Renames several columns to their final report names, coerces 'Zip code'
    to a nullable integer, and parses any column whose name contains 'date'
    as a datetime so dates are consistent across inputs.

    Args:
        meta_file: path (or file-like) for the metadata CSV; must contain
            a 'Seq ID' column.

    Returns:
        pd.DataFrame with finalized column names.

    Raises:
        ValueError: if the required 'Seq ID' column is missing.
    """
    df = pd.read_csv(meta_file)
    if "Seq ID" not in df.columns:
        raise ValueError(f"Missing required field 'Seq ID' from metadata file '{meta_file}'")
    # finalize some column names
    newCols = {"Seq ID":"Sample #","Ct N1":"Ct N gene","Qubit reading":"post-ARTIC Qubit (ng/ul)","Zip Code":"Zip code","Sample date":"Test date"}
    df = df.rename(columns=newCols)
    cols = df.columns
    if "Zip code" in cols:
        # nullable Int64 keeps missing zips as <NA> instead of forcing floats
        df["Zip code"] = df["Zip code"].astype('Int64')
    # make dates dates (to ensure they're consistent)
    for col in cols:
        if "date" in col.lower():
            df[col] = pd.to_datetime(df[col])
    print(f"Metadata\t{len(df)}")
    return df
def get_ambig(x):
    """Extract the ambiguity fraction from a pangolin qc_notes value.

    Converts strings like 'Ambiguous_content:0.02' to the float 0.02.
    The sentinel 'failed to map' is treated as fully ambiguous (1.0).

    Exits the program (status 1) with a readable message on any value it
    cannot parse, rather than letting a traceback escape.
    """
    if x == 'failed to map':
        return 1.0
    try:
        # the numeric part follows the last ':' (e.g. 'Ambiguous_content:0.02')
        return float(x.split(":")[-1])
    except (ValueError, AttributeError):
        # AttributeError covers non-string values (e.g. NaN read by pandas),
        # which previously crashed with a raw traceback instead of this message
        print(f"'Ambiguity_content' column contains unaccounted for string '{x}'")
        exit(1)
def getPangoDF(pango_file):
    """Read the pangolin lineage CSV and return the report-relevant columns.

    Renames columns to their report names, derives 'Sample #' from the
    leading path component of `taxon`, and converts qc_notes values like
    'Ambiguous_content:0.02' to the bare float 0.02 via get_ambig().

    Args:
        pango_file: path to pangolin's lineage report CSV; must contain the
            columns listed in `pangolin_cols` plus 'qc_status'.

    Returns:
        pd.DataFrame keyed by 'Sample #'.
    """
    df = pd.read_csv(pango_file)
    # pangolin reports 'pass'; downstream reports expect the wording 'passed_qc'
    df["status"] = df["qc_status"].apply(lambda x: x.replace("pass","passed_qc"))
    pangolin_cols = ["taxon","lineage","scorpio_call","ambiguity_score","is_designated","status","qc_notes","note","pangolin_version"]
    # .copy() so the column assignments below act on a real frame, not a view
    # of the original df (avoids pandas SettingWithCopyWarning)
    df = df[pangolin_cols].copy()
    df["Sample #"] = df.taxon.apply(lambda x: x.split("/")[0])
    df = df.rename(columns={
        "scorpio_call":"Scorpio lineage",
        "lineage":"Pango lineage",
        "status":"Pangolin QC",
        "ambiguity_score":"Scorp Score",
        "note":"pango_note",
        "qc_notes":"Ambiguous_content"})
    df["Ambiguous_content"] = df["Ambiguous_content"].apply(lambda x: get_ambig(x)) # convert from "Ambiguous_content:0.02" --> 0.02
    print(f"Pangolin\t{len(df)}")
    return df
def getNextcladeDF(nextclade_file,nextclade_version):
    """Load the nextclade results file and keep only the columns of interest.

    Args:
        nextclade_file: semicolon-delimited nextclade output file.
        nextclade_version: version string stored in a 'nextclade_version' column.

    Returns:
        De-duplicated pd.DataFrame with report-ready column names and a
        'Sample #' column derived from the sequence name.
    """
    keep = ["seqName","clade","qc.overallScore","qc.overallStatus","totalMissing","aaSubstitutions","substitutions"]
    report_names = {
        "clade":"Nextstrain Clade",
        "qc.overallScore":"Nextstrain QC",
        "qc.overallStatus":"Nextstrain Status",
        "totalMissing":"Total Missing",
        "aaSubstitutions":"AA Substitutions",
        "substitutions":"Nucleotide Substitutions"}
    df = pd.read_csv(nextclade_file,sep=";")[keep]
    # sample id is the leading path component of the sequence name
    df["Sample #"] = df["seqName"].apply(lambda seq_name: seq_name.split("/")[0])
    df = df.rename(columns=report_names).drop_duplicates()
    df["nextclade_version"] = nextclade_version
    print(f"Nextclade\t{len(df)}")
    return df
def getReadsCountsDF(reads_file,counts,title):
    """Load a headerless two-column (sample, count) CSV of read counts.

    Args:
        reads_file: CSV of sample/count pairs with no header row.
        counts: name to give the count column.
        title: label used in the progress printout.

    Returns:
        pd.DataFrame with columns 'Sample #' and `counts`.
    """
    column_names = ["Sample #", counts]
    df = pd.read_csv(reads_file, header=None, names=column_names)
    print(f"{title}\t{len(df)}")
    return df
def checkDfs(dfs):
    """Verify each df is non-empty and carries the 'Sample #' merge key.

    Args:
        dfs: mapping of human-readable name -> DataFrame.

    Raises:
        ValueError: if any df is empty or lacks a 'Sample #' column.
    """
    for name, df in dfs.items():
        if df.empty:
            raise ValueError(f"No data found in {name} df.")
        if "Sample #" not in df.columns:
            raise ValueError(f"'Sample #' field not present in {name} df")
def writeSpreadsheet(df,outdir,plate,which_report="All"):
    """Write one report CSV for a single source lab (or the combined set).

    Skips writing (with a notice) when `df` has no rows.

    Args:
        df: rows to write out.
        outdir: pathlib.Path output directory.
        plate: plate identifier used in the filename.
        which_report: report label used in the filename (default "All").
    """
    # guard clause: nothing to write for this source lab
    if df.empty:
        print(f"{which_report}\n\tSkipping. No samples found in this batch for Source Lab: '{which_report}'")
        return
    print("outdir:",outdir)
    outfile = outdir / f"Sequencing-report-{plate}-{which_report}.csv"
    print(f"Writing out {which_report} report\n\tOutfile: {outfile}")
    df.to_csv(outfile,index=False)
def writeSequencingReport(combo:pd.DataFrame,outdir,plate,reportMap):
    """Write per-source-lab report CSVs plus the combined 'All' report.

    `reportMap` arrives from the CLI as a raw string and is dispatched on
    its literal value:
      * '{}' -> skip per-lab reports entirely
      * ''   -> one report for each unique 'Source Lab' value
      * anything else -> parsed as a dict mapping report name ->
        {"source_lab": <regex for 'Source Lab'>, "limit_to": <regex for 'Sample #'>}
    A 'Wastewater' report is added when a boolean 'From Wastewater' column
    exists, and the full combined report is always written last.
    """
    # output file to csvs (split by source)
    print(f"\nWriting out spreadsheet CSVs:")
    # this stuff vvv is mirrored in Prepare-Corvaseq.py -- ensure it stays updated together
    # reportMap = {"Campus":["Campus",None],"Mecklenburg":["Starmed|StarMed","MCPH"],"StarMed":["Starmed|StarMed",None],"Mission":["Mission",None]} # EDIT if more source labs added
    # if there are multiple source labs, write a report file for each
    if "Source Lab" in combo.columns and len(combo["Source Lab"].unique()) > 1:
        print("reportMap:",reportMap)
        if reportMap == '{}':
            # skip individual reports
            pass
        elif reportMap == "":
            # create reports for each source lab
            for source_lab in combo["Source Lab"].dropna().unique():
                print("source_lab:",source_lab)
                # str.contains: the lab value doubles as a regex (e.g. "Starmed|StarMed")
                df = combo[combo["Source Lab"].str.contains(source_lab, na=False)]
                writeSpreadsheet(df,outdir,plate,source_lab)
        else:
            # create reports for each group specified in reportMap
            # reportMap is a Python-style dict string; swap single quotes for
            # double quotes so json can parse it (breaks if values contain apostrophes)
            reportMap = json.loads(reportMap.replace("'",'"'))
            for which_report, data in reportMap.items():
                source_lab = data.get("source_lab",which_report)
                limit_to = data.get("limit_to",None)
                df = combo[combo["Source Lab"].str.contains(source_lab, na=False)]
                if limit_to:
                    # NOTE(review): this filters the full `combo`, not the
                    # source-lab-filtered `df` above -- confirm limit_to is
                    # meant to override (not narrow) the source-lab filter
                    df = combo[combo["Sample #"].str.contains(limit_to, na=False)]
                writeSpreadsheet(df,outdir,plate,which_report)
    if "From Wastewater" in combo.columns:
        ww = combo[combo["From Wastewater"]]
        if not ww.empty:
            writeSpreadsheet(ww,outdir,plate,"Wastewater")
    # write out cumulative report (All)
    # outfile = outdir / f"Sequencing-report-{plate}-All.csv"
    # print(f"Writing out combined report\n\tOutfile: {outfile}")
    # combo.to_csv(outfile, index=False)
    writeSpreadsheet(combo,outdir,plate,"All")
def setDefaults(combo):
    """Fill NA defaults for samples that never reached nextclade/pangolin.

    Missing 'Total Missing' becomes the full reference length (nothing was
    covered), missing 'Nextstrain Status' becomes 'bad', and missing
    'Pangolin QC' becomes 'fail'.

    Args:
        combo: merged results DataFrame (modified in place).

    Returns:
        The same DataFrame with defaults applied.
    """
    if "Total Missing" in combo.columns:
        print("Filling NA for Total Missing")
        # a sample with no fasta is missing the entire reference genome
        combo["Total Missing"] = combo["Total Missing"].fillna(reference_length).astype('Int64')
    # simple string defaults: column name -> fallback value
    string_defaults = {"Nextstrain Status": "bad", "Pangolin QC": "fail"}
    for col, fallback in string_defaults.items():
        if col in combo.columns:
            print(f"Filling NA for {col}")
            combo[col] = combo[col].fillna(fallback)
    return combo
def main():
    """CLI entry point: merge metadata, pangolin, nextclade, and read-count
    CSVs on 'Sample #' and write the sequencing-report spreadsheets.

    Positional argv (1-9): meta_file, pangolin_file, nextclade_file,
    nextclade_version, read_counts_file, raw_read_counts_file, plate,
    outdir, reportMap (a dict-like string -- see writeSequencingReport).
    """
    (meta_file, pangolin_file, nextclade_file, nextclade_version,
     read_counts_file, raw_read_counts_file) = (Path(arg) for arg in sys.argv[1:7])
    plate = sys.argv[7]
    outdir = Path(sys.argv[8])
    reportMap = sys.argv[9]
    # echo all inputs for the pipeline log
    print("Inputs:")
    for label, value in (
            ("meta_file", meta_file),
            ("pangolin_file", pangolin_file),
            ("nextclade_file", nextclade_file),
            ("nextclade_version", nextclade_version),
            ("read_counts_file", read_counts_file),
            ("raw_read_counts_file", raw_read_counts_file),
            ("plate", plate),
            ("outdir", outdir),
            ("reportMap", reportMap)):
        print(f"{label}:", value)
    print()
    # read in all csvs to dataframes
    print("Reading in CSVs\ndf\t\tlength")
    meta_df = getMetaDF(meta_file)
    pangolin_df = getPangoDF(pangolin_file)
    nextclade_df = getNextcladeDF(nextclade_file,nextclade_version)
    reads_df = getReadsCountsDF(read_counts_file,"Reads on Barcode","Reads filt")
    raw_reads_df = getReadsCountsDF(raw_read_counts_file,"read_counts_raw","Reads raw")
    # ensure all dfs have data before attempting the merge
    dfs = {"Metadata":meta_df,"Nextclade":nextclade_df,"Pangolin":pangolin_df,"read counts":reads_df,"raw read counts":raw_reads_df}
    checkDfs(dfs)
    for name, df in dfs.items():
        print(name)
        print(df.columns)
        print(df)
    # merge everything on the sample id; outer join keeps samples that
    # appear in any one of the inputs
    combo: pd.DataFrame = meta_df
    for other_df in (pangolin_df, nextclade_df, reads_df, raw_reads_df):
        combo = combo.merge(other_df, on="Sample #", how="outer")
    print(len(combo))
    # set defaults for samples that lacked fastas to go to nextclade/pangolin
    combo = setDefaults(combo)
    print(len(combo))
    print(combo[["Sample #","Pango lineage","Total Missing"]])
    # write out results
    writeSequencingReport(combo,outdir,plate,reportMap)
    print(f"\ndone\n")


if __name__ == "__main__":
    main()