-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathExtractSraMetadata.py
More file actions
89 lines (61 loc) · 3 KB
/
ExtractSraMetadata.py
File metadata and controls
89 lines (61 loc) · 3 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
import logging
import pandas as pd
from xml.dom import minidom
# NCBI-style taxon id (as a string, because it is compared against XML
# attribute text) of the taxon whose counts are extracted.
taxon_id = "1211417"  # crAssphage

# Directory holding one sub-folder per sample; the location depends on
# whether the script runs on the Windows workstation or on the Linux host.
if os.name == "nt":
    sample_dir = r"D:\17 Dutihl Lab\_tools\diversitools"
else:
    sample_dir = "/hosts/linuxhome/mgx/DB/MGXDB/MGXDB"
class ExtractSraMetadata:
    """Collect per-sample counts for one taxon from ``<sample>_metadata`` XML files.

    Every sub-folder of ``sample_dir`` is treated as one sample. If the folder
    contains a non-empty file named ``<folder name>_metadata`` that parses as
    XML and holds exactly one ``<taxon>`` element whose ``tax_id`` attribute
    equals the requested taxon id, one row is written to a tab-separated
    table located next to (one level above) ``sample_dir``.
    """

    # Class-level defaults; each is overwritten per instance in __init__.
    sample_table_name = ""
    dir_sep = ""
    taxon_id = ""

    # Column order of the table written by read_metadata().
    _COLUMNS = ['sample', 'primary_id', 'total_count', 'taxon_name', 'study_title']
    # Placeholder used when an expected XML element is absent or empty.
    _UNKNOWN = "UNKNOWN"

    def __init__(self, sample_dir, taxon_id):
        """Configure logging and derive the output table path.

        Args:
            sample_dir: Directory containing one sub-folder per sample.
            taxon_id: Taxon id to look for; converted to ``str`` because it
                is compared against XML attribute text.

        Side effects:
            Calls ``logging.basicConfig`` so that log records go to
            ``ExtractSraMetadata.log`` one level above ``sample_dir``
            (opened with ``filemode='w'``).
        """
        # Kept as a public attribute for backward compatibility; paths are
        # built with os.path.join below.
        self.dir_sep = os.sep
        parent_dir = os.path.join(sample_dir, "..")
        logging.basicConfig(
            filename=os.path.join(parent_dir, "ExtractSraMetadata.log"),
            filemode='w',
            format='%(asctime)s - %(message)s',
            level=logging.DEBUG,
        )
        self.sample_dir = sample_dir
        self.taxon_id = str(taxon_id)
        self.sample_table_name = os.path.join(
            parent_dir, "taxon_" + self.taxon_id + "_counts.txt"
        )

    @staticmethod
    def _node_text(element, default):
        """Return the value of ``element``'s first child node, or ``default``.

        ``default`` is returned when the element is empty (no child node) or
        when its first child carries no node value.
        """
        child = element.firstChild
        if child is None or child.nodeValue is None:
            return default
        return child.nodeValue

    def _extract_row(self, subfolder, taxon_id):
        """Build the table row for one sample folder, or return ``None``.

        ``None`` is returned when the metadata file is missing or empty,
        cannot be parsed, or does not contain exactly one matching taxon.
        """
        sample = os.path.basename(subfolder)
        metafile_name = os.path.join(subfolder, sample + "_metadata")
        if not (os.path.isfile(metafile_name) and os.path.getsize(metafile_name) > 0):
            return None

        logging.debug("processing {}".format(metafile_name))
        try:
            meta = minidom.parse(metafile_name)
        except Exception:
            # Best effort per sample: record the failure (with traceback) and
            # carry on with the remaining samples. Unlike a bare ``except``,
            # this lets KeyboardInterrupt / SystemExit propagate.
            logging.exception("Error in parsing meta file for sample {}".format(sample))
            return None

        # hasAttribute guards against <taxon> elements without a tax_id,
        # which must simply not match instead of aborting the run.
        matches = [
            item
            for item in meta.getElementsByTagName('taxon')
            if item.hasAttribute("tax_id") and item.getAttribute("tax_id") == taxon_id
        ]
        if len(matches) != 1:
            return None
        taxon = matches[0]

        # The first PRIMARY_ID in the document is used when there are several.
        primary_ids = meta.getElementsByTagName('PRIMARY_ID')
        if primary_ids:
            primary_id = self._node_text(primary_ids[0], self._UNKNOWN)
        else:
            primary_id = self._UNKNOWN

        # The study title is only taken when it is unambiguous (exactly one).
        study_titles = meta.getElementsByTagName('STUDY_TITLE')
        if len(study_titles) == 1:
            study_title = self._node_text(study_titles[0], self._UNKNOWN)
        else:
            study_title = self._UNKNOWN

        # getAttribute yields "" for a missing attribute, so an incomplete
        # <taxon> element produces an empty field rather than a crash.
        return [
            sample,
            primary_id,
            taxon.getAttribute("total_count"),
            taxon.getAttribute("name"),
            study_title,
        ]

    def read_metadata(self, taxon_id=None):
        """Scan all sample folders and write the counts table.

        Args:
            taxon_id: Taxon id to match against the ``tax_id`` attribute of
                ``<taxon>`` elements. Defaults to the id given to the
                constructor. Note that the output file name is always
                derived from the constructor's id.

        Side effects:
            Writes a tab-separated file with a header row to
            ``self.sample_table_name``; an empty scan still produces a
            header-only file.
        """
        wanted = str(self.taxon_id if taxon_id is None else taxon_id)
        subfolders = [entry.path for entry in os.scandir(self.sample_dir) if entry.is_dir()]

        rows = []
        for subfolder in subfolders:
            row = self._extract_row(subfolder, wanted)
            if row is not None:
                rows.append(row)

        df = pd.DataFrame(columns=self._COLUMNS, data=rows)
        # NOTE: rows are written in directory-scan order. total_count is kept
        # as the raw attribute string, so sorting on that column as-is would
        # compare text, not numbers.
        df.to_csv(path_or_buf=self.sample_table_name, sep='\t', index=False, header=True)
# Script entry: build the extractor for the configured sample directory and
# taxon, then scan the sample folders and write the counts table.
extract = ExtractSraMetadata(sample_dir=sample_dir, taxon_id=taxon_id)
extract.read_metadata(taxon_id=taxon_id)