-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathPreprocessing.py
More file actions
85 lines (65 loc) · 2.86 KB
/
Copy pathPreprocessing.py
File metadata and controls
85 lines (65 loc) · 2.86 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
###### Core Cluster Finding
###### Such clusters which have sequences of all the strains of a specific pathogen.
###### Got cores_with_orthologs.fasta and cores_with_orthologs.tsv files for each cluster
###### Here we have deleted the duplicates and retain the orthologs of each cluster.
###### Each cluster has equal number of sequences which came from all the strains of a specific pathogen.
###### cores_with_orthologs.tsv file includes Cluster_ids, Sequence_ids, Representatives, Identity, Strain_ids etc.
###### cores_with_orthologs.fasta file includes all the sequences which are present in the core clusters of that pathogen.
import os
import sys
import pyfaidx
import pandas as pd
from tqdm import tqdm
from operator import itemgetter
def get_cores(clstr, count):
# Extracting strain count for given species
# count = pd.read_csv(count, sep='\t')
# count = count.shape[0]
# Change this to shape[0] for species
count = int(count)
clstr = pd.read_csv(clstr, sep='\t')
all_tbls = []
# Removing cluster_IDs where there are less than `count` members
clstr = clstr.groupby('Cluster_ID').filter(lambda x: len(x) >= count)
# Check which of the following clusters contain at least `count` members
for clust in tqdm(clstr.Cluster_ID.unique()):
tbl_clust = clstr[clstr.Cluster_ID==clust]
if len(tbl_clust.Strain_ID.unique()) >= count:
idx = tbl_clust.groupby('Strain_ID')['Identity'].idxmax()
result = tbl_clust.loc[idx]
all_tbls.append(result)
df = pd.concat(all_tbls)
# Extract core fasta
return df
def get_core_fasta(cores, fasta, out):
core_sequences=list(cores.Sequence_ID.unique())
core_sequences=[c[1:] for c in core_sequences]
print(f"Core sequences: {len(core_sequences)}")
fasta_count=0
data={}
records = pyfaidx.Fasta(fasta)
for record in tqdm(records, ncols=100):
data[record.name] = record
# Use itemgetter to extract values based on keys
value_extractor = itemgetter(*core_sequences)
subset_values = value_extractor(data)
# Create a new dictionary using zip
subset_dict = dict(zip(core_sequences, subset_values))
with open(out, 'w') as cf:
for k, v in tqdm(subset_dict.items(), ncols=100):
fasta_count+=1
cf.write(">"+v.long_name+"\n")
cf.write(str(v)+"\n")
print(f"Fasta count: {fasta_count}")
if __name__=="__main__":
# TSV of strain_ID tagged clusters
clstr_df= 'D:VRITIKA_New/DATA/Acinetobacter/clstr_df_strain.tsv'
# Fasta file extracted from CD-HIT
clstr_fasta = 'D:VRITIKA_New/DATA/Acinetobacter/Acinetobacter_all_strains.fasta.fai'
# Metadata of strain count
strain_count= 228
cores = get_cores(clstr_df, strain_count)
cores.to_csv(os.path.join(os.path.dirname(clstr_df), "cores_with_orthologs.tsv"), sep='\t', index=False)
print(f'Saved to {os.path.join(os.path.dirname(clstr_df), "cores_with_orthologs.tsv")}')
out = os.path.join(os.path.dirname(clstr_df), "cores_with_orthologs.fasta")
get_core_fasta(cores, clstr_fasta, out)