-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathBCH
More file actions
195 lines (183 loc) · 6.53 KB
/
BCH
File metadata and controls
195 lines (183 loc) · 6.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env bash
# @nicolemariepaterson
# HA pipeline
# BUG FIX: shebang was "#!usr/bin/bash" (missing leading '/'), which fails
# when the script is executed directly.
#set -e
#set -u
set -o pipefail

# Run date; names the per-run output directory and tags table rows.
timestamp=$(date +%Y-%m-%d)
echo "$timestamp"

#define args
#"./BCH relax" runs the relax pipeline, must be run first
#"./BCH tables" generates the data tables
#"./BCH upload" sends table data to cdp and refreshes the relaxed_isolate_name table
#paths not expected to change frequently for input/output

# Root all input/output under the directory containing this script.
# (The original cleared out_p and then filled it when empty — net effect
# is identical: out_p is always the script directory.)
out_p=$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)
echo "$out_p"

date_output="$out_p/$timestamp"   # per-run tables / scorefiles
relaxed_dir="$out_p/relaxed"      # Rosetta-relaxed structures
unrelaxed_dir="$out_p/unrelaxed"  # raw (pre-relax) structures
home="$out_p"

# mkdir -p is a no-op on existing directories, replacing the three
# separate [ ! -d ] checks; paths quoted in case they contain spaces.
mkdir -p "$date_output" "$relaxed_dir" "$unrelaxed_dir"
# Sort structure files: copy relaxed outputs from $home into $relaxed_dir,
# copy every .pdb (including relaxed ones, via the catch-all glob) into
# $unrelaxed_dir, then delete the relaxed copies from $unrelaxed_dir.
# NOTE(review): the copy glob is "relaxed*.pdb" but the cleanup glob is
# "relaxed_*.pdb"; a file named e.g. relaxedX.pdb would be copied to
# $relaxed_dir yet remain in $unrelaxed_dir — confirm intended.
function move_files {
    for file in "$home"/relaxed*.pdb
    do
        [ -e "$file" ] || continue   # skip the literal pattern when no match
        cp "$file" "$relaxed_dir"
    done
    for file in "$home"/*.pdb
    do
        [ -e "$file" ] || continue
        cp "$file" "$unrelaxed_dir"
    done
    for file in "$unrelaxed_dir"/relaxed_*.pdb
    do
        [ -e "$file" ] || continue
        rm "$file"
    done
}
# Parse every raw pdb in $unrelaxed_dir into a per-structure TSV in
# $date_output, tagged with the structure name and run date.
function run_pdb_parse {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    for file in "$unrelaxed_dir"/*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        python pdb_parse_raw.py -i "$file" -o "$date_output/${name}_pdb.tsv" -v "$name" -t "$timestamp"
    done
}
# Quick FastRelax on every raw structure in $unrelaxed_dir; relaxed copies
# are written to $relaxed_dir with a "relaxed_quick_" prefix.
function run_relax {
    for file in "$unrelaxed_dir"/*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        #module load rosetta/3.13-cluster
        #mpirun -np 2 -nooversubscribe relax.default.linuxgccrelease -quick -ex1 -ex2 -gpu true -relax:minimize_bondlength_subset=3 -in:file:s $file --out:pdb -out:prefix relaxed_ -out:path:pdb $pdb_dir -relax:fast
        # NOTE(review): "--out:pdb" uses a double dash unlike every other
        # Rosetta flag on this line — confirm it parses as intended.
        mpirun -np 3 -nooversubscribe relax.cxx11threadmpiserialization.linuxgccrelease -overwrite -quick -ex1 -ex2 -gpu true -device 0 -multithreading true -total_threads 8 -relax:minimize_bondlength_subset=3 -in:file:s "$file" --out:pdb -out:prefix relaxed_quick_ -out:path:pdb "$relaxed_dir" -relax:fast
    done
}
#run Rosetta energy score for ddG, prep file for cdp upload
# NOTE(review): $ref_path is never assigned in this script; it must come
# from the environment or -in:file:native receives an empty value — verify.
function run_rosetta {
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        filename=$(basename "$file" .pdb)
        #module load rosetta/3.13-cluster
        timestamp=$(date +%Y-%m-%d)
        score_jd2.default.linuxgccrelease -simple_metrics true -renumber_pdb -in:ignore_unrecognized_res -in:file:s "$file" -in:file:native "$ref_path" -out:file:scorefile "$date_output/$filename.features.txt"
        #mpirun score_jd2.mpi.linuxgccrelease -simple_metrics true -multithreading true -total_threads 4 -renumber_pdb -in:ignore_unrecognized_res -in:file:s $file -in:file:native $ref_path -out:file:scorefile $gwa_output/$filename.features.txt
        # BUG FIX: the scorefile flag was missing its leading '-'
        # ("out:file:scorefile"), so score_aln2 saw it as a stray token
        # instead of an option.
        score_aln2.default.linuxgccrelease -multithreading true -total_threads 4 -in:file:s "$file" -out:file:scorefile "$date_output/$filename.aln2_scorefile.txt"
        residue_energy_breakdown.linuxgccrelease -mp:quickrelax:repack_again -in:file:s "$file" -out:file:silent "$date_output/$filename.res_breakdown.txt"
    done
}
# Convert each parsed-pdb TSV in $date_output to parquet for upload.
function pdb_parquet_gen {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    conda activate --stack appsquared
    for file in "$date_output"/*pdb.tsv
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        python3 tsv_to_parquet.py "$file"
    done
}
#computes the glycosylation distances for each site from pdb file
function run_glyc_dist {
    # Activate the glyc env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/glyc
    conda activate --stack glyc
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        python glyc.py -i "$file" -v "$name" -t "$timestamp" -o "$date_output/$name.ASA_glyc.csv"
        # Strip Rosetta naming artifacts in one pass; "relaxed_quick_" must
        # run before "relaxed_" or the shorter pattern leaves "quick_" behind.
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$date_output/$name.ASA_glyc.csv"
        python3 tsv_to_parquet.py "$date_output/$name.ASA_glyc.csv"
    done
}
# Compute static (intra-structure) contacts for every relaxed structure.
function run_get_contacts {
    # Activate the getcontacts env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/getcontacts
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        # BUG FIX: only the basename was passed to --structure, which fails
        # unless the script happens to run from $relaxed_dir; pass the full path.
        python get_static_contacts.py --structure "$file" --output "$date_output/$name.static_contacts.csv" --itypes all
    done
}
#cleans up the getcontacts file for upload to CDP. Needs to be updated to parquet and moved to clean_up_for_tables
function run_get_contacts_clean {
    # Activate the conda env once for both loops.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    for file in "$date_output"/*.static_contacts.csv
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .static_contacts.csv)
        timestamp=$(date +%Y-%m-%d)
        # Drop the two header lines GetContacts writes (was two separate
        # '1d' passes), then split colon-delimited atom ids into columns.
        sed -i '1,2d' "$file"
        sed -i 's/:/\t/g' "$file"
        python getcontacts_df.py -i "$file" -v "$name" -t "$timestamp"
    done
    for file in "$date_output"/*.getcontacts.csv
    do
        [ -e "$file" ] || continue
        # Strip Rosetta naming artifacts ("relaxed_quick_" before "relaxed_").
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        python3 tsv_to_parquet.py "$file"
    done
}
# Normalize Rosetta per-residue output and convert result tables to parquet.
function clean_up_for_tables {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    conda activate --stack appsquared
    for file in "$date_output"/*.res_breakdown.txt
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        # Strip the "SCORE: " row prefix and Rosetta naming artifacts in one
        # pass; "relaxed_quick_" must run before the shorter "relaxed_".
        sed -i -e "s/SCORE: //g" -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        name=$(basename "$file" .res_breakdown.txt)
        python dataframe_per_res.py -i "$file" -v "$name" -t "$timestamp"
    done
    for file in "$date_output"/*.per_res_scorefile.tsv
    do
        [ -e "$file" ] || continue
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        python3 tsv_to_parquet.py "$file"
    done
}
if [ "$1" == "relax" ]; then
#relax required for most pipeline applications
move_files
echo $timestamp "OPTIMIZING STRUCTURES"
run_relax
fi
if [ "$1" == "tables" ]; then
move_files
echo $timestamp "GENERATING DATA FOR CDP TABLES"
#Runs the pipeline in order
echo $timestamp "Parsing raw pdb"
#run_pdb_parse
#pdb_parquet_gen
echo $timestamp "Running Rosetta"
run_rosetta
echo $timestamp "Running Glycosylation Distance Calculator"
run_glyc_dist
echo $timestamp "Running GetContacts: Intermolecular Interactions"run_get_contacts
run_get_contacts_clean
echo $timestamp "Cleaning up the tables for database entry"
clean_up_for_tables
fi