-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathBCH
More file actions
195 lines (183 loc) · 6.53 KB
/
BCH
File metadata and controls
195 lines (183 loc) · 6.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
#!/usr/bin/env bash
# @nicolemariepaterson
# HA pipeline
# BUG FIX: shebang was "#!usr/bin/bash" (missing leading '/'), which fails
# when the script is executed directly.
#set -e
#set -u
set -o pipefail

# Run date; names the per-run output directory and tags table rows.
timestamp=$(date +%Y-%m-%d)
echo "$timestamp"

#define args
#"./BCH relax" runs the relax pipeline, must be run first
#"./BCH tables" generates the data tables
#"./BCH upload" sends table data to cdp and refreshes the relaxed_isolate_name table
#paths not expected to change frequently for input/output

# Root all input/output under the directory containing this script.
# (The original cleared out_p and then filled it when empty — net effect
# is identical: out_p is always the script directory.)
out_p=$(cd -P "$(dirname "${BASH_SOURCE[0]}")" && pwd)
echo "$out_p"

date_output="$out_p/$timestamp"   # per-run tables / scorefiles
relaxed_dir="$out_p/relaxed"      # Rosetta-relaxed structures
unrelaxed_dir="$out_p/unrelaxed"  # raw (pre-relax) structures
home="$out_p"

# mkdir -p is a no-op on existing directories, replacing the three
# separate [ ! -d ] checks; paths quoted in case they contain spaces.
mkdir -p "$date_output" "$relaxed_dir" "$unrelaxed_dir"
# Sort structure files: copy relaxed outputs from $home into $relaxed_dir,
# copy every .pdb (including relaxed ones, via the catch-all glob) into
# $unrelaxed_dir, then delete the relaxed copies from $unrelaxed_dir.
# NOTE(review): the copy glob is "relaxed*.pdb" but the cleanup glob is
# "relaxed_*.pdb"; a file named e.g. relaxedX.pdb would be copied to
# $relaxed_dir yet remain in $unrelaxed_dir — confirm intended.
function move_files {
    for file in "$home"/relaxed*.pdb
    do
        [ -e "$file" ] || continue   # skip the literal pattern when no match
        cp "$file" "$relaxed_dir"
    done
    for file in "$home"/*.pdb
    do
        [ -e "$file" ] || continue
        cp "$file" "$unrelaxed_dir"
    done
    for file in "$unrelaxed_dir"/relaxed_*.pdb
    do
        [ -e "$file" ] || continue
        rm "$file"
    done
}
# Parse every raw pdb in $unrelaxed_dir into a per-structure TSV in
# $date_output, tagged with the structure name and run date.
function run_pdb_parse {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    for file in "$unrelaxed_dir"/*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        python pdb_parse_raw.py -i "$file" -o "$date_output/${name}_pdb.tsv" -v "$name" -t "$timestamp"
    done
}
# Quick FastRelax on every raw structure in $unrelaxed_dir; relaxed copies
# are written to $relaxed_dir with a "relaxed_quick_" prefix.
function run_relax {
    for file in "$unrelaxed_dir"/*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        #module load rosetta/3.13-cluster
        #mpirun -np 2 -nooversubscribe relax.default.linuxgccrelease -quick -ex1 -ex2 -gpu true -relax:minimize_bondlength_subset=3 -in:file:s $file --out:pdb -out:prefix relaxed_ -out:path:pdb $pdb_dir -relax:fast
        # NOTE(review): "--out:pdb" uses a double dash unlike every other
        # Rosetta flag on this line — confirm it parses as intended.
        mpirun -np 3 -nooversubscribe relax.cxx11threadmpiserialization.linuxgccrelease -overwrite -quick -ex1 -ex2 -gpu true -device 0 -multithreading true -total_threads 8 -relax:minimize_bondlength_subset=3 -in:file:s "$file" --out:pdb -out:prefix relaxed_quick_ -out:path:pdb "$relaxed_dir" -relax:fast
    done
}
#run Rosetta energy score for ddG, prep file for cdp upload
# NOTE(review): $ref_path is never assigned in this script; it must come
# from the environment or -in:file:native receives an empty value — verify.
function run_rosetta {
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        filename=$(basename "$file" .pdb)
        #module load rosetta/3.13-cluster
        timestamp=$(date +%Y-%m-%d)
        score_jd2.default.linuxgccrelease -simple_metrics true -renumber_pdb -in:ignore_unrecognized_res -in:file:s "$file" -in:file:native "$ref_path" -out:file:scorefile "$date_output/$filename.features.txt"
        #mpirun score_jd2.mpi.linuxgccrelease -simple_metrics true -multithreading true -total_threads 4 -renumber_pdb -in:ignore_unrecognized_res -in:file:s $file -in:file:native $ref_path -out:file:scorefile $gwa_output/$filename.features.txt
        # BUG FIX: the scorefile flag was missing its leading '-'
        # ("out:file:scorefile"), so score_aln2 saw it as a stray token
        # instead of an option.
        score_aln2.default.linuxgccrelease -multithreading true -total_threads 4 -in:file:s "$file" -out:file:scorefile "$date_output/$filename.aln2_scorefile.txt"
        residue_energy_breakdown.linuxgccrelease -mp:quickrelax:repack_again -in:file:s "$file" -out:file:silent "$date_output/$filename.res_breakdown.txt"
    done
}
# Convert each parsed-pdb TSV in $date_output to parquet for upload.
function pdb_parquet_gen {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    conda activate --stack appsquared
    for file in "$date_output"/*pdb.tsv
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        python3 tsv_to_parquet.py "$file"
    done
}
#computes the glycosylation distances for each site from pdb file
function run_glyc_dist {
    # Activate the glyc env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/glyc
    conda activate --stack glyc
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        python glyc.py -i "$file" -v "$name" -t "$timestamp" -o "$date_output/$name.ASA_glyc.csv"
        # Strip Rosetta naming artifacts in one pass; "relaxed_quick_" must
        # run before "relaxed_" or the shorter pattern leaves "quick_" behind.
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$date_output/$name.ASA_glyc.csv"
        python3 tsv_to_parquet.py "$date_output/$name.ASA_glyc.csv"
    done
}
# Compute static (intra-structure) contacts for every relaxed structure.
function run_get_contacts {
    # Activate the getcontacts env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/getcontacts
    for file in "$relaxed_dir"/relaxed_*.pdb
    #for file in $home/relaxed_*.pdb
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .pdb)
        # BUG FIX: only the basename was passed to --structure, which fails
        # unless the script happens to run from $relaxed_dir; pass the full path.
        python get_static_contacts.py --structure "$file" --output "$date_output/$name.static_contacts.csv" --itypes all
    done
}
#cleans up the getcontacts file for upload to CDP. Needs to be updated to parquet and moved to clean_up_for_tables
function run_get_contacts_clean {
    # Activate the conda env once for both loops.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    for file in "$date_output"/*.static_contacts.csv
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        name=$(basename "$file" .static_contacts.csv)
        timestamp=$(date +%Y-%m-%d)
        # Drop the two header lines GetContacts writes (was two separate
        # '1d' passes), then split colon-delimited atom ids into columns.
        sed -i '1,2d' "$file"
        sed -i 's/:/\t/g' "$file"
        python getcontacts_df.py -i "$file" -v "$name" -t "$timestamp"
    done
    for file in "$date_output"/*.getcontacts.csv
    do
        [ -e "$file" ] || continue
        # Strip Rosetta naming artifacts ("relaxed_quick_" before "relaxed_").
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        python3 tsv_to_parquet.py "$file"
    done
}
# Normalize Rosetta per-residue output and convert result tables to parquet.
function clean_up_for_tables {
    # Activate the conda env once, not on every loop iteration.
    source ~/miniconda3/etc/profile.d/conda.sh
    source ~/miniconda3/bin/activate --stack ~/.conda/envs/appsquared
    conda activate --stack appsquared
    for file in "$date_output"/*.res_breakdown.txt
    do
        [ -e "$file" ] || continue   # skip literal pattern when no match
        # Strip the "SCORE: " row prefix and Rosetta naming artifacts in one
        # pass; "relaxed_quick_" must run before the shorter "relaxed_".
        sed -i -e "s/SCORE: //g" -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        name=$(basename "$file" .res_breakdown.txt)
        python dataframe_per_res.py -i "$file" -v "$name" -t "$timestamp"
    done
    for file in "$date_output"/*.per_res_scorefile.tsv
    do
        [ -e "$file" ] || continue
        sed -i -e "s/relaxed_quick_//g" -e "s/relaxed_//g" -e "s/_0001//g" "$file"
        python3 tsv_to_parquet.py "$file"
    done
}
if [ "$1" == "relax" ]; then
#relax required for most pipeline applications
move_files
echo $timestamp "OPTIMIZING STRUCTURES"
run_relax
fi
if [ "$1" == "tables" ]; then
move_files
echo $timestamp "GENERATING DATA FOR CDP TABLES"
#Runs the pipeline in order
echo $timestamp "Parsing raw pdb"
#run_pdb_parse
#pdb_parquet_gen
echo $timestamp "Running Rosetta"
run_rosetta
echo $timestamp "Running Glycosylation Distance Calculator"
run_glyc_dist
echo $timestamp "Running GetContacts: Intermolecular Interactions"run_get_contacts
run_get_contacts_clean
echo $timestamp "Cleaning up the tables for database entry"
clean_up_for_tables
fi