Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
53 commits
Select commit Hold shift + click to select a range
e376b7e
update options and saving for encoders
soulios Jan 1, 2024
6f7611e
only argparse
soulios Mar 6, 2024
16a24f4
flaked and fixed predictgnn arg
soulios Mar 6, 2024
774b0a1
add json
soulios Mar 6, 2024
11fb829
remove comma
soulios Mar 6, 2024
add3993
final fix
soulios Mar 6, 2024
fa33f2f
final fix
soulios Mar 6, 2024
1f59fe9
final fix
soulios Mar 6, 2024
00fa012
convert fix
soulios Mar 7, 2024
ace62d3
Update dfpl/options.py
soulios Mar 8, 2024
8a1b334
Apply suggestions from code review
soulios Mar 8, 2024
3c92b98
edited help in args
soulios Mar 8, 2024
40e6b0b
flaked and blacked
soulios Mar 8, 2024
c03a32e
removed metavar from args with choices
soulios Mar 11, 2024
ebaaaca
make literals optionals for None
soulios Mar 11, 2024
d6090a9
applied black
soulios Mar 11, 2024
cb3fa01
rename some variables
bernt-matthias Apr 9, 2024
460c482
Merge branch 'argparse' of https://github.com/soulios/deepFPlearn int…
bernt-matthias Apr 9, 2024
e87be1b
fixup
bernt-matthias Apr 9, 2024
8b0af64
removed paths from default args and fixed creating args from json and…
soulios Jul 11, 2024
31f48a4
Merge branch 'master' of https://github.com/yigbt/deepFPlearn
soulios Jul 11, 2024
efef88a
only argparse
soulios Mar 6, 2024
ac0db5d
rename some variables
bernt-matthias Apr 9, 2024
13a1626
flaked and fixed predictgnn arg
soulios Mar 6, 2024
c3a5da2
add json
soulios Mar 6, 2024
2577f10
remove comma
soulios Mar 6, 2024
96f59b4
final fix
soulios Mar 6, 2024
01942ba
final fix
soulios Mar 6, 2024
056110e
final fix
soulios Mar 6, 2024
630f6d1
convert fix
soulios Mar 7, 2024
4f418cc
Update dfpl/options.py
soulios Mar 8, 2024
dd34bca
Apply suggestions from code review
soulios Mar 8, 2024
83361ee
edited help in args
soulios Mar 8, 2024
d165644
removed metavar from args with choices
soulios Mar 11, 2024
3d5ae80
make literals optionals for None
soulios Mar 11, 2024
e2ceb28
applied black
soulios Mar 11, 2024
f748c2f
removed paths from default args and fixed creating args from json and…
soulios Jul 11, 2024
a03642a
Merge branch 'master' into argparse
soulios Jul 11, 2024
51453b5
rebased argparse
soulios Jul 11, 2024
4cdd5a8
merged argparse
soulios Jul 11, 2024
7348fd0
blacked and flaked
soulios Jul 11, 2024
40be7bb
trying fix for cmd and json args
soulios Jul 11, 2024
35a63ee
changed path for input file
soulios Jul 11, 2024
1ba017c
changed path for input file
soulios Jul 11, 2024
11e808c
changed path for input file
soulios Jul 11, 2024
4989421
changed path for input file
soulios Jul 11, 2024
8cc13b4
changed path for test file in pr.yml
soulios Jul 11, 2024
c666c67
Merge branch 'refs/heads/master' into soulious-argparse
tom-mohr Jul 24, 2024
8244c26
fix --compressFeatures arg for "dfpl train"
tom-mohr Aug 6, 2024
9de595a
- added parsing for --fnnType
tom-mohr Aug 6, 2024
af7f2cc
include missing json file path in error message
tom-mohr Aug 6, 2024
225907a
add "dfpl.egg-info" and "build" directory to .gitignore
tom-mohr Aug 6, 2024
46f7da3
correctly load defaults with argparse
tom-mohr Aug 6, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions .github/workflows/pr.yml
Original file line number Diff line number Diff line change
Expand Up @@ -95,9 +95,14 @@ jobs:
exit 1
fi
echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv)
if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then
echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2
if [ "$(cat preds_dmpnn/preds.csv | wc -l)" -lt "6" ]; then
echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/preds.csv | wc -l)" >&2
exit 1
fi

dfpl convert -f tests/data
if [ "$(find tests/data \( -name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then
echo "not all csv files are converted to pickle ones" >&2
exit 1
fi
echo "All tests passed!"
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -18,3 +18,5 @@ example/data/convert.log
results
release_asset
LICENSE.docx
dfpl.egg-info
build
328 changes: 26 additions & 302 deletions dfpl/__main__.py
Original file line number Diff line number Diff line change
@@ -1,315 +1,39 @@
import dataclasses
import logging
import os.path
import pathlib
import sys
from argparse import Namespace
from os import path

import chemprop as cp
import pandas as pd
from keras.models import load_model
from dfpl.parse import parse_dfpl
from dfpl.convert import convert
from dfpl.interpretgnn import interpretdmpnn
from dfpl.predictgnn import predictdmpnn
from dfpl.train import train
from dfpl.predict import predict
from dfpl.traingnn import traindmpnn

from dfpl import autoencoder as ac
from dfpl import feedforwardNN as fNN
from dfpl import fingerprint as fp
from dfpl import options, predictions
from dfpl import single_label_model as sl
from dfpl import vae as vae
from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute

project_directory = pathlib.Path(".").parent.parent.absolute()
test_train_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
type="smiles",
fpType="topological",
epochs=100,
batchSize=1024,
fpSize=2048,
encFPSize=256,
enableMultiLabel=False,
testSize=0.2,
kFolds=2,
verbose=2,
trainAC=False,
trainFNN=True,
compressFeatures=True,
activationFunction="selu",
lossFunction="bce",
optimizer="Adam",
fnnType="FNN",
)
def run_dfpl(args: Namespace):
subprogram_name = args.method

test_pred_opts = options.Options(
inputFile=f"{project_directory}/input_datasets/S_dataset.pkl",
outputDir=f"{project_directory}/output_data/console_test",
outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv",
ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model",
fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model",
type="smiles",
fpType="topological",
)
# The ".method" attribute is added by the parser,
# specifying which subprogram was chosen by the user.
# However, the subprograms don't expect the ".method" attribute,
# so we remove it here before calling the subprogram.
del args.method


def traindmpnn(opts: options.GnnOptions):
    """
    Train a D-MPNN model with chemprop using the given options.

    Only the GPU id and the config-file path are taken from ``opts``;
    the actual chemprop training arguments are loaded from the JSON file.

    :param opts: options.GnnOptions instance describing the training run
    :return: None
    """
    # Pin the run to the GPU requested on the command line.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"
    # Build a chemprop-style argument list from the JSON config,
    # dropping the jsonpickle bookkeeping key.
    json_arguments = createArgsFromJson(
        opts.configFile, ["py/object"], return_json_object=False
    )
    train_args = cp.args.TrainArgs().parse_args(json_arguments)
    logging.info("Training DMPNN...")
    # Cross-validate and report mean/std of the AUC score over the folds.
    auc_mean, auc_std = cp.train.cross_validate(
        args=train_args, train_func=cp.train.run_training
    )
    logging.info(f"Results: {auc_mean:.5f} +/- {auc_std:.5f}")


def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None:
    """
    Predict values using a trained D-MPNN model with the given options.

    :param opts: options.GnnOptions instance; replaced below by the chemprop
        PredictArgs parsed from the JSON file
    :param json_arg_path: path to a JSON file containing additional arguments
        for prediction (including "save_dir" and "saving_name")
    :return: None
    """
    # Keys that must not be forwarded to chemprop's argument parser.
    ignore_elements = [
        "py/object",
        "checkpoint_paths",
        "save_dir",
        "saving_name",
    ]
    # Load the chemprop argument list plus the raw JSON object (the latter is
    # needed for the output location, which chemprop does not parse itself).
    arguments, data = createArgsFromJson(
        json_arg_path, ignore_elements, return_json_object=True
    )
    # Placeholder so PredictArgs accepts the list; the real path is set below.
    arguments.append("--preds_path")
    arguments.append("")
    # Replace relevant attributes in `opts` with loaded options
    opts = cp.args.PredictArgs().parse_args(arguments)
    # os.path.join instead of string concatenation: handles a trailing
    # separator in save_dir correctly and is portable.
    opts.preds_path = os.path.join(data.get("save_dir"), data.get("saving_name"))
    df = pd.read_csv(opts.test_path)
    # chemprop expects one list of SMILES per datapoint; build it in one pass
    # instead of the O(n)-per-row iterrows() loop.
    smiles = [[s] for s in df["smiles"]]
    # Make predictions and return the result
    cp.train.make_predictions(args=opts, smiles=smiles)


def train(opts: options.Options) -> None:
    """
    Run the main training procedure

    Imports the input data, optionally trains an autoencoder and/or compresses
    the fingerprints with it, then trains the single- or multi-label FNN models.

    :param opts: Options defining the details of the training
    """

    # Pin the run to the GPU requested on the command line.
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}"

    # import data from file and create DataFrame
    # File-type detection is by substring, not extension — "tsv" anywhere in
    # the path selects the TSV importer.
    if "tsv" in opts.inputFile:
        df = fp.importDataFile(
            opts.inputFile, import_function=fp.importDstoxTSV, fp_size=opts.fpSize
        )
    else:
        df = fp.importDataFile(
            opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
        )
    # initialize encoders to None
    encoder = None
    autoencoder = None
    if opts.trainAC:
        # Train a fresh autoencoder of the requested type on the input data.
        if opts.aeType == "deterministic":
            encoder, train_indices, test_indices = ac.train_full_ac(df, opts)
        elif opts.aeType == "variational":
            encoder, train_indices, test_indices = vae.train_full_vae(df, opts)
        else:
            raise ValueError(f"Unknown autoencoder type: {opts.aeType}")

    # if feature compression is enabled
    if opts.compressFeatures:
        if not opts.trainAC:
            # NOTE(review): the model is defined with default Options(), not
            # with `opts` — presumably intentional so the architecture matches
            # the saved weights; confirm.
            if opts.aeType == "deterministic":
                (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
            elif opts.aeType == "variational":
                (autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
            # NOTE(review): because these are `elif`s chained to the aeType
            # checks above, the weight/model loading below runs only when
            # aeType is neither "deterministic" nor "variational" — so a
            # freshly defined model's weights are never loaded. Looks like the
            # load branches were meant to be a separate `if`; confirm intent.
            elif opts.ecWeightsFile == "":
                encoder = load_model(opts.ecModelDir)
            else:
                # NOTE(review): `autoencoder` is None unless a model was
                # defined above, so this branch would raise; also `predict`
                # calls encoder.load_weights here — confirm which is intended.
                autoencoder.load_weights(
                    os.path.join(opts.ecModelDir, opts.ecWeightsFile)
                )
        # compress the fingerprints using the autoencoder
        df = ac.compress_fingerprints(df, encoder)
        # ac.visualize_fingerprints(
        #     df,
        #     before_col="fp",
        #     after_col="fpcompressed",
        #     train_indices=train_indices,
        #     test_indices=test_indices,
        #     save_as=f"UMAP_{opts.aeSplitType}.png",
        # )
    # train single label models if requested
    if opts.trainFNN and not opts.enableMultiLabel:
        sl.train_single_label_models(df=df, opts=opts)

    # train multi-label models if requested
    if opts.trainFNN and opts.enableMultiLabel:
        fNN.train_nn_models_multi(df=df, opts=opts)


def predict(opts: options.Options) -> None:
    """
    Run prediction given specific options

    Imports the input data, optionally compresses the fingerprints with a
    pre-trained autoencoder, runs the FNN predictions and writes them to CSV.

    :param opts: Options defining the details of the prediction
    """
    # import data from file and create DataFrame
    # File-type detection is by substring, not extension (same as train()).
    if "tsv" in opts.inputFile:
        df = fp.importDataFile(
            opts.inputFile, import_function=fp.importDstoxTSV, fp_size=opts.fpSize
        )
    else:
        df = fp.importDataFile(
            opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize
        )

    if opts.compressFeatures:
        # load trained model for autoencoder
        # NOTE(review): the model is defined with default Options(), not with
        # `opts` — presumably so the architecture matches the saved weights;
        # confirm. Unlike train(), these are two independent `if`s, so the
        # load step below always runs.
        if opts.aeType == "deterministic":
            (autoencoder, encoder) = ac.define_ac_model(opts=options.Options())
        if opts.aeType == "variational":
            (autoencoder, encoder) = vae.define_vae_model(opts=options.Options())
        # Load trained model for autoencoder
        if opts.ecWeightsFile == "":
            # Empty weights file means: load the whole saved encoder model.
            encoder = load_model(opts.ecModelDir)
        else:
            # NOTE(review): train() calls autoencoder.load_weights here while
            # this loads into `encoder` — confirm which is intended.
            encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile))
        df = ac.compress_fingerprints(df, encoder)

    # Run predictions on the compressed fingerprints and store the results in a dataframe
    df2 = predictions.predict_values(df=df, opts=opts)

    # Extract the column names from the dataframe, excluding the 'fp' and 'fpcompressed' columns
    names_columns = [c for c in df2.columns if c not in ["fp", "fpcompressed"]]

    # Save the predicted values to a CSV file in the output directory
    df2[names_columns].to_csv(path_or_buf=path.join(opts.outputDir, opts.outputFile))

    # Log successful completion of prediction and the file path where the results were saved
    logging.info(
        f"Prediction successful. Results written to '{path.join(opts.outputDir, opts.outputFile)}'"
    )


def createLogger(filename: str) -> None:
"""
Set up a logger for the main function that also saves to a log file
"""
# get root logger and set its level
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# create file handler which logs info messages
fh = logging.FileHandler(filename, mode="w")
fh.setLevel(logging.INFO)
# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.INFO)
# create formatter and add it to the handlers
formatterFile = logging.Formatter(
"{asctime} - {name} - {levelname} - {message}", style="{"
)
formatterConsole = logging.Formatter("{levelname} {message}", style="{")
fh.setFormatter(formatterFile)
ch.setFormatter(formatterConsole)
# add the handlers to the logger
logger.addHandler(fh)
logger.addHandler(ch)
{
"traingnn": traindmpnn,
"predictgnn": predictdmpnn,
"interpretgnn": interpretdmpnn,
"train": train,
"predict": predict,
"convert": convert
}[subprogram_name](args)


def main():
"""
Main function that runs training/prediction defined by command line arguments
"""

parser = options.createCommandlineParser()
prog_args: Namespace = parser.parse_args()
try:
if prog_args.method == "convert":
directory = makePathAbsolute(prog_args.f)
if path.isdir(directory):
createLogger(path.join(directory, "convert.log"))
logging.info(f"Convert all data files in {directory}")
fp.convert_all(directory)
else:
raise ValueError("Input directory is not a directory")
elif prog_args.method == "traingnn":
traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args)

traindmpnn(traingnn_opts)

elif prog_args.method == "predictgnn":
predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predictgnn_opts,
test_path=makePathAbsolute(predictgnn_opts.test_path),
preds_path=makePathAbsolute(predictgnn_opts.preds_path),
)

logging.info(
f"The following arguments are received or filled with default values:\n{prog_args}"
)

predictdmpnn(fixed_opts, prog_args.configFile)
args = parse_dfpl(*sys.argv[1:])

elif prog_args.method == "train":
train_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
train_opts,
inputFile=makePathAbsolute(train_opts.inputFile),
outputDir=makePathAbsolute(train_opts.outputDir),
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "train.log"))
logging.info(
f"The following arguments are received or filled with default values:\n{fixed_opts}"
)
train(fixed_opts)
elif prog_args.method == "predict":
predict_opts = options.Options.fromCmdArgs(prog_args)
fixed_opts = dataclasses.replace(
predict_opts,
inputFile=makePathAbsolute(predict_opts.inputFile),
outputDir=makePathAbsolute(predict_opts.outputDir),
outputFile=makePathAbsolute(
path.join(predict_opts.outputDir, predict_opts.outputFile)
),
ecModelDir=makePathAbsolute(predict_opts.ecModelDir),
fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir),
trainAC=False,
trainFNN=False,
)
createDirectory(fixed_opts.outputDir)
createLogger(path.join(fixed_opts.outputDir, "predict.log"))
logging.info(
f"The following arguments are received or filled with default values:\n{prog_args}"
)
predict(fixed_opts)
except AttributeError as e:
print(e)
parser.print_usage()
# dynamic import after parsing was successful (to allow for faster CLI feedback)
run_dfpl(args)


if __name__ == "__main__":
Expand Down
9 changes: 5 additions & 4 deletions dfpl/autoencoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,12 @@

from dfpl import callbacks
from dfpl import history as ht
from dfpl import options, settings
from dfpl import settings
from dfpl.train import TrainOptions
from dfpl.utils import ae_scaffold_split, weight_split


def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]:
def define_ac_model(opts: TrainOptions, output_bias=None) -> Tuple[Model, Model]:
"""
This function provides an autoencoder model to reduce a certain input to a compressed version.

Expand Down Expand Up @@ -131,12 +132,12 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod
return autoencoder, encoder


def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model:
def train_full_ac(df: pd.DataFrame, opts: TrainOptions) -> Model:
"""
Trains an autoencoder on the given feature matrix X. The response matrix is only used to
split the data into meaningful test and train sets.

:param opts: Command line arguments as defined in options.py
:param opts: Command line arguments
:param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder
:return: The encoder model of the trained autoencoder
"""
Expand Down
Loading