From e376b7eac6912de3773cf4589e5c7c1fa45c42d5 Mon Sep 17 00:00:00 2001 From: soulios Date: Mon, 1 Jan 2024 20:51:55 +0100 Subject: [PATCH 01/48] update options and saving for encoders --- dfpl/__main__.py | 115 ++-------- dfpl/autoencoder.py | 63 ++---- dfpl/callbacks.py | 33 ++- dfpl/feedforwardNN.py | 16 +- dfpl/options.py | 442 +++++++++++++++++-------------------- dfpl/single_label_model.py | 17 +- dfpl/utils.py | 73 +++++- dfpl/vae.py | 244 ++++++++------------ example/predict.json | 10 +- example/predictgnn.json | 5 +- example/train.json | 18 +- example/traingnn.json | 6 +- 12 files changed, 463 insertions(+), 579 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..aada91a3 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -17,43 +17,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - 
fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. Args: @@ -61,54 +26,29 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) + arguments = createArgsFromJson(jsonFile = opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation mean_score, std_score = cp.train.cross_validate( args=opts, train_func=cp.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. 
Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options + arguments = createArgsFromJson(jsonFile = opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + + cp.train.make_predictions(args=opts) def train(opts: options.Options): @@ -116,9 +56,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +65,7 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,11 +79,12 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, 
encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": + else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile == "": encoder = load_model(opts.ecModelDir) else: autoencoder.load_weights( @@ -154,14 +92,15 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent: + ac.visualize_fingerprints( + df, + before_col="fp", + after_col="fpcompressed", + train_indices=train_indices, + test_indices=test_indices, + save_as=f"UMAP_{opts.aeSplitType}.png", + ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -257,7 +196,7 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": @@ -267,12 +206,8 @@ def main(): test_path=makePathAbsolute(predictgnn_opts.test_path), preds_path=makePathAbsolute(predictgnn_opts.preds_path), ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + createLogger("predictgnn.log") + predictdmpnn(fixed_opts) elif prog_args.method == "train": train_opts = options.Options.fromCmdArgs(prog_args) @@ -298,8 +233,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git 
a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..6909b156 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -8,12 +8,12 @@ import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model, load_model from dfpl import callbacks from dfpl import history as ht @@ -32,9 +32,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -104,7 +108,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer - # to either 0 or 1 and hence we use sigmoid activation function. 
decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -145,37 +148,8 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -286,30 +260,35 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) 
(autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) + # Re-define autoencoder and encoder using your function + callback_autoencoder = load_model(filepath=save_path) + _, callback_encoder = define_ac_model(opts) + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = os.path.join(save_path, "encoder_model") + callback_encoder.save(filepath=encoder_save_path) else: encoder.save(filepath=save_path) # Return the encoder model of the trained autoencoder diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 6eae7965..fc1f817c 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -22,28 +22,37 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - checkpoint = ModelCheckpoint( + if opts.aeType == "deterministic": + checkpoint = ModelCheckpoint( checkpoint_path, monitor=target, mode="min", verbose=1, - 
period=settings.ac_train_check_period, + save_freq="epoch", save_best_only=True, - save_weights_only=True, - ) + ) + else: + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + save_weights_only=True + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore early_stop = EarlyStopping( - monitor=target, - mode="min", - patience=settings.ac_train_patience, - min_delta=settings.ac_train_min_delta, - verbose=1, - restore_best_weights=True, + monitor=target, + mode="min", + patience=settings.ac_train_patience, + min_delta=settings.ac_train_min_delta, + verbose=1, + restore_best_weights=True, ) callbacks.append(early_stop) - if opts.aeWabTracking and not opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -65,7 +74,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - period=settings.nn_train_check_period, + save_freq="epoch", save_best_only=True, monitor="val_loss", mode="min", diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your 
selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..d1d657aa 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,12 @@ import argparse from dataclasses import dataclass from pathlib import Path - +from typing import Optional import jsonpickle import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import makePathAbsolute, parseCmdArgs @dataclass @@ -17,51 +17,51 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." 
- outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + configFile: str = None + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" + ecWeightsFile: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! + verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False #only if autoencoder is trained or loaded + gpu: int = 
None + aeWabTracking: bool = False # Wand & Biases autoencoder tracking + wabTracking: bool = False # Wand & Biases FNN tracking + wabTarget: str = "AR" # Wand & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +72,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -134,37 +100,19 @@ class GnnOptions(TrainArgs): save_preds: bool = True @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. 
+ def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) - @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -225,7 +173,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +182,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv" ) general_args.add_argument( "-o", @@ -243,8 +191,10 @@ def parseInputTrain(parser: 
argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/" ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -252,7 +202,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp" ) general_args.add_argument( "-thr", @@ -260,47 +210,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5 ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological" ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048 ) general_args.add_argument( - "-c", "--compressFeatures", - metavar="BOOL", - type=bool, - help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - default=argparse.SUPPRESS, + action="store_true", + help="Should the fingerprints be compressed or not. 
Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False ) general_args.add_argument( - "-m", "--enableMultiLabel", - metavar="BOOL", - type=bool, + action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, + default=False ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -309,14 +253,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="" ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, + default="example/results_train/AE_encoder/" ) autoencoder_args.add_argument( "--aeType", @@ -324,21 +268,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic" ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100 ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512 ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -346,21 +290,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, + default="relu" ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001 ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, 
help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96 ) autoencoder_args.add_argument( "--aeSplitType", @@ -368,7 +312,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, + default="random" ) autoencoder_args.add_argument( "-d", @@ -376,7 +320,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256 + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False ) # Training Configuration training_args.add_argument( @@ -385,15 +335,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, + default="random" ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=argparse.SUPPRESS, + default=0.2 ) training_args.add_argument( "-K", @@ -401,7 +350,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, + default=1 ) training_args.add_argument( "-v", @@ -411,21 +360,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. 
O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=True, ) training_args.add_argument( "--sampleFractionOnes", @@ -433,14 +380,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." "only works if --sampleDown is enabled", - default=argparse.SUPPRESS, + default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "-e", @@ -448,52 +395,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, + default=100, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, + default="bce", ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, + default="Adam", ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -501,7 +456,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, + default="relu", ) # Tracking Configuration tracking_args.add_argument( @@ -509,14 +464,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTarget", @@ -524,8 +479,108 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + 
default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting." + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. 
Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + (general_args.add_argument( + "--aeType", metavar="STRING", type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic")) def parseTrainGnn(parser: argparse.ArgumentParser) -> None: @@ -575,9 +630,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -1034,91 +1086,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. 
- - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--outputFile", - metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. 
Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", - type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, - ) - - def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") data_args = parser.add_argument_group("Data Configuration") @@ -1139,9 +1106,6 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: choices=list(range(torch.cuda.device_count())), help="Which GPU to use", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off cuda" - ) general_args.add_argument( "--num_workers", type=int, diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..191690ba 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -333,12 +333,17 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = 
optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -596,11 +601,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..15fd018b 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -5,8 +5,14 @@ import warnings from collections import defaultdict from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Union, Type, TypeVar, Any +# Define a type variable + +from pathlib import Path +import argparse +import jsonpickle +import sys import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -15,7 +21,44 @@ from tqdm import tqdm RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. 
+ """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip('-') for arg in sys.argv if arg.startswith('-')} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + return result def makePathAbsolute(p: str) -> str: path = pathlib.Path(p) @@ -31,20 +74,34 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + + with open(jsonFile, "r") as f: data = json.load(f) + + # Check each key in the JSON file against command-line arguments for key, value in data.items(): if key not in ignore_elements: + # Prepare the command-line argument format + cli_arg_key = f"--{key}" + + # Check if this argument is provided in the command line + if cli_arg_key in sys.argv: + # Find the index of the argument in sys.argv and get its value + arg_index = sys.argv.index(cli_arg_key) + 1 + if arg_index < len(sys.argv): + cli_value = sys.argv[arg_index] + value = cli_value # Override JSON value with command-line value + + # Append the argument and its value to the list if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") + arguments.append(cli_arg_key) arguments.extend(value) else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if 
return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + return arguments diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..cc61b17d 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model,load_model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks @@ -26,114 +26,71 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + encoding_dim = opts.encFPSize # This should be the intended size of your latent space, e.g., 256 + + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) - # get the number of meaningful hidden layers (latent space included) hidden_layer_count = round(math.log2(input_size / encoding_dim)) - # the input placeholder input_vec = Input(shape=(input_size,)) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! 
+ # 1st hidden layer if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / 2), activation=opts.aeActivationFunction - )(input_vec) + encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction)(input_vec) else: - encoded = Dense( - units=int(input_size / 2), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(input_vec) - - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - # print(f'{factor_units}: {int(input_size / factor_units)}') - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # latent space layers - factor_units = 2 ** (hidden_layer_count - 1) + encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(input_vec) + + # encoding layers + for i in range(1, hidden_layer_count - 1): # Adjust the range to stop before the latent space layers + factor_units = 2 ** (i + 1) if opts.aeActivationFunction != "selu": - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) + encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(encoded) else: - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # sampling layer - def sampling(args): - z_mean, 
z_log_var = args - batch = K.shape(z_mean)[0] - dim = K.int_shape(z_mean)[1] - epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon - - # sample from latent space - z = Lambda(sampling, output_shape=(int(input_size / factor_units),))( - [z_mean, z_log_var] - ) - decoded = z - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - # print(f'{factor_units}: {int(input_size/factor_units)}') - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) + encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) + # latent space layers + if opts.aeActivationFunction != "selu": + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + + # sampling layer + def sampling(args): + z_mean, z_log_var = args + batch = K.shape(z_mean)[0] + dim = K.int_shape(z_mean)[1] + epsilon = K.random_normal(shape=(batch, dim)) + return z_mean + K.exp(0.5 * z_log_var) * 
epsilon + + z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) + decoded = z + + # decoding layers + for i in range(hidden_layer_count - 2, 0, -1): + factor_units = 2 ** i + if opts.aeActivationFunction != "selu": + decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(decoded) + else: + decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(decoded) + + # output layer + decoded = Dense(units=input_size, activation="sigmoid", bias_initializer=output_bias)(decoded) autoencoder = Model(input_vec, decoded) + encoder = Model(input_vec, z) + autoencoder.summary(print_fn=logging.info) + # KL divergence loss def kl_loss(z_mean, z_log_var): @@ -155,9 +112,6 @@ def vae_loss(y_true, y_pred): optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] ) - # build encoder model - encoder = Model(input_vec, z_mean) - return autoencoder, encoder @@ -175,39 +129,9 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - # Define output files for VAE and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No VAE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] - + opts.aeType - + opts.aeSplitType - ) - logging.info( - f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}") - base_file_name 
= ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=vae_weights_file, opts=opts - ) + # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), @@ -219,17 +143,17 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: ) assert 0.0 <= opts.testSize <= 0.5 if opts.aeSplitType == "random": - logging.info("Training VAE using random split") - train_indices = np.arange(fp_matrix.shape[0]) + logging.info("Training autoencoder using random split") + initial_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: # Split data into test and training data if opts.aeWabTracking: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: x_train = fp_matrix @@ -255,6 +179,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() else: x_train = fp_matrix x_test 
= None @@ -262,7 +192,6 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: logging.info("Training autoencoder using molecular weight split") train_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: - # if opts.aeWabTracking: train_data, val_data, test_data = weight_split( df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" ) @@ -276,16 +205,21 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + # Get the sorted indices from the sorted DataFrame + sorted_indices = df_sorted.index.to_numpy() + + # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame + train_indices = sorted_indices[df.index.isin(train_data.index)] + # val_indices = sorted_indices[df.index.isin(val_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] else: x_train = fp_matrix x_test = None else: raise ValueError(f"Invalid split type: {opts.split_type}") - if opts.testSize > 0.0: - train_indices = train_indices[train_indices < x_train.shape[0]] - test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0]) - else: - test_indices = None + + # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints ids, counts = np.unique(x_train.flatten(), return_counts=True) count_dict = dict(zip(ids, counts)) if count_dict[0] == 0: @@ -304,34 +238,32 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) # Train the VAE on the training data + callback_list = callbacks.autoencoder_callback(checkpoint_path=f"{save_path}.h5", opts=opts) + vae_hist = vae.fit( x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, - callbacks=callback_list, + callbacks=[callback_list], validation_data=(x_test, 
x_test) if opts.testSize > 0.0 else None, ) # Save the VAE weights - logging.info(f"VAE weights stored in file: {vae_weights_file}") ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"), + base_file_name=save_path, hist=vae_hist, ) - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5") - if opts.testSize > 0.0: - (callback_vae, callback_encoder) = define_vae_model(opts) - callback_vae.load_weights(filepath=vae_weights_file) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - latent_space = encoder.predict(fp_matrix) - latent_space_file = os.path.join( - opts.outputDir, base_file_name + ".latent_space.csv" - ) - with open(latent_space_file, "w", newline="") as file: - writer = csv.writer(file) - writer.writerows(latent_space) + # Re-define autoencoder and encoder using your function + callback_autoencoder, callback_encoder = define_vae_model(opts) + callback_autoencoder.load_weights(filepath=f"{save_path}.h5") + + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = f"{save_path}_encoder.h5" + callback_encoder.save_weights(filepath=encoder_save_path) + return encoder, train_indices, test_indices diff --git a/example/predict.json b/example/predict.json index 252965e3..e3305c7c 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,12 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_autoencoder/", - "ecWeightsFile": "", - "fnnModelDir": "example/results_train/AR_saved_model", + "ecModelDir": "example/results_train/random_split_autoencoder", + "ecWeightsFile": "random_split_autoencoder_encoder.h5", + "fnnModelDir": 
"example/results_train/NR-AR-1_best_saved_model", + "aeType": "variational", "compressFeatures": true, - "trainAC": false, "trainFNN": false } diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..813cf0c5 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" + "preds_path": "example/results_gnn.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/train.json b/example/train.json index 62f2abb4..53575adc 100755 --- a/example/train.json +++ b/example/train.json @@ -1,22 +1,23 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/S_dataset.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_train/", - "ecModelDir": "example/results_train/", - "ecWeightsFile": "random_autoencoder.hdf5", + "ecModelDir": "example/results_train/random_split_autoencoder/", + "ecWeightsFile": "", "verbose": 2, - "trainAC": true, - "compressFeatures": true, + "trainAC": false, + "compressFeatures": false, + "visualizeLatent": false, "encFPSize": 256, "aeSplitType": "random", - "aeEpochs": 2, + "aeEpochs": 4, "aeBatchSize": 351, "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, - "aeLearningRateDecay": 0.0001, + "aeLearningRateDecay": 0.96, "aeType": "deterministic", "type": "smiles", @@ -29,7 +30,7 @@ "gpu": "", "trainFNN": true, - "kFolds": 1, + "kFolds": 2, "threshold": 0.5, "testSize": 0.2, "fnnType": "FNN", @@ -40,6 +41,7 @@ "activationFunction": "selu", "dropout": 0.0107, "learningRate": 0.0000022, + "learningRateDecay": 0.96, "l2reg": 0.001, "aeWabTracking": false, diff --git a/example/traingnn.json b/example/traingnn.json index 7a5a0712..1ca58a30 100644 --- a/example/traingnn.json +++ 
b/example/traingnn.json @@ -2,13 +2,13 @@ "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", - "epochs": 2, - "num_folds": 2, + "epochs": 4, + "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", "split_type": "random", "dataset_type": "classification", "smiles_columns": "smiles", - "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","specificity","precision"], + "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","precision"], "hidden_size": 256 } \ No newline at end of file From 6f7611ee3ed1aa20db9d42a43db6bf7503158605 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 13:59:16 +0100 Subject: [PATCH 02/48] only argparse --- dfpl/__main__.py | 165 ++++------ dfpl/options.py | 814 ++++++++++++++++++++++++++++------------------- dfpl/utils.py | 180 +++++++---- 3 files changed, 669 insertions(+), 490 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..8d035579 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,12 +1,10 @@ import dataclasses import logging -import os.path -import pathlib +import os from argparse import Namespace from os import path -import chemprop as cp -import pandas as pd +import chemprop from keras.models import load_model from dfpl import autoencoder as ac @@ -17,43 +15,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - 
testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) - -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): + +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. Args: @@ -61,54 +24,46 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) - opts = cp.args.TrainArgs().parse_args(arguments) + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation - mean_score, std_score = cp.train.cross_validate( - args=opts, train_func=cp.train.run_training + mean_score, std_score = chemprop.train.cross_validate( + args=opts, train_func=chemprop.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. 
Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.PredictArgs().parse_args(arguments) + + chemprop.train.make_predictions(args=opts) + + +def interpretdmpnn(opts: options.GnnOptions) -> None: + """ + Interpret the predictions of a trained D-MPNN model with the given options. + Args: + - opts: options.GnnOptions instance containing the details of the prediction + Returns: + - None + """ + # Load options and additional arguments from a JSON file + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.InterpretArgs().parse_args(arguments) + + chemprop.interpret.interpret( + args=opts, save_to_csv=True ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options - opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) def train(opts: options.Options): @@ -116,9 +71,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +80,7 @@ def train(opts: 
options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,11 +94,12 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": + else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile == "": encoder = load_model(opts.ecModelDir) else: autoencoder.load_weights( @@ -154,14 +107,18 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent and opts.trainAC: + ac.visualize_fingerprints( + df, + train_indices=train_indices, + test_indices=test_indices, + save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png", + ) + elif opts.visualizeLatent: + logging.info( + "Visualizing latent space is only available if you train the autoencoder. Skipping visualization." 
+ ) + # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -257,24 +214,22 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": - predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - predictgnn_opts, - test_path=makePathAbsolute(predictgnn_opts.test_path), - preds_path=makePathAbsolute(predictgnn_opts.preds_path), - ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args) + createLogger("predictgnn.log") + predictdmpnn(predictgnn_opts) + elif prog_args.method == "interpretgnn": + interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args) + createLogger("interpretgnn.log") + interpretdmpnn(interpretgnn_opts) elif prog_args.method == "train": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") + train_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( train_opts, @@ -288,6 +243,8 @@ def main(): ) train(fixed_opts) elif prog_args.method == "predict": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") predict_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( predict_opts, @@ -298,8 +255,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) 
createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..85e245bc 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,13 @@ import argparse from dataclasses import dataclass from pathlib import Path +from typing import Optional, Literal, List import jsonpickle import torch -from chemprop.args import TrainArgs +from chemprop.args import TrainArgs, PredictArgs, InterpretArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import parseCmdArgs @dataclass @@ -17,51 +18,51 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." - outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + configFile: str = None + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" + ecWeightsFile: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! 
+ verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False # only if autoencoder is trained or loaded + gpu: int = None + aeWabTracking: bool = False # Weights & Biases autoencoder tracking + wabTracking: bool = False # Weights & Biases FNN tracking + wabTarget: str = "AR" # Weights & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +73,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline
arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -132,39 +99,114 @@ class GnnOptions(TrainArgs): preds_path: str = "./tox21dmpnn.csv" test_path: str = "" save_preds: bool = True + calibration_method: str = "none" + uncertainty_method: str = "none" + calibration_path: str = "" + evaluation_methods: str = "none" + evaluation_scores_path: str = "" + wabTracking: bool = False + split_sizes: List[float] = None + # save_smiles_splits: bool = False + @classmethod + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class PredictGnnOptions(PredictArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "./example/predictgnn.json" + calibration_atom_descriptors_path: str = None + calibration_features_path: str = None + calibration_interval_percentile: float = 95 + 
calibration_method: Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] = None + calibration_path: str = None + calibration_phase_features_path: str = None + drop_extra_columns: bool = False + dropout_sampling_size: int = 10 + evaluation_methods: List[str] = None + evaluation_scores_path: str = None + # no_features_scaling: bool = True + individual_ensemble_predictions: bool = False + preds_path: str = None + regression_calibrator_metric: Literal["stdev", "interval"] = None + test_path: str = None + uncertainty_dropout_p: float = 0.1 + uncertainty_method: Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + ] = None @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. 
- """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class InterpretGNNoptions(InterpretArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "./example/interpret.json" + data_path: str = "./example/data/smiles.csv" + batch_size: int = 500 + c_puct: float = 10.0 + max_atoms: int = 20 + min_atoms: int = 8 + prop_delta: float = 0.5 + property_id: List[int] = None + rollout: int = 20 @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -186,6 +228,12 @@ def createCommandlineParser() -> argparse.ArgumentParser: parser_predict_gnn.set_defaults(method="predictgnn") parsePredictGnn(parser_predict_gnn) + parser_interpret_gnn = subparsers.add_parser( + "interpretgnn", help="Interpret your GNN models" + ) + parser_interpret_gnn.set_defaults(method="interpretgnn") + parseInterpretGnn(parser_interpret_gnn) + parser_train 
= subparsers.add_parser( "train", help="Train new models with your data" ) @@ -225,7 +273,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +282,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -243,8 +291,10 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/", ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -252,7 +302,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp", ) general_args.add_argument( "-thr", @@ -260,47 +310,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. 
If not available, leave empty", + default=None, ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological", ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048, ) general_args.add_argument( - "-c", "--compressFeatures", - metavar="BOOL", - type=bool, - help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - default=argparse.SUPPRESS, + action="store_true", + help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False, ) general_args.add_argument( - "-m", "--enableMultiLabel", - metavar="BOOL", - type=bool, + action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, + default=False, ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -309,14 +353,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="", ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, + default="example/results_train/AE_encoder/", ) autoencoder_args.add_argument( "--aeType", @@ -324,21 +368,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or 
deterministic.", - default=argparse.SUPPRESS, + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -346,21 +390,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, + default="relu", ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", @@ -368,7 +412,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, + default="random", ) autoencoder_args.add_argument( "-d", @@ -376,7 +420,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256, + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, ) # Training Configuration training_args.add_argument( @@ -385,15 +435,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], 
help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, + default="random", ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "-K", @@ -401,7 +450,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, + default=1, ) training_args.add_argument( "-v", @@ -411,21 +460,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=True, ) training_args.add_argument( "--sampleFractionOnes", @@ -433,14 +480,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." 
"only works if --sampleDown is enabled", - default=argparse.SUPPRESS, + default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "-e", @@ -448,52 +495,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, + default=100, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, + default="bce", ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, + default="Adam", ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate decay factor in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -501,7 +556,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, + default="relu", ) # Tracking Configuration tracking_args.add_argument( "--aeWabTracking", metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTarget", metavar="STRING", type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS,
default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting.", + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. 
Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", + ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + ( + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) ) @@ -534,21 +694,62 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: files_args = parser.add_argument_group("Files") model_args = parser.add_argument_group("Model arguments") training_args = parser.add_argument_group("Training Configuration") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + uncertainty_args.add_argument( + "--uncertainty_method", + type=str, + metavar="STRING", + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "dirichlet", + ], + help="Method to use for uncertainty estimation", + 
default="none", + ) + # Uncertainty arguments + uncertainty_args.add_argument( + "--calibration_method", + type=str, + metavar="STRING", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Method to use for calibration", + default="none", + ) + uncertainty_args.add_argument( + "--calibration_path", + type=str, + metavar="FILE", + help="Path to file with calibration data", + ) # General arguments general_args.add_argument("--split_key_molecule", type=int) general_args.add_argument("--pytorch_seed", type=int) general_args.add_argument("--cache_cutoff", type=float) general_args.add_argument("--save_preds", type=bool) + general_args.add_argument("--wabTracking", action="store_true", default=False) general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" ) - general_args.add_argument( - "--save_smiles_splits", - action="store_true", - default=False, - help="Save smiles for each train/val/test splits for prediction convenience later", - ) + # general_args.add_argument( + # "--save_smiles_splits", + # action="store_true", + # default=False, + # help="Save smiles for each train/val/test splits for prediction convenience later", + # ) general_args.add_argument( "--test", action="store_true", @@ -575,9 +776,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -593,13 +791,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: type=str, help="Input JSON file that contains all information for training/predicting.", ) - files_args.add_argument( - "--config_path", - type=str, - metavar="FILE", - help="Path to a .json file containing arguments. 
Any arguments present in the config" - "file will override arguments specified via the command line or by the defaults.", - ) files_args.add_argument( "--save_dir", type=str, @@ -1034,141 +1225,149 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - +def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") files_args = parser.add_argument_group("Files") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + + general_args.add_argument( + "--checkpoint_path", + type=str, + metavar="FILE", + help="Path to model checkpoint (.pt file)" + ) + # general_args.add_argument( + # "--no_features_scaling", + # action="store_true", + # help="Turn on scaling of features", + # ) files_args.add_argument( "-f", "--configFile", - metavar="FILE", type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, + help="Path to a .json file containing arguments. 
Any arguments present in the config" + "file will override arguments specified via the command line or by the defaults.", ) files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", + "--test_path", type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, + help="Path to CSV file containing testing data for which predictions will be made.", ) files_args.add_argument( - "--outputFile", - metavar="FILE", + "--preds_path", type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, + help="Path to CSV or PICKLE file where predictions will be saved.", ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", + files_args.add_argument( + "--calibration_path", type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + help="Path to data file to be used for uncertainty calibration.", ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", + files_args.add_argument( + "--calibration_features_path", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + nargs="+", + help="Path to features data to be used with the uncertainty calibration dataset.", ) + files_args.add_argument("--calibration_phase_features_path", type=str, help="") files_args.add_argument( - "--ecModelDir", + "--calibration_atom_descriptors_path", type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). 
Provide a full path here.", - default=argparse.SUPPRESS, + help="Path to the extra atom descriptors.", ) files_args.add_argument( - "--fnnModelDir", + "--calibration_bond_descriptors_path", type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, + help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", ) + general_args.add_argument( + "--drop_extra_columns", + action="store_true", + help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.", + ) -def parsePredictGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") - files_args = parser.add_argument_group("Files") - training_args = parser.add_argument_group("Training Configuration") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", + uncertainty_args.add_argument( + "--uncertainty_method", type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "spectra_roundrobin", + "dirichlet", + ], + help="The method of calculating uncertainty.", ) - general_args.add_argument( - "--gpu", - type=int, - metavar="INT", - choices=list(range(torch.cuda.device_count())), - help="Which GPU to use", + uncertainty_args.add_argument( + "--calibration_method", + type=str, + nargs="+", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Methods used for calibrating the uncertainty calculated with uncertainty method.", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off 
cuda" + uncertainty_args.add_argument("--individual_ensemble_predictions", + action="store_true", + default=False, + help="Whether to save individual ensemble predictions.") + uncertainty_args.add_argument( + "--evaluation_methods", + type=str, + nargs="+", + help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", ) - general_args.add_argument( - "--num_workers", + uncertainty_args.add_argument( + "--evaluation_scores_path", + type=str, + help="Location to save the results of uncertainty evaluations.", + ) + uncertainty_args.add_argument( + "--uncertainty_dropout_p", + type=float, + default=0.1, + help="The probability to use for Monte Carlo dropout uncertainty estimation.", + ) + uncertainty_args.add_argument( + "--dropout_sampling_size", type=int, - metavar="INT", - help="Number of workers for the parallel data loading 0 means sequential", + default=10, + help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.", ) - general_args.add_argument( - "--no_cache", - type=bool, - metavar="BOOL", - default=False, - help="Turn off caching mol2graph computation", + uncertainty_args.add_argument( + "--calibration_interval_percentile", + type=float, + default=95, + help="Sets the percentile used in the calibration methods. 
Must be in the range (1,100).", ) - general_args.add_argument( - "--no_cache_mol", - type=bool, - metavar="BOOL", - default=False, - help="Whether to not cache the RDKit molecule for each SMILES string to reduce memory\ - usage cached by default", + uncertainty_args.add_argument( + "--regression_calibrator_metric", + type=str, + choices=["stdev", "interval"], + help="Regression calibrators can output either a stdev or an inverval.", ) - general_args.add_argument( - "--empty_cache", - type=bool, - metavar="BOOL", - help="Whether to empty all caches before training or predicting. This is necessary if\ - multiple jobs are run within a single script and the atom or bond features change", + + +def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: + files_args = parser.add_argument_group("Files") + interpret_args = parser.add_argument_group("Interpretation Configuration") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for interpretation.", ) files_args.add_argument( "--preds_path", @@ -1191,89 +1390,44 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: metavar="DIR", help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( - "--checkpoint_paths", - type=str, - metavar="FILE", - nargs="*", - help="Path to model checkpoint (.pt file)", - ) files_args.add_argument( "--data_path", type=str, metavar="FILE", help="Path to CSV file containing testing data for which predictions will be made", - default="", ) - files_args.add_argument( - "--test_path", - type=str, - metavar="FILE", - help="Path to CSV file containing testing data for which predictions will be made", - default="", - ) - files_args.add_argument( - "--features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to features to use in FNN (instead of features_generator)", - ) - files_args.add_argument( - "--atom_descriptors_path", - type=str, - metavar="FILE", - help="Path to the 
extra atom descriptors.", - ) - data_args.add_argument( - "--use_compound_names", - action="store_true", - default=False, - help="Use when test data file contains compound names in addition to SMILES strings", - ) - data_args.add_argument( - "--no_features_scaling", - action="store_true", - default=False, - help="Turn off scaling of features", - ) - data_args.add_argument( - "--max_data_size", + interpret_args.add_argument( + "--max_atoms", type=int, metavar="INT", - help="Maximum number of data points to load", + help="Maximum number of atoms to use for interpretation", ) - data_args.add_argument( - "--smiles_columns", - type=str, - metavar="STRING", - help="List of names of the columns containing SMILES strings.By default, uses the first\ - number_of_molecules columns.", - ) - data_args.add_argument( - "--number_of_molecules", + + interpret_args.add_argument( + "--min_atoms", type=int, metavar="INT", - help="Number of molecules in each input to the model.This must equal the length of\ - smiles_columns if not None", + help="Minimum number of atoms to use for interpretation", ) - data_args.add_argument( - "--atom_descriptors", - type=bool, - metavar="Bool", - help="Use or not atom descriptors", + interpret_args.add_argument( + "--prop_delta", + type=float, + metavar="FLOAT", + help="The minimum change in the property of interest that is considered significant", ) - - data_args.add_argument( - "--bond_features_size", + interpret_args.add_argument( + "--property_id", type=int, metavar="INT", - help="Size of the extra bond descriptors that will be used as bond features to featurize a\ - given molecule", + help="The index of the property of interest", ) - training_args.add_argument( - "--batch_size", type=int, metavar="INT", default=50, help="Batch size" + # write the argument for rollouts + interpret_args.add_argument( + "--rollout", + type=int, + metavar="INT", + help="The number of rollouts to use for interpretation", ) diff --git a/dfpl/utils.py b/dfpl/utils.py 
index db3d6ec1..ccf931df 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,12 +1,16 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union +import jsonpickle import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -14,7 +18,48 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. 
+ """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -30,23 +75,49 @@ def createDirectory(directory: str): if not os.path.exists(path): os.makedirs(path) +def parse_cli_list(value: str): + # Simple parser for lists passed as comma-separated values + return value.split(',') + +def parse_cli_boolean(cli_args, cli_arg_key): + # Determines boolean value based on command line presence + if cli_arg_key in cli_args: + return True # Presence of flag implies True + return False -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + cli_args = sys.argv[1:] # Skipping the script name itself + + with open(jsonFile, "r") as f: data = json.load(f) + + processed_cli_keys = [] # To track which CLI keys have been processed + for key, value in data.items(): if key not in ignore_elements: - if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") - arguments.extend(value) + cli_arg_key = f"--{key}" + if cli_arg_key in cli_args: + processed_cli_keys.append(cli_arg_key) + arg_index = cli_args.index(cli_arg_key) + 1 + if isinstance(value, bool): 
+ value = parse_cli_boolean(cli_args, cli_arg_key) + elif arg_index < len(cli_args): + cli_value = cli_args[arg_index] + if isinstance(value, list): + value = parse_cli_list(cli_value) + else: + value = cli_value # Override JSON value with command-line value + if isinstance(value, bool) and value: + arguments.append(cli_arg_key) + elif isinstance(value, list): + arguments.append(cli_arg_key) + arguments.extend(map(str, value)) # Ensure all elements are strings else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data - return arguments + arguments.extend([cli_arg_key, str(value)]) + return arguments def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): """ @@ -76,49 +147,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): return mol -def generate_scaffold( - mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True -) -> str: - """ - Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. - - :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. - :param include_chirality: Whether to include chirality in the computed scaffold. - :return: The Bemis-Murcko scaffold for the molecule. - """ - if isinstance(mol, str): - if mol.startswith("InChI="): - mol = inchi_to_mol(mol) - else: - mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) - elif isinstance(mol, tuple): - mol = mol[0] - scaffold = MurckoScaffold.MurckoScaffoldSmiles( - mol=mol, includeChirality=include_chirality - ) - - return scaffold - - -def scaffold_to_smiles( - mols: List[str], use_indices: bool = False -) -> Dict[str, Union[Set[str], Set[int]]]: - """ - Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). - :param mols: A list of SMILES. 
- :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than - mapping to the smiles string itself. This is necessary if there are duplicate smiles. - :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. - """ - scaffolds = defaultdict(set) - for i, mol in tqdm(enumerate(mols), total=len(mols)): - scaffold = generate_scaffold(mol) - if use_indices: - scaffolds[scaffold].add(i) - else: - scaffolds[scaffold].add(mol) - - return scaffolds # def inchi_to_mol(inchi: str) -> Chem.Mol: @@ -184,7 +212,49 @@ def weight_split( test_df = sorted_data.iloc[test_indices].reset_index(drop=True) return train_df, val_df, test_df +def generate_scaffold( + mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True +) -> str: + """ + Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. + :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. + :param include_chirality: Whether to include chirality in the computed scaffold. + :return: The Bemis-Murcko scaffold for the molecule. + """ + if isinstance(mol, str): + if mol.startswith("InChI="): + mol = inchi_to_mol(mol) + else: + mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) + elif isinstance(mol, tuple): + mol = mol[0] + scaffold = MurckoScaffold.MurckoScaffoldSmiles( + mol=mol, includeChirality=include_chirality + ) + + return scaffold + + +def scaffold_to_smiles( + mols: List[str], use_indices: bool = False +) -> Dict[str, Union[Set[str], Set[int]]]: + """ + Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). + :param mols: A list of SMILES. + :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than + mapping to the smiles string itself. This is necessary if there are duplicate smiles. 
+ :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. + """ + scaffolds = defaultdict(set) + for i, mol in tqdm(enumerate(mols), total=len(mols)): + scaffold = generate_scaffold(mol) + if use_indices: + scaffolds[scaffold].add(i) + else: + scaffolds[scaffold].add(mol) + + return scaffolds def ae_scaffold_split( data: pd.DataFrame, @@ -309,7 +379,7 @@ def log_scaffold_stats( targets = [ c for c in data.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] + if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",] ] # targets = data_set.iloc[:, 2:].values targets = data_set.loc[:, targets].values From 16a24f4c77a0a2510217e8c4dc96065740ede221 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 14:39:51 +0100 Subject: [PATCH 03/48] flaked and fixed predictgnn arg --- dfpl/__main__.py | 4 +--- dfpl/options.py | 13 ++++++++----- dfpl/utils.py | 22 ++++++++++++++++------ example/predictgnn.json | 1 + 4 files changed, 26 insertions(+), 14 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 8d035579..fe66eec8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -61,9 +61,7 @@ def interpretdmpnn(opts: options.GnnOptions) -> None: arguments = createArgsFromJson(jsonFile=opts.configFile) opts = chemprop.args.InterpretArgs().parse_args(arguments) - chemprop.interpret.interpret( - args=opts, save_to_csv=True - ) + chemprop.interpret.interpret(args=opts, save_to_csv=True) def train(opts: options.Options): diff --git a/dfpl/options.py b/dfpl/options.py index 85e245bc..5def434d 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,11 +3,11 @@ import argparse from dataclasses import dataclass from pathlib import Path -from typing import Optional, Literal, List +from typing import List, Literal, Optional import jsonpickle import torch -from chemprop.args import TrainArgs, PredictArgs, InterpretArgs +from chemprop.args import InterpretArgs, PredictArgs, TrainArgs from 
dfpl.utils import parseCmdArgs @@ -107,6 +107,7 @@ class GnnOptions(TrainArgs): wabTracking: bool = False split_sizes: List[float] = None # save_smiles_splits: bool = False + @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): # Initialize with JSON config if provided @@ -1234,7 +1235,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--checkpoint_path", type=str, metavar="FILE", - help="Path to model checkpoint (.pt file)" + help="Path to model checkpoint (.pt file)", ) # general_args.add_argument( # "--no_features_scaling", @@ -1318,10 +1319,12 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: ], help="Methods used for calibrating the uncertainty calculated with uncertainty method.", ) - uncertainty_args.add_argument("--individual_ensemble_predictions", + uncertainty_args.add_argument( + "--individual_ensemble_predictions", action="store_true", default=False, - help="Whether to save individual ensemble predictions.") + help="Whether to save individual ensemble predictions.", + ) uncertainty_args.add_argument( "--evaluation_methods", type=str, diff --git a/dfpl/utils.py b/dfpl/utils.py index ccf931df..338981c9 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -75,9 +75,11 @@ def createDirectory(directory: str): if not os.path.exists(path): os.makedirs(path) + def parse_cli_list(value: str): # Simple parser for lists passed as comma-separated values - return value.split(',') + return value.split(",") + def parse_cli_boolean(cli_args, cli_arg_key): # Determines boolean value based on command line presence @@ -85,6 +87,7 @@ def parse_cli_boolean(cli_args, cli_arg_key): return True # Presence of flag implies True return False + def createArgsFromJson(jsonFile: str): arguments = [] ignore_elements = ["py/object"] @@ -119,6 +122,7 @@ def createArgsFromJson(jsonFile: str): return arguments + def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): """ Builds an RDKit molecule from a 
SMILES string. @@ -147,10 +151,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): return mol - - -# def inchi_to_mol(inchi: str) -> Chem.Mol: -# return Chem.inchi.MolFromInchi(inchi) def smiles_to_mol(smiles: str) -> Chem.Mol: mol = Chem.MolFromSmiles(smiles) if mol is None: @@ -212,6 +212,8 @@ def weight_split( test_df = sorted_data.iloc[test_indices].reset_index(drop=True) return train_df, val_df, test_df + + def generate_scaffold( mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True ) -> str: @@ -256,6 +258,7 @@ def scaffold_to_smiles( return scaffolds + def ae_scaffold_split( data: pd.DataFrame, sizes: Tuple[float, float, float] = (0.8, 0, 0.2), @@ -379,7 +382,14 @@ def log_scaffold_stats( targets = [ c for c in data.columns - if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",] + if c + not in [ + "fp", + "morganfp", + "fpcompressed", + "id", + "smiles", + ] ] # targets = data_set.iloc[:, 2:].values targets = data_set.loc[:, targets].values diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..221622de 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,7 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", + "preds_path": "preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", "save_dir": "preds_dmpnn", "saving_name": "DMPNN_preds.csv" From 774b0a1afa30efa4b3afa14221d2cd4ad04df6b4 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 15:29:45 +0100 Subject: [PATCH 04/48] add json --- example/predictgnn.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/example/predictgnn.json b/example/predictgnn.json index 221622de..c76aa96c 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -3,6 +3,4 @@ "test_path": "tests/data/smiles.csv", "preds_path": "preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": 
"DMPNN_preds.csv" } \ No newline at end of file From 11fb829840f7c1bb5574cab4855e4de7fec78b38 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 15:40:21 +0100 Subject: [PATCH 05/48] remove comma --- example/predictgnn.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/predictgnn.json b/example/predictgnn.json index c76aa96c..1055230f 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -2,5 +2,5 @@ "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", "preds_path": "preds.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file From add3993e610e5f7e14b308a0e098d29669ee23a0 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 16:28:19 +0100 Subject: [PATCH 06/48] final fix --- .github/workflows/pr.yml | 12 +++++------- example/predictgnn.json | 2 +- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 27f43c34..47c709e6 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -94,12 +94,10 @@ jobs: echo "predict result directory missing" >&2 exit 1 fi - - echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv) - if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then - echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2 + + dfpl convert -f tests/data + if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + echo "not all csv files are converted to pickle ones" >&2 exit 1 fi - - - dfpl convert -f tests/data \ No newline at end of file + echo "All tests passed!" 
\ No newline at end of file diff --git a/example/predictgnn.json b/example/predictgnn.json index 1055230f..dfdd6a8d 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "preds_path": "preds.csv", + "preds_path": "preds_dmpnn/preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file From fa33f2f847761a76ba1602e340e07797144d8338 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 18:06:01 +0100 Subject: [PATCH 07/48] final fix --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 47c709e6..87173151 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -96,7 +96,7 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then echo "not all csv files are converted to pickle ones" >&2 exit 1 fi From 1f59fe911ed726d3b7f71073e6634c15b1e4e98a Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 18:18:11 +0100 Subject: [PATCH 08/48] final fix --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 87173151..dd97e1aa 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -96,7 +96,7 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then echo "not all csv files are converted to pickle ones" >&2 exit 1 fi From 
00fa01280f2c531726d09cc50dea652b3b6df201 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 7 Mar 2024 10:10:02 +0100 Subject: [PATCH 09/48] convert fix --- .github/workflows/pr.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index dd97e1aa..c854fb43 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -96,8 +96,8 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then - echo "not all csv files are converted to pickle ones" >&2 - exit 1 + if [ "$(find tests/data \( -name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + echo "not all csv files are converted to pickle ones" >&2 + exit 1 fi echo "All tests passed!" \ No newline at end of file From ace62d3d467f346b42e91891901bd6dbf600bd51 Mon Sep 17 00:00:00 2001 From: soulios <90351285+soulios@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:18:18 +0100 Subject: [PATCH 10/48] Update dfpl/options.py Co-authored-by: M Bernt --- dfpl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfpl/options.py b/dfpl/options.py index 5def434d..60112423 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -1358,7 +1358,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--regression_calibrator_metric", type=str, choices=["stdev", "interval"], - help="Regression calibrators can output either a stdev or an inverval.", + help="Regression calibrator output metric. 
Regression calibrators can output either a stdev or an inverval.", ) From 8a1b334b2f227e02cd625358c2911277f846ed76 Mon Sep 17 00:00:00 2001 From: soulios <90351285+soulios@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:30:24 +0100 Subject: [PATCH 11/48] Apply suggestions from code review Co-authored-by: M Bernt --- dfpl/options.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 60112423..9f304c1a 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -1258,35 +1258,35 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: files_args.add_argument( "--preds_path", type=str, - help="Path to CSV or PICKLE file where predictions will be saved.", + help="Predictions output file. CSV or PICKLE file where predictions will be saved.", ) files_args.add_argument( "--calibration_path", type=str, - help="Path to data file to be used for uncertainty calibration.", + help="Data file to be used for uncertainty calibration.", ) files_args.add_argument( "--calibration_features_path", type=str, nargs="+", - help="Path to features data to be used with the uncertainty calibration dataset.", + help="Feature data file to be used with the uncertainty calibration dataset.", ) files_args.add_argument("--calibration_phase_features_path", type=str, help="") files_args.add_argument( "--calibration_atom_descriptors_path", type=str, - help="Path to the extra atom descriptors.", + help="Extra atom descriptors file.", ) files_args.add_argument( "--calibration_bond_descriptors_path", type=str, - help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", + help="Extra bond descriptors file. 
Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", ) general_args.add_argument( "--drop_extra_columns", action="store_true", - help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.", + help="Keep only SMILES and new prediction columns in the test data files.", ) uncertainty_args.add_argument( @@ -1323,13 +1323,13 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--individual_ensemble_predictions", action="store_true", default=False, - help="Whether to save individual ensemble predictions.", + help="Save individual ensemble predictions.", ) uncertainty_args.add_argument( "--evaluation_methods", type=str, nargs="+", - help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", + help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", ) uncertainty_args.add_argument( "--evaluation_scores_path", @@ -1352,7 +1352,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--calibration_interval_percentile", type=float, default=95, - help="Sets the percentile used in the calibration methods. Must be in the range (1,100).", + help="Percentile used in calibration methods. 
Must be in the range (1,100).", ) uncertainty_args.add_argument( "--regression_calibrator_metric", From 3c92b98600fe67e87f4fe30d6ed016e70965ab49 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Fri, 8 Mar 2024 11:51:20 +0100 Subject: [PATCH 12/48] edited help in args --- dfpl/options.py | 204 +++++++++++++++++++++++++++++------------------- 1 file changed, 124 insertions(+), 80 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 9f304c1a..782e55d8 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -338,13 +338,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "--compressFeatures", action="store_true", - help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + help="Compresses the fingerprints. Needs a path of a trained autoencoder or needs the trainAC also set to True.", default=False, ) general_args.add_argument( "--enableMultiLabel", action="store_true", - help="Train multi-label classification model in addition to the individual models.", + help="Train multi-label classification model. 
individual models.", default=False, ) # Autoencoder Configuration @@ -360,7 +360,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "--ecModelDir", type=str, metavar="DIR", - help="The directory where the full model of the encoder will be saved", + help="The directory where the full encoder will be saved", default="example/results_train/AE_encoder/", ) autoencoder_args.add_argument( @@ -390,7 +390,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["relu", "selu"], - help="The activation function for the hidden layers in the autoencoder.", + help="The activation function of the autoencoder.", default="relu", ) autoencoder_args.add_argument( @@ -412,7 +412,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], - help="Set how the data is going to be split for the autoencoder", + help="Set how the data is split for the autoencoder", default="random", ) autoencoder_args.add_argument( @@ -435,14 +435,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], - help="Set how the data is going to be split for the feedforward neural network", + help="Set how the data is split for the feedforward neural network", default="random", ) training_args.add_argument( "--testSize", metavar="FLOAT", type=float, - help="Fraction of the dataset that should be used for testing. 
Value in [0,1].", + help="Fraction[0,1] of the dataset that should be used for testing", default=0.2, ) training_args.add_argument( @@ -450,7 +450,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "--kFolds", metavar="INT", type=int, - help="K that is used for K-fold cross-validation in the training procedure.", + help="Number of folds for cross-validation.", default=1, ) training_args.add_argument( @@ -466,28 +466,27 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--trainAC", action="store_true", - help="Choose to train or not, the autoencoder based on the input file", + help="Trains the autoencoder.", default=False, ) training_args.add_argument( "--trainFNN", action="store_false", - help="When called it deactivates the training.", + help="Deactivates the FNN training.", default=True, ) training_args.add_argument( "--sampleFractionOnes", metavar="FLOAT", type=float, - help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." - "only works if --sampleDown is enabled", + help="This is the desired fraction 1s/0s.only works if --sampleDown is enabled", default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, - help="Enable automatic down sampling of the 0 valued samples.", + help="Down sampling of the 0 valued samples.", default=False, ) training_args.add_argument( @@ -495,7 +494,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "--epochs", metavar="INT", type=int, - help="Number of epochs that should be used for the FNN training", + help="Number of epochs for the FNN training", default=100, ) # TODO CHECK IF ALL LOSSES MAKE SENSE HERE @@ -504,7 +503,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["mse", "bce", "focal"], - help="Loss function to use during training. 
mse - mean squared error, bce - binary cross entropy.", + help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy.", default="bce", ) # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? @@ -513,7 +512,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["Adam", "SGD"], - help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"', + help='Optimizer of the FNN.', default="Adam", ) training_args.add_argument( @@ -556,7 +555,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["relu", "selu"], - help="The activation function for hidden layers in the FNN.", + help="The activation function of the FNN.", default="relu", ) # Tracking Configuration @@ -564,23 +563,22 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "--aeWabTracking", metavar="BOOL", type=bool, - help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", + help="Track autoencoder performance via Weights & Biases.", default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, - help="Track FNN performance via Weights & Biases, see https://wandb.ai.", + help="Track FNN performance via Weights & Biases", default=False, ) tracking_args.add_argument( "--wabTarget", metavar="STRING", type=str, - choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], - help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default="AR", + help="Which endpoint to use for tracking performance via Weights & Biases. 
Should match the column name.", + default=None, ) @@ -598,7 +596,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "--configFile", metavar="FILE", type=str, - help="Input JSON file that contains all information for training/predicting.", + help="JSON file that contains all information for training/predicting.", ) files_args.add_argument( "-i", @@ -620,19 +618,17 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "--outputDir", metavar="DIR", type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", + help="Prefix of output directory. It will contain a log file and the file specified with --outputFile.", default="example/results_predict/", ) files_args.add_argument( "--outputFile", metavar="FILE", type=str, - help="Output .CSV file name which will contain one prediction per input line. " + help="Output csv file name which will contain one prediction per input line. " "Default: prefix of input file name.", default="results.csv", ) - # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? general_args.add_argument( "-t", "--type", @@ -648,34 +644,37 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: metavar="STR", type=str, choices=["topological", "MACCS"], - help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + help="The type of fingerprint to be generated/used in input file.", default="topological", ) files_args.add_argument( "--ecModelDir", type=str, metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", + help="The encoder dir where it is saved (if trainAE=True) or " + "it is loaded from (if trainAE=False). 
Provide a full path here.", default="", ) files_args.add_argument( "--ecWeightsFile", type=str, metavar="STR", - help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + help="The encoder file where it is loaded from, to compress the fingerprints.", default="", ) files_args.add_argument( "--fnnModelDir", type=str, metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", + help="The directory where the full model of the fnn is loaded from.", default="example/results_train/AR_saved_model", ) general_args.add_argument( - "-c", "--compressFeatures", action="store_true", default=False + "-c", + "--compressFeatures", + action="store_true", + help="Compresses the fingerprints if encoder dir/file is provided", + default=False ) ( general_args.add_argument( @@ -737,20 +736,20 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) # General arguments - general_args.add_argument("--split_key_molecule", type=int) - general_args.add_argument("--pytorch_seed", type=int) - general_args.add_argument("--cache_cutoff", type=float) - general_args.add_argument("--save_preds", type=bool) + general_args.add_argument("--split_key_molecule",help="The index of the key molecule used for splitting", type=int) + general_args.add_argument("--pytorch_seed",help="Seed for pytorch", type=int) + general_args.add_argument("--cache_cutoff",help="Maximum number of molecules in dataset to allow caching.", type=float) + general_args.add_argument("--save_preds",help="Saves test split predictions during training", type=bool) general_args.add_argument("--wabTracking", action="store_true", default=False) general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" ) - # general_args.add_argument( - # "--save_smiles_splits", - # action="store_true", - # default=False, - # help="Save smiles for each train/val/test splits for 
prediction convenience later", - # ) + general_args.add_argument( + "--save_smiles_splits", + action="store_true", + default=False, + help="Save smiles for each train/val/test splits", + ) general_args.add_argument( "--test", action="store_true", @@ -775,13 +774,13 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: type=int, metavar="INT", default=10, - help="The number of batches between each logging of the training loss", + help="The number of batches between each log", ) general_args.add_argument( - "--no_cache", + "--no_cache_mol", action="store_true", default=False, - help="Turn off caching mol2graph computation", + help="If raised, Turn off caching rdkit mols", ) # FILES ARGUMENTS @@ -790,7 +789,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "--configFile", metavar="FILE", type=str, - help="Input JSON file that contains all information for training/predicting.", + help="JSON file that contains all configuration for training/predicting.", ) files_args.add_argument( "--save_dir", @@ -950,7 +949,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: type=int, metavar="INT", default=3, - help="Number of classes when running multiclass classification", + help="Number of classes in multiclass classification", ) data_args.add_argument( "--split_type", @@ -993,6 +992,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--target_columns", type=str, + nargs="*", metavar="STRING", help="Name of the target columns", ) @@ -1000,11 +1000,12 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--ignore_columns", type=str, + nargs="*", metavar="STRING", help="Names of the columns to ignore", ) data_args.add_argument( - "--num_tasks", type=int, metavar="INT", help="NUmber of tasks" + "--num_tasks", type=int, metavar="INT", help="Number of tasks" ) data_args.add_argument( "--no_features_scaling", @@ -1102,35 +1103,71 @@ def parseTrainGnn(parser: 
argparse.ArgumentParser) -> None: default=2, help="Number of layers in FFN after MPN encoding", ) - model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING") + model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING",help="Freeze the loaded model") # Model arguments - model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") + # model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") model_args.add_argument( "--show_individual_scores", action="store_true", default=True, help="Show all scores for individual targets, not just average, at the end", ) - model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"]) - model_args.add_argument("--aggregation_norm", type=int) - model_args.add_argument("--explicit_h", type=bool, metavar="BOOL") - model_args.add_argument("--adding_h", type=bool, metavar="BOOL") + model_args.add_argument( + "--aggregation", + choices=["mean", "sum", "norm"], + help="Aggregation scheme for atomic vectors into molecular vectors") + model_args.add_argument( + "--aggregation_norm", + type=int, + help="For norm aggregation, number by which to divide summed up atomic features") + # model_args.add_argument("--explicit_h", type=bool, metavar="BOOL",help="A explicit hydrogen") + model_args.add_argument( + "--adding_h", + type=bool, + metavar="BOOL", + help="Adding hydrogen") # Training arguments - model_args.add_argument("--class_balance", type=bool, metavar="BOOL") - model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT") model_args.add_argument( - "--overwrite_default_atom_features", type=bool, metavar="BOOL" + "--class_balance", + type=bool, + metavar="BOOL", + help="Balances the classes across batches") + model_args.add_argument( + "--evidential_regularization", + type=float, + metavar="FLOAT", + help="Regularization parameter for evidential loss") + model_args.add_argument( + "--overwrite_default_atom_features", + type=bool, + metavar="BOOL", + 
help="Overwrites default atom features instead of concatenating" + ) + model_args.add_argument( + "--no_atom_descriptor_scaling", + type=bool, + metavar="BOOL") + model_args.add_argument( + "--overwrite_default_bond_features", + type=bool, + metavar="BOOL", + help="Overwrites default bond features instead of concatenating" ) - model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") model_args.add_argument( - "--overwrite_default_bond_features", type=bool, metavar="BOOL" + "--frzn_ffn_layers", + type=int, + metavar="INT", + help="Number of layers in FFN to freeze" ) - model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT") - model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") + # model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") # Training arguments training_args.add_argument( - "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" + "--epochs", + type=int, + metavar="INT", + default=30, + help="Number of epochs to run" ) training_args.add_argument( "--total_epochs", @@ -1140,7 +1177,11 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Number of total epochs to run", ) training_args.add_argument( - "--batch_size", type=int, metavar="INT", default=50, help="Batch size" + "--batch_size", + type=int, + metavar="INT", + default=50, + help="Batch size" ) training_args.add_argument( "--warmup_epochs", @@ -1196,7 +1237,12 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "dirichlet", ], ) - training_args.add_argument("--grad_clip", type=float) + training_args.add_argument( + "--grad_clip", + type=float, + metavar="FLOAT", + help="Gradient clipping value" + ) training_args.add_argument( "--metric", type=str, @@ -1237,23 +1283,17 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: metavar="FILE", help="Path to model checkpoint (.pt file)", ) - # general_args.add_argument( - # "--no_features_scaling", - # 
action="store_true", - # help="Turn on scaling of features", - # ) files_args.add_argument( "-f", "--configFile", type=str, metavar="FILE", - help="Path to a .json file containing arguments. Any arguments present in the config" - "file will override arguments specified via the command line or by the defaults.", + help="Path to a .json file containing arguments. CLI arguments will override these.", ) files_args.add_argument( "--test_path", type=str, - help="Path to CSV file containing testing data for which predictions will be made.", + help="Path to CSV file for which predictions will be made.", ) files_args.add_argument( "--preds_path", @@ -1275,12 +1315,13 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: files_args.add_argument( "--calibration_atom_descriptors_path", type=str, - help="Extra atom descriptors file.", + help="Extra atom descriptors file.", ) files_args.add_argument( "--calibration_bond_descriptors_path", type=str, - help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", + help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to " + "featurize a given molecule.", ) general_args.add_argument( @@ -1317,7 +1358,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "platt", "isotonic", ], - help="Methods used for calibrating the uncertainty calculated with uncertainty method.", + help="Methods used for calibrating the uncertainty.", ) uncertainty_args.add_argument( "--individual_ensemble_predictions", @@ -1329,7 +1370,9 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--evaluation_methods", type=str, nargs="+", - help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. 
Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", + help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes " + "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available " + "classification or multiclass metric.", ) uncertainty_args.add_argument( "--evaluation_scores_path", @@ -1346,7 +1389,8 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--dropout_sampling_size", type=int, default=10, - help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.", + help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout " + "used during training.", ) uncertainty_args.add_argument( "--calibration_interval_percentile", @@ -1397,7 +1441,7 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: "--data_path", type=str, metavar="FILE", - help="Path to CSV file containing testing data for which predictions will be made", + help="Path to CSV file for which predictions will be made", ) interpret_args.add_argument( "--max_atoms", From 40e6b0ba6c1044ad28d1a96fc039cbf1eefc474e Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Fri, 8 Mar 2024 12:06:32 +0100 Subject: [PATCH 13/48] flaked and blacked --- dfpl/options.py | 82 ++++++++++++++++++++++++------------------------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 782e55d8..1d041de6 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -106,6 +106,7 @@ class GnnOptions(TrainArgs): evaluation_scores_path: str = "" wabTracking: bool = False split_sizes: List[float] = None + # save_smiles_splits: bool = False @classmethod @@ -512,7 +513,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="STRING", type=str, choices=["Adam", "SGD"], - help='Optimizer of 
the FNN.', + help="Optimizer of the FNN.", default="Adam", ) training_args.add_argument( @@ -674,7 +675,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "--compressFeatures", action="store_true", help="Compresses the fingerprints if encoder dir/file is provided", - default=False + default=False, ) ( general_args.add_argument( @@ -736,10 +737,20 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) # General arguments - general_args.add_argument("--split_key_molecule",help="The index of the key molecule used for splitting", type=int) - general_args.add_argument("--pytorch_seed",help="Seed for pytorch", type=int) - general_args.add_argument("--cache_cutoff",help="Maximum number of molecules in dataset to allow caching.", type=float) - general_args.add_argument("--save_preds",help="Saves test split predictions during training", type=bool) + general_args.add_argument( + "--split_key_molecule", + type=int, + help="The index of the key molecule used for splitting", + ) + general_args.add_argument("--pytorch_seed", type=int, help="Seed for pytorch") + general_args.add_argument( + "--cache_cutoff", + type=float, + help="Maximum number of molecules in dataset to allow caching.", + ) + general_args.add_argument( + "--save_preds", help="Saves test split predictions during training", type=bool + ) general_args.add_argument("--wabTracking", action="store_true", default=False) general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" @@ -1103,8 +1114,9 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=2, help="Number of layers in FFN after MPN encoding", ) - model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING",help="Freeze the loaded model") - + model_args.add_argument( + "--checkpoint_frzn", type=str, metavar="STRING", help="Freeze the loaded model" + ) # Model arguments # model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") model_args.add_argument( @@ 
-1116,58 +1128,53 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: model_args.add_argument( "--aggregation", choices=["mean", "sum", "norm"], - help="Aggregation scheme for atomic vectors into molecular vectors") + help="Aggregation scheme for atomic vectors into molecular vectors", + ) model_args.add_argument( "--aggregation_norm", type=int, - help="For norm aggregation, number by which to divide summed up atomic features") + help="For norm aggregation, number by which to divide summed up atomic features", + ) # model_args.add_argument("--explicit_h", type=bool, metavar="BOOL",help="A explicit hydrogen") model_args.add_argument( - "--adding_h", - type=bool, - metavar="BOOL", - help="Adding hydrogen") + "--adding_h", type=bool, metavar="BOOL", help="Adding hydrogen" + ) # Training arguments model_args.add_argument( "--class_balance", type=bool, metavar="BOOL", - help="Balances the classes across batches") + help="Balances the classes across batches", + ) model_args.add_argument( "--evidential_regularization", type=float, metavar="FLOAT", - help="Regularization parameter for evidential loss") + help="Regularization parameter for evidential loss", + ) model_args.add_argument( "--overwrite_default_atom_features", type=bool, metavar="BOOL", - help="Overwrites default atom features instead of concatenating" + help="Overwrites default atom features instead of concatenating", ) - model_args.add_argument( - "--no_atom_descriptor_scaling", - type=bool, - metavar="BOOL") + model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") model_args.add_argument( "--overwrite_default_bond_features", type=bool, metavar="BOOL", - help="Overwrites default bond features instead of concatenating" + help="Overwrites default bond features instead of concatenating", ) model_args.add_argument( "--frzn_ffn_layers", type=int, metavar="INT", - help="Number of layers in FFN to freeze" + help="Number of layers in FFN to freeze", ) # 
model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") # Training arguments training_args.add_argument( - "--epochs", - type=int, - metavar="INT", - default=30, - help="Number of epochs to run" + "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" ) training_args.add_argument( "--total_epochs", @@ -1177,11 +1184,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Number of total epochs to run", ) training_args.add_argument( - "--batch_size", - type=int, - metavar="INT", - default=50, - help="Batch size" + "--batch_size", type=int, metavar="INT", default=50, help="Batch size" ) training_args.add_argument( "--warmup_epochs", @@ -1238,10 +1241,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ], ) training_args.add_argument( - "--grad_clip", - type=float, - metavar="FLOAT", - help="Gradient clipping value" + "--grad_clip", type=float, metavar="FLOAT", help="Gradient clipping value" ) training_args.add_argument( "--metric", @@ -1321,7 +1321,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--calibration_bond_descriptors_path", type=str, help="Extra bond descriptors file. Path to the extra bond descriptors that will be used as bond features to " - "featurize a given molecule.", + "featurize a given molecule.", ) general_args.add_argument( @@ -1371,8 +1371,8 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: type=str, nargs="+", help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes " - "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available " - "classification or multiclass metric.", + "targets. 
Available methods are [nll, miscalibration_area, ence, spearman] or any available " + "classification or multiclass metric.", ) uncertainty_args.add_argument( "--evaluation_scores_path", @@ -1390,7 +1390,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: type=int, default=10, help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout " - "used during training.", + "used during training.", ) uncertainty_args.add_argument( "--calibration_interval_percentile", From c03a32e306694ce7fb845d4047e215b49bf4f184 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:37:18 +0100 Subject: [PATCH 14/48] removed metavar from args with choices --- dfpl/options.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 1d041de6..5db0c837 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -300,7 +300,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-t", "--type", - metavar="STRING", type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", @@ -324,7 +323,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) general_args.add_argument( "--fpType", - metavar="STR", type=str, choices=["topological", "MACCS"], help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", @@ -366,7 +364,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeType", - metavar="STRING", type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", @@ -388,7 +385,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeActivationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function of the autoencoder.", @@ -410,7 +406,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeSplitType", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is split for the autoencoder", @@ -433,7 +428,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: # Training Configuration training_args.add_argument( "--split_type", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is split for the feedforward neural network", @@ -457,7 +451,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "-v", "--verbose", - metavar="INT", type=int, choices=[0, 1, 2], help="Verbosity level. O: No additional output, " @@ -501,7 +494,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", - metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy.", @@ -510,7 +502,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? 
training_args.add_argument( "--optimizer", - metavar="STRING", type=str, choices=["Adam", "SGD"], help="Optimizer of the FNN.", @@ -553,7 +544,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) training_args.add_argument( "--activationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function of the FNN.", @@ -633,7 +623,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-t", "--type", - metavar="STR", type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", @@ -642,7 +631,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-k", "--fpType", - metavar="STR", type=str, choices=["topological", "MACCS"], help="The type of fingerprint to be generated/used in input file.", @@ -680,7 +668,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: ( general_args.add_argument( "--aeType", - metavar="STRING", type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", @@ -699,7 +686,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: uncertainty_args.add_argument( "--uncertainty_method", type=str, - metavar="STRING", choices=[ "mve", "ensemble", @@ -717,7 +703,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: uncertainty_args.add_argument( "--calibration_method", type=str, - metavar="STRING", choices=[ "zscaling", "tscaling", @@ -949,7 +934,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--dataset_type", type=str, - metavar="STRING", choices=["classification", "regression", "multiclass"], help="Type of dataset, e.g. classification or regression." 
"This determines the loss function used during training.", @@ -965,7 +949,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--split_type", type=str, - metavar="STRING", default="random", choices=[ "random", @@ -1075,7 +1058,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: model_args.add_argument( "--activation", type=str, - metavar="STRING", default="ReLU", choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], help="Activation function", @@ -1226,7 +1208,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--loss_function", type=str, - metavar="STRING", choices=[ "mse", "bounded_mse", @@ -1246,7 +1227,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--metric", type=str, - metavar="STRING", default=None, choices=[ "auc", From ebaaacaebbb4e73a72101e95b7f5b11583b4d925 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:42:55 +0100 Subject: [PATCH 15/48] make literals optionals for None --- dfpl/options.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 5db0c837..d9834092 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -134,14 +134,14 @@ class PredictGnnOptions(PredictArgs): calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 - calibration_method: Literal[ + calibration_method: Optional[Literal[ "zscaling", "tscaling", "zelikman_interval", "mve_weighting", "platt", "isotonic", - ] = None + ]] = None calibration_path: str = None calibration_phase_features_path: str = None drop_extra_columns: bool = False @@ -151,10 +151,10 @@ class PredictGnnOptions(PredictArgs): # no_features_scaling: bool = True individual_ensemble_predictions: bool = False preds_path: str = None - regression_calibrator_metric: Literal["stdev", "interval"] = None + 
regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None test_path: str = None uncertainty_dropout_p: float = 0.1 - uncertainty_method: Literal[ + uncertainty_method: Optional[Literal[ "mve", "ensemble", "evidential_epistemic", @@ -162,7 +162,7 @@ class PredictGnnOptions(PredictArgs): "evidential_total", "classification", "dropout", - ] = None + ]] = None @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): From d6090a9fdf9841ccfe5c7d2ec241687aa0bba136 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:50:35 +0100 Subject: [PATCH 16/48] applied black --- dfpl/options.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index d9834092..2009fa76 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -134,14 +134,16 @@ class PredictGnnOptions(PredictArgs): calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 - calibration_method: Optional[Literal[ - "zscaling", - "tscaling", - "zelikman_interval", - "mve_weighting", - "platt", - "isotonic", - ]] = None + calibration_method: Optional[ + Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] + ] = None calibration_path: str = None calibration_phase_features_path: str = None drop_extra_columns: bool = False @@ -154,15 +156,17 @@ class PredictGnnOptions(PredictArgs): regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None test_path: str = None uncertainty_dropout_p: float = 0.1 - uncertainty_method: Optional[Literal[ - "mve", - "ensemble", - "evidential_epistemic", - "evidential_aleatoric", - "evidential_total", - "classification", - "dropout", - ]] = None + uncertainty_method: Optional[ + Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + 
"classification", + "dropout", + ] + ] = None @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): From cb3fa01599f9889092df0b3ece618b1c66497061 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 9 Apr 2024 13:11:29 +0200 Subject: [PATCH 17/48] rename some variables needed because of the specifics of the Galaxy tool generator --- dfpl/options.py | 416 ++++++++++++++++++++++++------------------------ 1 file changed, 207 insertions(+), 209 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 85e245bc..d599215d 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -240,11 +240,11 @@ def createCommandlineParser() -> argparse.ArgumentParser: parser_train.set_defaults(method="train") parseInputTrain(parser_train) - parser_predict = subparsers.add_parser( + parser_input_predict = subparsers.add_parser( "predict", help="Predict your data with existing models" ) - parser_predict.set_defaults(method="predict") - parseInputPredict(parser_predict) + parser_input_predict.set_defaults(method="predict") + parseInputPredict(parser_input_predict) parser_convert = subparsers.add_parser( "convert", help="Convert known data files to pickle serialization files" @@ -254,20 +254,20 @@ def createCommandlineParser() -> argparse.ArgumentParser: return parser -def parseInputTrain(parser: argparse.ArgumentParser) -> None: +def parseInputTrain(parser_train: argparse.ArgumentParser) -> None: """ Parse the input arguments. :return: A namespace object built up from attributes parsed out of the cmd line. 
""" # Create argument groups - general_args = parser.add_argument_group("Model Configuration") - autoencoder_args = parser.add_argument_group("Autoencoder Configuration") - training_args = parser.add_argument_group("Training Configuration") - tracking_args = parser.add_argument_group("Tracking Configuration") + input_tain_general_args = parser_train.add_argument_group("Model Configuration") + input_tain_autoencoder_args = parser_train.add_argument_group("Autoencoder Configuration") + input_tain_training_args = parser_train.add_argument_group("Training Configuration") + input_tain_tracking_args = parser_train.add_argument_group("Tracking Configuration") # Model Configuration - general_args.add_argument( + input_tain_general_args.add_argument( "-f", "--configFile", metavar="FILE", @@ -275,7 +275,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Input JSON file that contains all information for training/predicting.", default="example/train.json", ) - general_args.add_argument( + input_tain_general_args.add_argument( "-i", "--inputFile", metavar="FILE", @@ -284,7 +284,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "comma separated CSV format.The first column should be smiles.", default="tests/data/smiles.csv", ) - general_args.add_argument( + input_tain_general_args.add_argument( "-o", "--outputDir", metavar="DIR", @@ -295,7 +295,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) # TODO CHECK WHAT IS TYPE DOING? - general_args.add_argument( + input_tain_general_args.add_argument( "-t", "--type", metavar="STRING", @@ -304,7 +304,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Type of the chemical representation. 
Choices: 'fp', 'smiles'.", default="fp", ) - general_args.add_argument( + input_tain_general_args.add_argument( "-thr", "--threshold", type=float, @@ -312,7 +312,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Threshold for binary classification.", default=0.5, ) - general_args.add_argument( + input_tain_general_args.add_argument( "-gpu", "--gpu", metavar="INT", @@ -320,7 +320,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Select which gpu to use by index. If not available, leave empty", default=None, ) - general_args.add_argument( + input_tain_general_args.add_argument( "--fpType", metavar="STR", type=str, @@ -328,26 +328,26 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", default="topological", ) - general_args.add_argument( + input_tain_general_args.add_argument( "--fpSize", type=int, help="Length of the fingerprint that should be generated.", default=2048, ) - general_args.add_argument( + input_tain_general_args.add_argument( "--compressFeatures", action="store_true", help="Should the fingerprints be compressed or not. 
Needs a path of a trained autoencoder or needs the trainAC also set to True.", default=False, ) - general_args.add_argument( + input_tain_general_args.add_argument( "--enableMultiLabel", action="store_true", help="Train multi-label classification model in addition to the individual models.", default=False, ) # Autoencoder Configuration - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "-a", "--ecWeightsFile", type=str, @@ -355,14 +355,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="The .hdf5 file of a trained encoder", default="", ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", default="example/results_train/AE_encoder/", ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeType", metavar="STRING", type=str, @@ -370,21 +370,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Autoencoder type, variational or deterministic.", default="deterministic", ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", default=100, ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", default=512, ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeActivationFunction", metavar="STRING", type=str, @@ -392,21 +392,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="The activation function for the hidden layers in the autoencoder.", default="relu", ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", default=0.001, ) - 
autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", default=0.96, ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--aeSplitType", metavar="STRING", type=str, @@ -414,7 +414,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Set how the data is going to be split for the autoencoder", default="random", ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "-d", "--encFPSize", metavar="INT", @@ -422,14 +422,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Size of encoded fingerprint (z-layer of autoencoder).", default=256, ) - autoencoder_args.add_argument( + input_tain_autoencoder_args.add_argument( "--visualizeLatent", action="store_true", help="UMAP the latent space for exploration", default=False, ) # Training Configuration - training_args.add_argument( + input_tain_training_args.add_argument( "--split_type", metavar="STRING", type=str, @@ -437,14 +437,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Set how the data is going to be split for the feedforward neural network", default="random", ) - training_args.add_argument( + input_tain_training_args.add_argument( "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. 
Value in [0,1].", default=0.2, ) - training_args.add_argument( + input_tain_training_args.add_argument( "-K", "--kFolds", metavar="INT", @@ -452,7 +452,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="K that is used for K-fold cross-validation in the training procedure.", default=1, ) - training_args.add_argument( + input_tain_training_args.add_argument( "-v", "--verbose", metavar="INT", @@ -462,19 +462,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: + "1: Some additional output, 2: full additional output", default=2, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--trainAC", action="store_true", help="Choose to train or not, the autoencoder based on the input file", default=False, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--trainFNN", action="store_false", help="When called it deactivates the training.", default=True, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--sampleFractionOnes", metavar="FLOAT", type=float, @@ -482,14 +482,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "only works if --sampleDown is enabled", default=0.5, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", default=False, ) - training_args.add_argument( + input_tain_training_args.add_argument( "-e", "--epochs", metavar="INT", @@ -498,7 +498,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=100, ) # TODO CHECK IF ALL LOSSES MAKE SENSE HERE - training_args.add_argument( + input_tain_training_args.add_argument( "--lossFunction", metavar="STRING", type=str, @@ -507,7 +507,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default="bce", ) # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? 
- training_args.add_argument( + input_tain_training_args.add_argument( "--optimizer", metavar="STRING", type=str, @@ -515,42 +515,42 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"', default="Adam", ) - training_args.add_argument( + input_tain_training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", default=128, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", default=0.001, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", default=0.2, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", default=0.000022, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--learningRateDecay", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", default=0.96, ) - training_args.add_argument( + input_tain_training_args.add_argument( "--activationFunction", metavar="STRING", type=str, @@ -559,21 +559,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default="relu", ) # Tracking Configuration - tracking_args.add_argument( + input_tain_tracking_args.add_argument( "--aeWabTracking", metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", default=False, ) - tracking_args.add_argument( + input_tain_tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", default=False, ) - tracking_args.add_argument( + input_tain_tracking_args.add_argument( "--wabTarget", metavar="STRING", type=str, @@ -583,23 
+583,23 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: +def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None: """ Parse the input arguments. :return: A namespace object built up from attributes parsed out of the cmd line. """ - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( + input_predict_general_args = parser_input_predict.add_argument_group("General Configuration") + input_predict_files_args = parser_input_predict.add_argument_group("Files") + input_predict_files_args.add_argument( "-f", "--configFile", metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", ) - files_args.add_argument( + input_predict_files_args.add_argument( "-i", "--inputFile", metavar="FILE", @@ -614,7 +614,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "A header is expected and respective column names are used.", default="tests/data/smiles.csv", ) - files_args.add_argument( + input_predict_files_args.add_argument( "-o", "--outputDir", metavar="DIR", @@ -623,7 +623,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "with --outputFile.", default="example/results_predict/", ) - files_args.add_argument( + input_predict_files_args.add_argument( "--outputFile", metavar="FILE", type=str, @@ -632,7 +632,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: default="results.csv", ) # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? - general_args.add_argument( + input_predict_general_args.add_argument( "-t", "--type", metavar="STR", @@ -641,7 +641,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: help="Type of the chemical representation. 
Choices: 'fp', 'smiles'.", default="fp", ) - general_args.add_argument( + input_predict_general_args.add_argument( "-k", "--fpType", metavar="STR", @@ -650,7 +650,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", default="topological", ) - files_args.add_argument( + input_predict_files_args.add_argument( "--ecModelDir", type=str, metavar="DIR", @@ -658,14 +658,14 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "loaded from (if trainAE=False). Provide a full path here.", default="", ) - files_args.add_argument( + input_predict_files_args.add_argument( "--ecWeightsFile", type=str, metavar="STR", help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", default="", ) - files_args.add_argument( + input_predict_files_args.add_argument( "--fnnModelDir", type=str, metavar="DIR", @@ -673,29 +673,27 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "Provide a full path here.", default="example/results_train/AR_saved_model", ) - general_args.add_argument( + input_predict_general_args.add_argument( "-c", "--compressFeatures", action="store_true", default=False ) - ( - general_args.add_argument( - "--aeType", - metavar="STRING", - type=str, - choices=["variational", "deterministic"], - help="Autoencoder type, variational or deterministic.", - default="deterministic", - ) - ) - - -def parseTrainGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") - files_args = parser.add_argument_group("Files") - model_args = parser.add_argument_group("Model arguments") - training_args = parser.add_argument_group("Training Configuration") - uncertainty_args = parser.add_argument_group("Uncertainty 
Configuration") - uncertainty_args.add_argument( + input_predict_general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) + + +def parseTrainGnn(parser_train_gnn: argparse.ArgumentParser) -> None: + train_gnn_general_args = parser_train_gnn.add_argument_group("General Configuration") + train_gnn_data_args = parser_train_gnn.add_argument_group("Data Configuration") + train_gnn_files_args = parser_train_gnn.add_argument_group("Files") + train_gnn_model_args = parser_train_gnn.add_argument_group("Model arguments") + train_gnn_training_args = parser_train_gnn.add_argument_group("Training Configuration") + train_gnn_uncertainty_args = parser_train_gnn.add_argument_group("Uncertainty Configuration") + train_gnn_uncertainty_args.add_argument( "--uncertainty_method", type=str, metavar="STRING", @@ -713,7 +711,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default="none", ) # Uncertainty arguments - uncertainty_args.add_argument( + train_gnn_uncertainty_args.add_argument( "--calibration_method", type=str, metavar="STRING", @@ -728,7 +726,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Method to use for calibration", default="none", ) - uncertainty_args.add_argument( + train_gnn_uncertainty_args.add_argument( "--calibration_path", type=str, metavar="FILE", @@ -736,47 +734,47 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) # General arguments - general_args.add_argument("--split_key_molecule", type=int) - general_args.add_argument("--pytorch_seed", type=int) - general_args.add_argument("--cache_cutoff", type=float) - general_args.add_argument("--save_preds", type=bool) - general_args.add_argument("--wabTracking", action="store_true", default=False) - general_args.add_argument( + train_gnn_general_args.add_argument("--split_key_molecule", type=int) + 
train_gnn_general_args.add_argument("--pytorch_seed", type=int) + train_gnn_general_args.add_argument("--cache_cutoff", type=float) + train_gnn_general_args.add_argument("--save_preds", type=bool) + train_gnn_general_args.add_argument("--wabTracking", action="store_true", default=False) + train_gnn_general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" ) - # general_args.add_argument( + # train_gnn_general_args.add_argument( # "--save_smiles_splits", # action="store_true", # default=False, # help="Save smiles for each train/val/test splits for prediction convenience later", # ) - general_args.add_argument( + train_gnn_general_args.add_argument( "--test", action="store_true", default=False, help="Whether to skip training and only test the model", ) - general_args.add_argument( + train_gnn_general_args.add_argument( "--gpu", type=int, choices=list(range(torch.cuda.device_count())), help="Which GPU to use", ) - general_args.add_argument("--save", type=bool) - general_args.add_argument( + train_gnn_general_args.add_argument("--save", type=bool) + train_gnn_general_args.add_argument( "--quiet", action="store_true", default=False, help="Skip non-essential print statements", ) - general_args.add_argument( + train_gnn_general_args.add_argument( "--log_frequency", type=int, metavar="INT", default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( + train_gnn_general_args.add_argument( "--no_cache", action="store_true", default=False, @@ -784,21 +782,21 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) # FILES ARGUMENTS - files_args.add_argument( + train_gnn_files_args.add_argument( "-f", "--configFile", metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--save_dir", type=str, metavar="DIR", default="./ckpt/", help="Directory where model checkpoints 
will be saved", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--checkpoint_dir", type=str, metavar="DIR", @@ -806,14 +804,14 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Directory from which to load model checkpoints" "(walks directory and ensembles all models that are found)", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--checkpoint_path", type=str, metavar="FILE", default=None, help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--checkpoint_paths", type=str, metavar="FILE", @@ -821,73 +819,73 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=None, help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_val_path", type=str, metavar="FILE", help="Path to separate val set, optional", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_val_features_path", type=str, metavar="FILE", nargs="*", help="Path to file with features for separate val set", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_test_path", type=str, metavar="FILE", help="Path to separate test set, optional", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_test_features_path", type=str, metavar="FILE", nargs="*", help="Path to file with features for separate test set", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--folds_file", type=str, metavar="FILE", default=None, help="Optional file of fold labels", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--val_fold_index", type=int, metavar="INT", default=None, help="Which fold to use as val for cross val", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--test_fold_index", type=int, metavar="INT", default=None, help="Which fold to use as test for cross val", ) - files_args.add_argument( + 
train_gnn_files_args.add_argument( "--crossval_index_dir", type=str, metavar="DIR", help="Directory in which to find cross validation index files", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--crossval_index_file", type=str, metavar="FILE", help="Indices of files to use as train/val/test" "Overrides --num_folds and --seed.", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--data_weights_path", type=str, metavar="FILE", help="Path where the data weight are saved", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--features_path", type=str, metavar="FILE", @@ -895,47 +893,47 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Path to features to use in FNN (instead of features_generator)", ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_val_phase_features_path", type=str, metavar="FILE" ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_test_phase_features_path", type=str, metavar="FILE" ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_val_atom_descriptors_path", type=str, metavar="FILE" ) - files_args.add_argument( + train_gnn_files_args.add_argument( "--separate_test_atom_descriptors_path", type=str, metavar="FILE" ) # Data related arguments - data_args.add_argument( + train_gnn_data_args.add_argument( "--data_path", type=str, metavar="FILE", help="Path to data CSV file", default="", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--use_compound_names", action="store_true", default=False, help="Use when test data file contains compound names in addition to SMILES strings", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--max_data_size", type=int, metavar="INT", help="Maximum number of data points to load", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--features_only", action="store_true", default=False, help="Use only the additional features in an 
FFN, no graph network", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--dataset_type", type=str, metavar="STRING", @@ -944,14 +942,14 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "This determines the loss function used during training.", default="regression", ) # classification - data_args.add_argument( + train_gnn_data_args.add_argument( "--multiclass_num_classes", type=int, metavar="INT", default=3, help="Number of classes when running multiclass classification", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--split_type", type=str, metavar="STRING", @@ -965,7 +963,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ], help="Method of splitting the data into train/val/test", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--split_sizes", type=float, metavar="FLOAT", @@ -974,7 +972,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: help="Split proportions for train/validation/test sets", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--seed", type=int, default=0, @@ -982,42 +980,42 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "When `num_folds` > 1, the first fold uses this seed and all" "subsequent folds add 1 to the seed.", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--smiles_columns", type=str, metavar="STRING", help="Name of the smiles columns", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--target_columns", type=str, metavar="STRING", help="Name of the target columns", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--ignore_columns", type=str, metavar="STRING", help="Names of the columns to ignore", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--num_tasks", type=int, metavar="INT", help="NUmber of tasks" ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--no_features_scaling", action="store_true", default=False, help="Turn off scaling 
of features", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--features_scaling", action="store_true", default=False, help="Turn on scaling of features", ) - data_args.add_argument( + train_gnn_data_args.add_argument( "--use_input_features", type=str, metavar="STRING", @@ -1025,41 +1023,41 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) # Model arguments - model_args.add_argument( + train_gnn_model_args.add_argument( "--ensemble_size", type=int, metavar="INT", default=1, help="Number of models in ensemble", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--hidden_size", type=int, metavar="INT", default=300, help="Dimensionality of hidden layers in MPN", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--bias", action="store_true", default=False, help="Whether to add bias to linear layers", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--depth", type=int, metavar="INT", default=3, help="Number of message passing steps", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--dropout", type=float, metavar="FLOAT", default=0.0, help="Dropout probability", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--activation", type=str, metavar="STRING", @@ -1067,81 +1065,81 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], help="Activation function", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--undirected", action="store_true", default=False, help="Undirected edges (always sum the two relevant bond vectors)", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--ffn_hidden_size", type=int, metavar="INT", default=2, help="Hidden dim for higher-capacity FFN (defaults to hidden_size)", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--ffn_num_layers", type=int, metavar="INT", default=2, help="Number of layers in FFN after MPN 
encoding", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--atom_messages", action="store_true", default=False, help="Use messages on atoms instead of messages on bonds", ) - model_args.add_argument( + train_gnn_model_args.add_argument( "--num_lrs", type=int, metavar="INT", default=2, help="Number of layers in FFN after MPN encoding", ) - model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING") + train_gnn_model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING") # Model arguments - model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") - model_args.add_argument( + train_gnn_model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument( "--show_individual_scores", action="store_true", default=True, help="Show all scores for individual targets, not just average, at the end", ) - model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"]) - model_args.add_argument("--aggregation_norm", type=int) - model_args.add_argument("--explicit_h", type=bool, metavar="BOOL") - model_args.add_argument("--adding_h", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"]) + train_gnn_model_args.add_argument("--aggregation_norm", type=int) + train_gnn_model_args.add_argument("--explicit_h", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument("--adding_h", type=bool, metavar="BOOL") # Training arguments - model_args.add_argument("--class_balance", type=bool, metavar="BOOL") - model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT") - model_args.add_argument( + train_gnn_model_args.add_argument("--class_balance", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT") + train_gnn_model_args.add_argument( "--overwrite_default_atom_features", type=bool, metavar="BOOL" ) - 
model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") - model_args.add_argument( + train_gnn_model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument( "--overwrite_default_bond_features", type=bool, metavar="BOOL" ) - model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT") - model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT") + train_gnn_model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") # Training arguments - training_args.add_argument( + train_gnn_training_args.add_argument( "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--total_epochs", type=int, metavar="INT", default=30, help="Number of total epochs to run", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--batch_size", type=int, metavar="INT", default=50, help="Batch size" ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--warmup_epochs", type=int, metavar="INT", @@ -1150,35 +1148,35 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "init_lr to max_lr. 
Afterwards, learning rate decreases exponentially" "from max_lr to final_lr.", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--init_lr", type=float, metavar="FLOAT", default=1e-4, help="Initial learning rate", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--max_lr", type=float, metavar="FLOAT", default=1e-3, help="Maximum learning rate", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--final_lr", type=float, metavar="FLOAT", default=1e-4, help="Final learning rate", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--extra_metrics", type=str, metavar="STRING", nargs="*", help="Extra metrics to use", ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--loss_function", type=str, metavar="STRING", @@ -1195,8 +1193,8 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "dirichlet", ], ) - training_args.add_argument("--grad_clip", type=float) - training_args.add_argument( + train_gnn_training_args.add_argument("--grad_clip", type=float) + train_gnn_training_args.add_argument( "--metric", type=str, metavar="STRING", @@ -1216,7 +1214,7 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: "(loss is determined by the `dataset_type` argument)." 
'Note: Defaults to "auc" for classification and "rmse" for regression.', ) - training_args.add_argument( + train_gnn_training_args.add_argument( "--num_folds", type=int, metavar="INT", @@ -1225,23 +1223,23 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parsePredictGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - uncertainty_args = parser.add_argument_group("Uncertainty Configuration") +def parsePredictGnn(parser_predict_gnn: argparse.ArgumentParser) -> None: + predict_gnn_general_args = parser_predict_gnn.add_argument_group("General Configuration") + predict_gnn_files_args = parser_predict_gnn.add_argument_group("Files") + predict_gnn_uncertainty_args = parser_predict_gnn.add_argument_group("Uncertainty Configuration") - general_args.add_argument( + predict_gnn_general_args.add_argument( "--checkpoint_path", type=str, metavar="FILE", help="Path to model checkpoint (.pt file)" ) - # general_args.add_argument( + # predict_gnn_general_args.add_argument( # "--no_features_scaling", # action="store_true", # help="Turn on scaling of features", # ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "-f", "--configFile", type=str, @@ -1249,46 +1247,46 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: help="Path to a .json file containing arguments. 
Any arguments present in the config" "file will override arguments specified via the command line or by the defaults.", ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "--test_path", type=str, help="Path to CSV file containing testing data for which predictions will be made.", ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "--preds_path", type=str, help="Path to CSV or PICKLE file where predictions will be saved.", ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "--calibration_path", type=str, help="Path to data file to be used for uncertainty calibration.", ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "--calibration_features_path", type=str, nargs="+", help="Path to features data to be used with the uncertainty calibration dataset.", ) - files_args.add_argument("--calibration_phase_features_path", type=str, help="") - files_args.add_argument( + predict_gnn_files_args.add_argument("--calibration_phase_features_path", type=str, help="") + predict_gnn_files_args.add_argument( "--calibration_atom_descriptors_path", type=str, help="Path to the extra atom descriptors.", ) - files_args.add_argument( + predict_gnn_files_args.add_argument( "--calibration_bond_descriptors_path", type=str, help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", ) - general_args.add_argument( + predict_gnn_general_args.add_argument( "--drop_extra_columns", action="store_true", help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--uncertainty_method", type=str, choices=[ @@ -1304,7 +1302,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: ], help="The method of calculating uncertainty.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--calibration_method", type=str, 
nargs="+", @@ -1318,40 +1316,40 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: ], help="Methods used for calibrating the uncertainty calculated with uncertainty method.", ) - uncertainty_args.add_argument("--individual_ensemble_predictions", + predict_gnn_uncertainty_args.add_argument("--individual_ensemble_predictions", action="store_true", default=False, help="Whether to save individual ensemble predictions.") - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--evaluation_methods", type=str, nargs="+", help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--evaluation_scores_path", type=str, help="Location to save the results of uncertainty evaluations.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--uncertainty_dropout_p", type=float, default=0.1, help="The probability to use for Monte Carlo dropout uncertainty estimation.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--dropout_sampling_size", type=int, default=10, help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--calibration_interval_percentile", type=float, default=95, help="Sets the percentile used in the calibration methods. 
Must be in the range (1,100).", ) - uncertainty_args.add_argument( + predict_gnn_uncertainty_args.add_argument( "--regression_calibrator_metric", type=str, choices=["stdev", "interval"], @@ -1359,24 +1357,24 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: - files_args = parser.add_argument_group("Files") - interpret_args = parser.add_argument_group("Interpretation Configuration") - files_args.add_argument( +def parseInterpretGnn(parser_interpret_gnn: argparse.ArgumentParser) -> None: + interpret_gnn_files_args = parser_interpret_gnn.add_argument_group("Files") + interpret_gnn_interpret_args = parser_interpret_gnn.add_argument_group("Interpretation Configuration") + interpret_gnn_files_args.add_argument( "-f", "--configFile", metavar="FILE", type=str, help="Input JSON file that contains all information for interpretation.", ) - files_args.add_argument( + interpret_gnn_files_args.add_argument( "--preds_path", type=str, metavar="FILE", help="Path to CSV file where predictions will be saved", default="", ) - files_args.add_argument( + interpret_gnn_files_args.add_argument( "--checkpoint_dir", type=str, metavar="DIR", @@ -1384,46 +1382,46 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: "(walks directory and ensembles all models that are found)", default="./ckpt", ) - files_args.add_argument( + interpret_gnn_files_args.add_argument( "--checkpoint_path", type=str, metavar="DIR", help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( + interpret_gnn_files_args.add_argument( "--data_path", type=str, metavar="FILE", help="Path to CSV file containing testing data for which predictions will be made", ) - interpret_args.add_argument( + interpret_gnn_interpret_args.add_argument( "--max_atoms", type=int, metavar="INT", help="Maximum number of atoms to use for interpretation", ) - interpret_args.add_argument( + interpret_gnn_interpret_args.add_argument( 
"--min_atoms", type=int, metavar="INT", help="Minimum number of atoms to use for interpretation", ) - interpret_args.add_argument( + interpret_gnn_interpret_args.add_argument( "--prop_delta", type=float, metavar="FLOAT", help="The minimum change in the property of interest that is considered significant", ) - interpret_args.add_argument( + interpret_gnn_interpret_args.add_argument( "--property_id", type=int, metavar="INT", help="The index of the property of interest", ) # write the argument for rollouts - interpret_args.add_argument( + interpret_gnn_interpret_args.add_argument( "--rollout", type=int, metavar="INT", @@ -1431,13 +1429,13 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputConvert(parser: argparse.ArgumentParser) -> None: +def parseInputConvert(parser_convert: argparse.ArgumentParser) -> None: """ Parse the input arguments. :return: A namespace object built up from attributes parsed out of the cmd line. """ - parser.add_argument( + parser_convert.add_argument( "-f", metavar="FILE", type=str, From e87be1bf8161a66d9588873832ff47f126847210 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 9 Apr 2024 15:27:12 +0200 Subject: [PATCH 18/48] fixup --- dfpl/options.py | 1 - 1 file changed, 1 deletion(-) diff --git a/dfpl/options.py b/dfpl/options.py index fe3e1bb9..a9e56102 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -676,7 +676,6 @@ def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None: help="Autoencoder type, variational or deterministic.", default="deterministic", ) - ) def parseTrainGnn(parser_train_gnn: argparse.ArgumentParser) -> None: From 8b0af64fa0fd75e1f7c1314cd779c0691df61e4b Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 12:38:02 +0200 Subject: [PATCH 19/48] removed paths from default args and fixed creating args from json and cmd lin --- dfpl/options.py | 24 ++++++++++++------------ dfpl/utils.py | 26 ++++++++++++++++++++++---- example/train.json | 2 +- 
3 files changed, 35 insertions(+), 17 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index a9e56102..9be52249 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -19,12 +19,12 @@ class Options: """ configFile: str = None - inputFile: str = "tests/data/smiles.csv" - outputDir: str = "example/results_train/" # changes according to mode - outputFile: str = "results.csv" + inputFile: str = "" + outputDir: str = "" # changes according to mode + outputFile: str = "" ecWeightsFile: str = "" - ecModelDir: str = "example/results_train/AE_encoder/" - fnnModelDir: str = "example/results_train/AR_saved_model/" + ecModelDir: str = "" + fnnModelDir: str = "" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" epochs: int = 100 @@ -85,8 +85,8 @@ class GnnOptions(TrainArgs): total_epochs: int = 30 save: bool = True - configFile: str = "./example/traingnn.json" - data_path: str = "./example/data/tox21.csv" + configFile: str = "" + data_path: str = "" use_compound_names: bool = False save_dir: str = "" no_cache: bool = False @@ -96,13 +96,13 @@ class GnnOptions(TrainArgs): num_lrs: int = 2 minimize_score: bool = False num_tasks: int = 12 - preds_path: str = "./tox21dmpnn.csv" + preds_path: str = "" test_path: str = "" save_preds: bool = True - calibration_method: str = "none" - uncertainty_method: str = "none" + calibration_method: str = "" + uncertainty_method: str = "" calibration_path: str = "" - evaluation_methods: str = "none" + evaluation_methods: str = "" evaluation_scores_path: str = "" wabTracking: bool = False split_sizes: List[float] = None @@ -130,7 +130,7 @@ class PredictGnnOptions(PredictArgs): Dataclass to hold all options used for training the graph models """ - configFile: str = "./example/predictgnn.json" + configFile: str = "" calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 diff --git a/dfpl/utils.py b/dfpl/utils.py index 338981c9..94742741 
100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -88,7 +88,7 @@ def parse_cli_boolean(cli_args, cli_arg_key): return False -def createArgsFromJson(jsonFile: str): +def createArgsFromJson(jsonFile: str) -> List[str]: arguments = [] ignore_elements = ["py/object"] cli_args = sys.argv[1:] # Skipping the script name itself @@ -106,19 +106,37 @@ def createArgsFromJson(jsonFile: str): arg_index = cli_args.index(cli_arg_key) + 1 if isinstance(value, bool): value = parse_cli_boolean(cli_args, cli_arg_key) - elif arg_index < len(cli_args): + elif arg_index < len(cli_args) and not cli_args[arg_index].startswith('--'): cli_value = cli_args[arg_index] if isinstance(value, list): value = parse_cli_list(cli_value) else: value = cli_value # Override JSON value with command-line value - if isinstance(value, bool) and value: - arguments.append(cli_arg_key) + if isinstance(value, bool): + if value: + arguments.append(cli_arg_key) elif isinstance(value, list): arguments.append(cli_arg_key) arguments.extend(map(str, value)) # Ensure all elements are strings else: arguments.extend([cli_arg_key, str(value)]) + i = 0 + while i < len(cli_args): + arg = cli_args[i] + if arg.startswith("--"): + key = arg.lstrip("--") + if key not in data: + value = True if i + 1 >= len(cli_args) or cli_args[i + 1].startswith("--") else cli_args[i + 1] + if isinstance(value, bool): + if value: + arguments.append(arg) + else: + arguments.extend([arg, str(value)]) + i += 1 if isinstance(value, bool) else 2 + else: + i += 1 + else: + i += 1 return arguments diff --git a/example/train.json b/example/train.json index 62f2abb4..bcf212a7 100755 --- a/example/train.json +++ b/example/train.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/S_dataset.csv", + "inputFile": "tests/data/smiles.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", "ecWeightsFile": "random_autoencoder.hdf5", From efef88a918ea4661a79ab4c621bebed279bb684c Mon Sep 17 00:00:00 
2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 13:59:16 +0100 Subject: [PATCH 20/48] only argparse --- dfpl/__main__.py | 67 +++-- dfpl/options.py | 620 +++++++++++++++++++++++++++-------------------- dfpl/utils.py | 146 ++++++----- 3 files changed, 480 insertions(+), 353 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index aada91a3..2527bb11 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,12 +1,10 @@ import dataclasses import logging -import os.path -import pathlib +import os from argparse import Namespace from os import path -import chemprop as cp -import pandas as pd +import chemprop from keras.models import load_model from dfpl import autoencoder as ac @@ -17,7 +15,6 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute - def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. @@ -27,11 +24,11 @@ def traindmpnn(opts: options.GnnOptions) -> None: - None """ # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson(jsonFile = opts.configFile) - opts = cp.args.TrainArgs().parse_args(arguments) + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - mean_score, std_score = cp.train.cross_validate( - args=opts, train_func=cp.train.run_training + mean_score, std_score = chemprop.train.cross_validate( + args=opts, train_func=chemprop.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") @@ -45,10 +42,27 @@ def predictdmpnn(opts: options.GnnOptions) -> None: - None """ # Load options and additional arguments from a JSON file - arguments = createArgsFromJson(jsonFile = opts.configFile) - opts = cp.args.PredictArgs().parse_args(arguments) + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.PredictArgs().parse_args(arguments) + + 
chemprop.train.make_predictions(args=opts) + + +def interpretdmpnn(opts: options.GnnOptions) -> None: + """ + Interpret the predictions of a trained D-MPNN model with the given options. + Args: + - opts: options.GnnOptions instance containing the details of the prediction + Returns: + - None + """ + # Load options and additional arguments from a JSON file + arguments = createArgsFromJson(jsonFile=opts.configFile) + opts = chemprop.args.InterpretArgs().parse_args(arguments) - cp.train.make_predictions(args=opts) + chemprop.interpret.interpret( + args=opts, save_to_csv=True + ) def train(opts: options.Options): @@ -92,15 +106,18 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - if opts.visualizeLatent: + if opts.visualizeLatent and opts.trainAC: ac.visualize_fingerprints( df, - before_col="fp", - after_col="fpcompressed", train_indices=train_indices, test_indices=test_indices, - save_as=f"UMAP_{opts.aeSplitType}.png", + save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png", + ) + elif opts.visualizeLatent: + logging.info( + "Visualizing latent space is only available if you train the autoencoder. Skipping visualization." 
) + # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -200,16 +217,18 @@ def main(): traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": - predictgnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - predictgnn_opts, - test_path=makePathAbsolute(predictgnn_opts.test_path), - preds_path=makePathAbsolute(predictgnn_opts.preds_path), - ) + predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args) createLogger("predictgnn.log") - predictdmpnn(fixed_opts) + predictdmpnn(predictgnn_opts) + elif prog_args.method == "interpretgnn": + interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args) + createLogger("interpretgnn.log") + interpretdmpnn(interpretgnn_opts) elif prog_args.method == "train": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") + train_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( train_opts, @@ -223,6 +242,8 @@ def main(): ) train(fixed_opts) elif prog_args.method == "predict": + if prog_args.configFile is None and prog_args.inputFile is None: + parser.error("Either --configFile or --inputFile must be provided.") predict_opts = options.Options.fromCmdArgs(prog_args) fixed_opts = dataclasses.replace( predict_opts, diff --git a/dfpl/options.py b/dfpl/options.py index d1d657aa..d098bdca 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,13 @@ import argparse from dataclasses import dataclass from pathlib import Path -from typing import Optional +from typing import Optional, Literal, List + import jsonpickle import torch -from chemprop.args import TrainArgs +from chemprop.args import TrainArgs, PredictArgs, InterpretArgs -from dfpl.utils import makePathAbsolute, parseCmdArgs +from dfpl.utils import parseCmdArgs @dataclass @@ -18,12 +19,12 @@ class Options: """ configFile: 
str = None - inputFile: str = "tests/data/smiles.csv" - outputDir: str = "example/results_train/" # changes according to mode - outputFile: str = "results.csv" + inputFile: str = "" + outputDir: str = "" # changes according to mode + outputFile: str = "" ecWeightsFile: str = "" - ecModelDir: str = "example/results_train/AE_encoder/" - fnnModelDir: str = "example/results_train/AR_saved_model/" + ecModelDir: str = "" + fnnModelDir: str = "" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" epochs: int = 100 @@ -57,7 +58,7 @@ class Options: l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - visualizeLatent: bool = False #only if autoencoder is trained or loaded + visualizeLatent: bool = False # only if autoencoder is trained or loaded gpu: int = None aeWabTracking: bool = False # Wand & Biases autoencoder tracking wabTracking: bool = False # Wand & Biases FNN tracking @@ -98,6 +99,99 @@ class GnnOptions(TrainArgs): preds_path: str = "./tox21dmpnn.csv" test_path: str = "" save_preds: bool = True + calibration_method: str = "none" + uncertainty_method: str = "none" + calibration_path: str = "" + evaluation_methods: str = "none" + evaluation_scores_path: str = "" + wabTracking: bool = False + split_sizes: List[float] = None + # save_smiles_splits: bool = False + @classmethod + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class PredictGnnOptions(PredictArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "./example/predictgnn.json" + calibration_atom_descriptors_path: str = None + calibration_features_path: str = None + calibration_interval_percentile: float = 95 + 
calibration_method: Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] = None + calibration_path: str = None + calibration_phase_features_path: str = None + drop_extra_columns: bool = False + dropout_sampling_size: int = 10 + evaluation_methods: List[str] = None + evaluation_scores_path: str = None + # no_features_scaling: bool = True + individual_ensemble_predictions: bool = False + preds_path: str = None + regression_calibrator_metric: Literal["stdev", "interval"] = None + test_path: str = None + uncertainty_dropout_p: float = 0.1 + uncertainty_method: Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + ] = None + + @classmethod + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() + + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) + + return opts + + +class InterpretGNNoptions(InterpretArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + configFile: str = "./example/interpret.json" + data_path: str = "./example/data/smiles.csv" + batch_size: int = 500 + c_puct: float = 10.0 + max_atoms: int = 20 + min_atoms: int = 8 + prop_delta: float = 0.5 + property_id: List[int] = None + rollout: int = 20 @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): @@ -134,6 +228,12 @@ def createCommandlineParser() -> argparse.ArgumentParser: parser_predict_gnn.set_defaults(method="predictgnn") parsePredictGnn(parser_predict_gnn) + parser_interpret_gnn = subparsers.add_parser( + "interpretgnn", help="Interpret your GNN models" + ) + parser_interpret_gnn.set_defaults(method="interpretgnn") + parseInterpretGnn(parser_interpret_gnn) + 
parser_train = subparsers.add_parser( "train", help="Train new models with your data" ) @@ -173,7 +273,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default="example/train.json", + default=argparse.SUPPRESS, ) general_args.add_argument( "-i", @@ -182,7 +282,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default="tests/data/smiles.csv" + default=argparse.SUPPRESS, ) general_args.add_argument( "-o", @@ -191,10 +291,8 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default="example/results_train/" + default=argparse.SUPPRESS, ) - - # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -202,7 +300,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default="fp" + default=argparse.SUPPRESS, ) general_args.add_argument( "-thr", @@ -210,41 +308,47 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=0.5 + default=argparse.SUPPRESS, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use by index. If not available, leave empty", - default=None + help="Select which gpu to use. If not available, leave empty.", + default=argparse.SUPPRESS, ) general_args.add_argument( + "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], - help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", - default="topological" + choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], + help="The type of fingerprint to be generated/used in input file.", + default=argparse.SUPPRESS, ) general_args.add_argument( + "-s", "--fpSize", type=int, - help="Length of the fingerprint that should be generated.", - default=2048 + help="Size of fingerprint that should be generated.", + default=argparse.SUPPRESS, ) general_args.add_argument( + "-c", "--compressFeatures", - action="store_true", - help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", - default=False + metavar="BOOL", + type=bool, + help="Should the fingerprints be compressed or not. Activates the autoencoder. ", + default=argparse.SUPPRESS, ) general_args.add_argument( + "-m", "--enableMultiLabel", - action="store_true", + metavar="BOOL", + type=bool, help="Train multi-label classification model in addition to the individual models.", - default=False + default=argparse.SUPPRESS, ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -253,14 +357,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default="" + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default="example/results_train/AE_encoder/" + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeType", @@ -268,21 +372,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default="deterministic" + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=100 + 
default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=512 + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -290,21 +394,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default="relu" + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=0.001 + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=0.96 + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "--aeSplitType", @@ -312,7 +416,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default="random" + default=argparse.SUPPRESS, ) autoencoder_args.add_argument( "-d", @@ -320,13 +424,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=256 - ) - autoencoder_args.add_argument( - "--visualizeLatent", - action="store_true", - help="UMAP the latent space for exploration", - default=False + default=argparse.SUPPRESS, ) # Training Configuration training_args.add_argument( @@ -335,14 +433,15 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default="random" + default=argparse.SUPPRESS, ) training_args.add_argument( + "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction 
of the dataset that should be used for testing. Value in [0,1].", - default=0.2 + default=argparse.SUPPRESS, ) training_args.add_argument( "-K", @@ -350,7 +449,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=1 + default=argparse.SUPPRESS, ) training_args.add_argument( "-v", @@ -360,19 +459,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. O: No additional output, " + "1: Some additional output, 2: full additional output", - default=2, + default=argparse.SUPPRESS, ) training_args.add_argument( "--trainAC", - action="store_true", + metavar="BOOL", + type=bool, help="Choose to train or not, the autoencoder based on the input file", - default=False, + default=argparse.SUPPRESS, ) training_args.add_argument( "--trainFNN", - action="store_false", - help="When called it deactivates the training.", - default=True, + metavar="BOOL", + type=bool, + help="Train the feedforward network either with provided weights.", + default=argparse.SUPPRESS, ) training_args.add_argument( "--sampleFractionOnes", @@ -380,14 +481,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." 
"only works if --sampleDown is enabled", - default=0.5, + default=argparse.SUPPRESS, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=False, + default=argparse.SUPPRESS, ) training_args.add_argument( "-e", @@ -395,60 +496,52 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=100, + default=argparse.SUPPRESS, ) - # TODO CHECK IF ALL LOSSES MAKE SENSE HERE + training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default="bce", + default=argparse.SUPPRESS, ) - # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', - default="Adam", + default=argparse.SUPPRESS, ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=128, + default=argparse.SUPPRESS, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=0.001, + default=argparse.SUPPRESS, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=0.2, + default=argparse.SUPPRESS, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=0.000022, - ) - training_args.add_argument( - "--learningRateDecay", - metavar="FLOAT", - type=float, - help="Learning rate size in FNN training.", - default=0.96, + default=argparse.SUPPRESS, ) training_args.add_argument( "--activationFunction", @@ -456,7 +549,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default="relu", + default=argparse.SUPPRESS, ) # Tracking Configuration tracking_args.add_argument( @@ -464,14 +557,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=False, + default=argparse.SUPPRESS, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=False, + default=argparse.SUPPRESS, ) tracking_args.add_argument( "--wabTarget", @@ -479,116 +572,56 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default="AR", + 
default=argparse.SUPPRESS, ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - +def parseTrainGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") + data_args = parser.add_argument_group("Data Configuration") files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", + model_args = parser.add_argument_group("Model arguments") + training_args = parser.add_argument_group("Training Configuration") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + uncertainty_args.add_argument( + "--uncertainty_method", type=str, - help="Input JSON file that contains all information for training/predicting." + metavar="STRING", + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "dirichlet", + ], + help="Method to use for uncertainty estimation", + default="none", ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", + # Uncertainty arguments + uncertainty_args.add_argument( + "--calibration_method", type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." 
- "A header is expected and respective column names are used.", - default="tests/data/smiles.csv", + metavar="STRING", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Method to use for calibration", + default="none", ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", + uncertainty_args.add_argument( + "--calibration_path", type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default="example/results_predict/", - ) - files_args.add_argument( - "--outputFile", metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default="results.csv", - ) - # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? - general_args.add_argument( - "-t", - "--type", - metavar="STR", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default="fp", + help="Path to file with calibration data", ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", - type=str, - choices=["topological", "MACCS"], - help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", - default="topological", - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default="", - ) - files_args.add_argument( - "--ecWeightsFile", - type=str, - metavar="STR", - help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. 
Provide a full path here.", - default="", - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default="example/results_train/AR_saved_model", - ) - general_args.add_argument( - "-c", "--compressFeatures", action="store_true", default=False - ) - (general_args.add_argument( - "--aeType", metavar="STRING", type=str, - choices=["variational", "deterministic"], - help="Autoencoder type, variational or deterministic.", - default="deterministic")) - - -def parseTrainGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") - files_args = parser.add_argument_group("Files") - model_args = parser.add_argument_group("Model arguments") - training_args = parser.add_argument_group("Training Configuration") # General arguments general_args.add_argument("--split_key_molecule", type=int) @@ -630,6 +663,9 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) + general_args.add_argument( + "--no_cuda", action="store_true", default=True, help="Turn off cuda" + ) general_args.add_argument( "--no_cache", action="store_true", @@ -1088,51 +1124,147 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") files_args = parser.add_argument_group("Files") - training_args = parser.add_argument_group("Training Configuration") + uncertainty_args = parser.add_argument_group("Uncertainty Configuration") + + general_args.add_argument( + "--checkpoint_path", + type=str, + metavar="FILE", + help="Path to model checkpoint (.pt file)" + ) + # general_args.add_argument( + 
# "--no_features_scaling", + # action="store_true", + # help="Turn on scaling of features", + # ) files_args.add_argument( "-f", "--configFile", + type=str, metavar="FILE", + help="Path to a .json file containing arguments. Any arguments present in the config" + "file will override arguments specified via the command line or by the defaults.", + ) + files_args.add_argument( + "--test_path", type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + help="Path to CSV file containing testing data for which predictions will be made.", ) - general_args.add_argument( - "--gpu", - type=int, - metavar="INT", - choices=list(range(torch.cuda.device_count())), - help="Which GPU to use", + files_args.add_argument( + "--preds_path", + type=str, + help="Path to CSV or PICKLE file where predictions will be saved.", ) - general_args.add_argument( - "--num_workers", - type=int, - metavar="INT", - help="Number of workers for the parallel data loading 0 means sequential", + files_args.add_argument( + "--calibration_path", + type=str, + help="Path to data file to be used for uncertainty calibration.", ) - general_args.add_argument( - "--no_cache", - type=bool, - metavar="BOOL", - default=False, - help="Turn off caching mol2graph computation", + files_args.add_argument( + "--calibration_features_path", + type=str, + nargs="+", + help="Path to features data to be used with the uncertainty calibration dataset.", + ) + files_args.add_argument("--calibration_phase_features_path", type=str, help="") + files_args.add_argument( + "--calibration_atom_descriptors_path", + type=str, + help="Path to the extra atom descriptors.", ) + files_args.add_argument( + "--calibration_bond_descriptors_path", + type=str, + help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", + ) + general_args.add_argument( - "--no_cache_mol", - type=bool, - metavar="BOOL", + "--drop_extra_columns", + 
action="store_true", + help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.", + ) + + uncertainty_args.add_argument( + "--uncertainty_method", + type=str, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "spectra_roundrobin", + "dirichlet", + ], + help="The method of calculating uncertainty.", + ) + uncertainty_args.add_argument( + "--calibration_method", + type=str, + nargs="+", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Methods used for calibrating the uncertainty calculated with uncertainty method.", + ) + uncertainty_args.add_argument("--individual_ensemble_predictions", + action="store_true", default=False, - help="Whether to not cache the RDKit molecule for each SMILES string to reduce memory\ - usage cached by default", + help="Whether to save individual ensemble predictions.") + uncertainty_args.add_argument( + "--evaluation_methods", + type=str, + nargs="+", + help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", ) - general_args.add_argument( - "--empty_cache", - type=bool, - metavar="BOOL", - help="Whether to empty all caches before training or predicting. 
This is necessary if\ - multiple jobs are run within a single script and the atom or bond features change", + uncertainty_args.add_argument( + "--evaluation_scores_path", + type=str, + help="Location to save the results of uncertainty evaluations.", + ) + uncertainty_args.add_argument( + "--uncertainty_dropout_p", + type=float, + default=0.1, + help="The probability to use for Monte Carlo dropout uncertainty estimation.", + ) + uncertainty_args.add_argument( + "--dropout_sampling_size", + type=int, + default=10, + help="The number of samples to use for Monte Carlo dropout uncertainty estimation. Distinct from the dropout used during training.", + ) + uncertainty_args.add_argument( + "--calibration_interval_percentile", + type=float, + default=95, + help="Sets the percentile used in the calibration methods. Must be in the range (1,100).", + ) + uncertainty_args.add_argument( + "--regression_calibrator_metric", + type=str, + choices=["stdev", "interval"], + help="Regression calibrators can output either a stdev or an inverval.", + ) + + +def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: + files_args = parser.add_argument_group("Files") + interpret_args = parser.add_argument_group("Interpretation Configuration") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for interpretation.", ) files_args.add_argument( "--preds_path", @@ -1167,77 +1299,39 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="Path to CSV file containing testing data for which predictions will be made", - default="", - ) - files_args.add_argument( - "--test_path", - type=str, - metavar="FILE", - help="Path to CSV file containing testing data for which predictions will be made", - default="", - ) - files_args.add_argument( - "--features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to features to use in FNN (instead of 
features_generator)", - ) - files_args.add_argument( - "--atom_descriptors_path", - type=str, - metavar="FILE", - help="Path to the extra atom descriptors.", - ) - data_args.add_argument( - "--use_compound_names", - action="store_true", - default=False, - help="Use when test data file contains compound names in addition to SMILES strings", - ) - data_args.add_argument( - "--no_features_scaling", - action="store_true", - default=False, - help="Turn off scaling of features", ) - data_args.add_argument( - "--max_data_size", + interpret_args.add_argument( + "--max_atoms", type=int, metavar="INT", - help="Maximum number of data points to load", - ) - data_args.add_argument( - "--smiles_columns", - type=str, - metavar="STRING", - help="List of names of the columns containing SMILES strings.By default, uses the first\ - number_of_molecules columns.", + help="Maximum number of atoms to use for interpretation", ) - data_args.add_argument( - "--number_of_molecules", + + interpret_args.add_argument( + "--min_atoms", type=int, metavar="INT", - help="Number of molecules in each input to the model.This must equal the length of\ - smiles_columns if not None", + help="Minimum number of atoms to use for interpretation", ) - data_args.add_argument( - "--atom_descriptors", - type=bool, - metavar="Bool", - help="Use or not atom descriptors", + interpret_args.add_argument( + "--prop_delta", + type=float, + metavar="FLOAT", + help="The minimum change in the property of interest that is considered significant", ) - - data_args.add_argument( - "--bond_features_size", + interpret_args.add_argument( + "--property_id", type=int, metavar="INT", - help="Size of the extra bond descriptors that will be used as bond features to featurize a\ - given molecule", + help="The index of the property of interest", ) - training_args.add_argument( - "--batch_size", type=int, metavar="INT", default=50, help="Batch size" + # write the argument for rollouts + interpret_args.add_argument( + "--rollout", + 
type=int, + metavar="INT", + help="The number of rollouts to use for interpretation", ) diff --git a/dfpl/utils.py b/dfpl/utils.py index 15fd018b..1aced706 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,18 +1,15 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict -from random import Random -from typing import Dict, List, Set, Tuple, Union, Type, TypeVar, Any - -# Define a type variable - from pathlib import Path -import argparse +from random import Random +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union import jsonpickle -import sys import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -20,6 +17,9 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") T = TypeVar("T") @@ -36,7 +36,7 @@ def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: An instance of cls populated with values from the command-line arguments. 
""" # Extract argument flags from sys.argv - arg_flags = {arg.lstrip('-') for arg in sys.argv if arg.startswith('-')} + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} # Create the result instance, which will be modified and returned result = cls() @@ -60,6 +60,7 @@ def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: return result + def makePathAbsolute(p: str) -> str: path = pathlib.Path(p) if path.is_absolute(): @@ -73,38 +74,50 @@ def createDirectory(directory: str): if not os.path.exists(path): os.makedirs(path) +def parse_cli_list(value: str): + # Simple parser for lists passed as comma-separated values + return value.split(',') + +def parse_cli_boolean(cli_args, cli_arg_key): + # Determines boolean value based on command line presence + if cli_arg_key in cli_args: + return True # Presence of flag implies True + return False def createArgsFromJson(jsonFile: str): arguments = [] ignore_elements = ["py/object"] + cli_args = sys.argv[1:] # Skipping the script name itself with open(jsonFile, "r") as f: data = json.load(f) - # Check each key in the JSON file against command-line arguments + processed_cli_keys = [] # To track which CLI keys have been processed + for key, value in data.items(): if key not in ignore_elements: - # Prepare the command-line argument format cli_arg_key = f"--{key}" - - # Check if this argument is provided in the command line - if cli_arg_key in sys.argv: - # Find the index of the argument in sys.argv and get its value - arg_index = sys.argv.index(cli_arg_key) + 1 - if arg_index < len(sys.argv): - cli_value = sys.argv[arg_index] - value = cli_value # Override JSON value with command-line value - - # Append the argument and its value to the list - if key == "extra_metrics" and isinstance(value, list): + if cli_arg_key in cli_args: + processed_cli_keys.append(cli_arg_key) + arg_index = cli_args.index(cli_arg_key) + 1 + if isinstance(value, bool): + value = parse_cli_boolean(cli_args, cli_arg_key) + elif 
arg_index < len(cli_args): + cli_value = cli_args[arg_index] + if isinstance(value, list): + value = parse_cli_list(cli_value) + else: + value = cli_value # Override JSON value with command-line value + if isinstance(value, bool) and value: arguments.append(cli_arg_key) - arguments.extend(value) + elif isinstance(value, list): + arguments.append(cli_arg_key) + arguments.extend(map(str, value)) # Ensure all elements are strings else: arguments.extend([cli_arg_key, str(value)]) return arguments - def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): """ Builds an RDKit molecule from a SMILES string. @@ -133,49 +146,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): return mol -def generate_scaffold( - mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True -) -> str: - """ - Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. - - :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. - :param include_chirality: Whether to include chirality in the computed scaffold. - :return: The Bemis-Murcko scaffold for the molecule. - """ - if isinstance(mol, str): - if mol.startswith("InChI="): - mol = inchi_to_mol(mol) - else: - mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) - elif isinstance(mol, tuple): - mol = mol[0] - scaffold = MurckoScaffold.MurckoScaffoldSmiles( - mol=mol, includeChirality=include_chirality - ) - - return scaffold - - -def scaffold_to_smiles( - mols: List[str], use_indices: bool = False -) -> Dict[str, Union[Set[str], Set[int]]]: - """ - Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). - :param mols: A list of SMILES. - :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than - mapping to the smiles string itself. This is necessary if there are duplicate smiles. 
- :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. - """ - scaffolds = defaultdict(set) - for i, mol in tqdm(enumerate(mols), total=len(mols)): - scaffold = generate_scaffold(mol) - if use_indices: - scaffolds[scaffold].add(i) - else: - scaffolds[scaffold].add(mol) - - return scaffolds # def inchi_to_mol(inchi: str) -> Chem.Mol: @@ -241,7 +211,49 @@ def weight_split( test_df = sorted_data.iloc[test_indices].reset_index(drop=True) return train_df, val_df, test_df +def generate_scaffold( + mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True +) -> str: + """ + Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. + + :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. + :param include_chirality: Whether to include chirality in the computed scaffold. + :return: The Bemis-Murcko scaffold for the molecule. + """ + if isinstance(mol, str): + if mol.startswith("InChI="): + mol = inchi_to_mol(mol) + else: + mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) + elif isinstance(mol, tuple): + mol = mol[0] + scaffold = MurckoScaffold.MurckoScaffoldSmiles( + mol=mol, includeChirality=include_chirality + ) + + return scaffold + + +def scaffold_to_smiles( + mols: List[str], use_indices: bool = False +) -> Dict[str, Union[Set[str], Set[int]]]: + """ + Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). + :param mols: A list of SMILES. + :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than + mapping to the smiles string itself. This is necessary if there are duplicate smiles. + :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. 
+ """ + scaffolds = defaultdict(set) + for i, mol in tqdm(enumerate(mols), total=len(mols)): + scaffold = generate_scaffold(mol) + if use_indices: + scaffolds[scaffold].add(i) + else: + scaffolds[scaffold].add(mol) + return scaffolds def ae_scaffold_split( data: pd.DataFrame, @@ -366,7 +378,7 @@ def log_scaffold_stats( targets = [ c for c in data.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] + if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",] ] # targets = data_set.iloc[:, 2:].values targets = data_set.loc[:, targets].values From ac0db5d9e23c7c3623b31e732094ae9736b9925e Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 9 Apr 2024 13:11:29 +0200 Subject: [PATCH 21/48] rename some variables needed because of the specifics of the Galaxy tool generator --- dfpl/options.py | 173 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 135 insertions(+), 38 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index d098bdca..e6ea2f46 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -293,6 +293,8 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "respective stats will be returned in this directory.", default=argparse.SUPPRESS, ) + + # TODO CHECK WHAT IS TYPE DOING? 
general_args.add_argument( "-t", "--type", @@ -319,7 +321,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=argparse.SUPPRESS, ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, @@ -328,14 +329,12 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=argparse.SUPPRESS, ) general_args.add_argument( - "-s", "--fpSize", type=int, help="Size of fingerprint that should be generated.", default=argparse.SUPPRESS, ) general_args.add_argument( - "-c", "--compressFeatures", metavar="BOOL", type=bool, @@ -343,7 +342,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=argparse.SUPPRESS, ) general_args.add_argument( - "-m", "--enableMultiLabel", metavar="BOOL", type=bool, @@ -424,7 +422,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256, + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, ) # Training Configuration training_args.add_argument( @@ -436,7 +440,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=argparse.SUPPRESS, ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, @@ -463,17 +466,15 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", default=argparse.SUPPRESS, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=argparse.SUPPRESS ) training_args.add_argument( "--sampleFractionOnes", @@ -498,7 +499,7 @@ def 
parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Number of epochs that should be used for the FNN training", default=argparse.SUPPRESS, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", @@ -541,7 +542,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -572,7 +580,112 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting.", + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. 
If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", + ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. 
" + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + ( + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) ) @@ -628,15 +741,16 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: general_args.add_argument("--pytorch_seed", type=int) general_args.add_argument("--cache_cutoff", type=float) general_args.add_argument("--save_preds", type=bool) + general_args.add_argument("--wabTracking", action="store_true", default=False) general_args.add_argument( "--cuda", action="store_true", default=False, help="Turn on cuda" ) - general_args.add_argument( - "--save_smiles_splits", - action="store_true", - default=False, - help="Save smiles for each train/val/test splits for prediction convenience later", - ) + # general_args.add_argument( + # "--save_smiles_splits", + # action="store_true", + # default=False, + # help="Save smiles for each train/val/test splits for prediction convenience later", + # ) general_args.add_argument( "--test", action="store_true", @@ -663,9 +777,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -681,13 +792,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: type=str, help="Input JSON file that contains all information for training/predicting.", ) - files_args.add_argument( - "--config_path", - type=str, - metavar="FILE", - help="Path to a .json file containing arguments. 
Any arguments present in the config" - "file will override arguments specified via the command line or by the defaults.", - ) files_args.add_argument( "--save_dir", type=str, @@ -1287,13 +1391,6 @@ def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: metavar="DIR", help="Path to model checkpoint (.pt file)", ) - files_args.add_argument( - "--checkpoint_paths", - type=str, - metavar="FILE", - nargs="*", - help="Path to model checkpoint (.pt file)", - ) files_args.add_argument( "--data_path", type=str, From 13a162634f7ecc6041c0b83fece41004d08e20ed Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 14:39:51 +0100 Subject: [PATCH 22/48] flaked and fixed predictgnn arg --- dfpl/__main__.py | 4 +--- dfpl/options.py | 13 ++++++++----- dfpl/utils.py | 22 ++++++++++++++++------ example/predictgnn.json | 6 ++++-- 4 files changed, 29 insertions(+), 16 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 2527bb11..39157577 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -60,9 +60,7 @@ def interpretdmpnn(opts: options.GnnOptions) -> None: arguments = createArgsFromJson(jsonFile=opts.configFile) opts = chemprop.args.InterpretArgs().parse_args(arguments) - chemprop.interpret.interpret( - args=opts, save_to_csv=True - ) + chemprop.interpret.interpret(args=opts, save_to_csv=True) def train(opts: options.Options): diff --git a/dfpl/options.py b/dfpl/options.py index e6ea2f46..f958c62e 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,11 +3,11 @@ import argparse from dataclasses import dataclass from pathlib import Path -from typing import Optional, Literal, List +from typing import List, Literal, Optional import jsonpickle import torch -from chemprop.args import TrainArgs, PredictArgs, InterpretArgs +from chemprop.args import InterpretArgs, PredictArgs, TrainArgs from dfpl.utils import parseCmdArgs @@ -107,6 +107,7 @@ class GnnOptions(TrainArgs): wabTracking: bool = False split_sizes: List[float] = None # 
save_smiles_splits: bool = False + @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): # Initialize with JSON config if provided @@ -1235,7 +1236,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--checkpoint_path", type=str, metavar="FILE", - help="Path to model checkpoint (.pt file)" + help="Path to model checkpoint (.pt file)", ) # general_args.add_argument( # "--no_features_scaling", @@ -1319,10 +1320,12 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: ], help="Methods used for calibrating the uncertainty calculated with uncertainty method.", ) - uncertainty_args.add_argument("--individual_ensemble_predictions", + uncertainty_args.add_argument( + "--individual_ensemble_predictions", action="store_true", default=False, - help="Whether to save individual ensemble predictions.") + help="Whether to save individual ensemble predictions.", + ) uncertainty_args.add_argument( "--evaluation_methods", type=str, diff --git a/dfpl/utils.py b/dfpl/utils.py index 1aced706..506b8f54 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -74,9 +74,11 @@ def createDirectory(directory: str): if not os.path.exists(path): os.makedirs(path) + def parse_cli_list(value: str): # Simple parser for lists passed as comma-separated values - return value.split(',') + return value.split(",") + def parse_cli_boolean(cli_args, cli_arg_key): # Determines boolean value based on command line presence @@ -84,6 +86,7 @@ def parse_cli_boolean(cli_args, cli_arg_key): return True # Presence of flag implies True return False + def createArgsFromJson(jsonFile: str): arguments = [] ignore_elements = ["py/object"] @@ -118,6 +121,7 @@ def createArgsFromJson(jsonFile: str): return arguments + def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): """ Builds an RDKit molecule from a SMILES string. 
@@ -146,10 +150,6 @@ def make_mol(s: str, keep_h: bool, add_h: bool, keep_atom_map: bool): return mol - - -# def inchi_to_mol(inchi: str) -> Chem.Mol: -# return Chem.inchi.MolFromInchi(inchi) def smiles_to_mol(smiles: str) -> Chem.Mol: mol = Chem.MolFromSmiles(smiles) if mol is None: @@ -211,6 +211,8 @@ def weight_split( test_df = sorted_data.iloc[test_indices].reset_index(drop=True) return train_df, val_df, test_df + + def generate_scaffold( mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True ) -> str: @@ -255,6 +257,7 @@ def scaffold_to_smiles( return scaffolds + def ae_scaffold_split( data: pd.DataFrame, sizes: Tuple[float, float, float] = (0.8, 0, 0.2), @@ -378,7 +381,14 @@ def log_scaffold_stats( targets = [ c for c in data.columns - if c not in ["fp", "morganfp", "fpcompressed", "id", "smiles",] + if c + not in [ + "fp", + "morganfp", + "fpcompressed", + "id", + "smiles", + ] ] # targets = data_set.iloc[:, 2:].values targets = data_set.loc[:, targets].values diff --git a/example/predictgnn.json b/example/predictgnn.json index 813cf0c5..221622de 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,8 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "preds_path": "example/results_gnn.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" + "preds_path": "preds.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", + "save_dir": "preds_dmpnn", + "saving_name": "DMPNN_preds.csv" } \ No newline at end of file From c3a5da25d086cce18931052614ab0dbc88351a90 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 15:29:45 +0100 Subject: [PATCH 23/48] add json --- example/predictgnn.json | 2 -- 1 file changed, 2 deletions(-) diff --git a/example/predictgnn.json b/example/predictgnn.json index 221622de..c76aa96c 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -3,6 +3,4 @@ "test_path": "tests/data/smiles.csv", 
"preds_path": "preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" } \ No newline at end of file From 2577f101398af8e0b1ebf34eb2d73aa1bdc0c104 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 15:40:21 +0100 Subject: [PATCH 24/48] remove comma --- example/predictgnn.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/predictgnn.json b/example/predictgnn.json index c76aa96c..1055230f 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -2,5 +2,5 @@ "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", "preds_path": "preds.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file From 96f59b47d6f06406add5d92a4294978748972ecf Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 16:28:19 +0100 Subject: [PATCH 25/48] final fix --- .github/workflows/pr.yml | 5 +++++ example/predictgnn.json | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 56df68fc..29c2e723 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -101,3 +101,8 @@ jobs: fi dfpl convert -f tests/data + if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + echo "not all csv files are converted to pickle ones" >&2 + exit 1 + fi + echo "All tests passed!" 
\ No newline at end of file diff --git a/example/predictgnn.json b/example/predictgnn.json index 1055230f..dfdd6a8d 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "preds_path": "preds.csv", + "preds_path": "preds_dmpnn/preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file From 01942ba5335c378bacf30966cafacb8609b1eb5c Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 18:06:01 +0100 Subject: [PATCH 26/48] final fix --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 29c2e723..80e5d90c 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -101,7 +101,7 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data -name '*.csv' | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then echo "not all csv files are converted to pickle ones" >&2 exit 1 fi From 056110edcee9198b47405a4ded8db64e69c97675 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Mar 2024 18:18:11 +0100 Subject: [PATCH 27/48] final fix --- .github/workflows/pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 80e5d90c..abe214cf 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -101,7 +101,7 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data \(-name '*.csv'\ -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then echo "not all csv files are converted to pickle ones" >&2 exit 1 fi From 
630f6d129ec7221a954514e3828aa5d1607a7279 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 7 Mar 2024 10:10:02 +0100 Subject: [PATCH 28/48] convert fix --- .github/workflows/pr.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index abe214cf..43512abf 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -101,8 +101,8 @@ jobs: fi dfpl convert -f tests/data - if [ "$(find tests/data \(-name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then - echo "not all csv files are converted to pickle ones" >&2 - exit 1 + if [ "$(find tests/data \( -name '*.csv' -o -name '*.tsv' \) | wc -l)" -ne "$(find tests/data -name '*.pkl' | wc -l)" ]; then + echo "not all csv files are converted to pickle ones" >&2 + exit 1 fi echo "All tests passed!" \ No newline at end of file From 4f418cc51353c7ae516328a9b7356050e8a9d0fd Mon Sep 17 00:00:00 2001 From: soulios <90351285+soulios@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:18:18 +0100 Subject: [PATCH 29/48] Update dfpl/options.py Co-authored-by: M Bernt --- dfpl/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfpl/options.py b/dfpl/options.py index f958c62e..35c315f5 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -1359,7 +1359,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--regression_calibrator_metric", type=str, choices=["stdev", "interval"], - help="Regression calibrators can output either a stdev or an inverval.", + help="Regression calibrator output metric. 
Regression calibrators can output either a stdev or an inverval.", ) From dd34bca5100cc985324d60e239cdb25e56410d4b Mon Sep 17 00:00:00 2001 From: soulios <90351285+soulios@users.noreply.github.com> Date: Fri, 8 Mar 2024 10:30:24 +0100 Subject: [PATCH 30/48] Apply suggestions from code review Co-authored-by: M Bernt --- dfpl/options.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 35c315f5..ae9c9b23 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -1259,35 +1259,35 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: files_args.add_argument( "--preds_path", type=str, - help="Path to CSV or PICKLE file where predictions will be saved.", + help="Predictions output file. CSV or PICKLE file where predictions will be saved.", ) files_args.add_argument( "--calibration_path", type=str, - help="Path to data file to be used for uncertainty calibration.", + help="Data file to be used for uncertainty calibration.", ) files_args.add_argument( "--calibration_features_path", type=str, nargs="+", - help="Path to features data to be used with the uncertainty calibration dataset.", + help="Feature data file to be used with the uncertainty calibration dataset.", ) files_args.add_argument("--calibration_phase_features_path", type=str, help="") files_args.add_argument( "--calibration_atom_descriptors_path", type=str, - help="Path to the extra atom descriptors.", + help="Extra atom descriptors file.", ) files_args.add_argument( "--calibration_bond_descriptors_path", type=str, - help="Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", + help="Extra bond descriptors file. 
Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", ) general_args.add_argument( "--drop_extra_columns", action="store_true", - help="Whether to drop all columns from the test data file besides the SMILES columns and the new prediction columns.", + help="Keep only SMILES and new prediction columns in the test data files.", ) uncertainty_args.add_argument( @@ -1324,13 +1324,13 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--individual_ensemble_predictions", action="store_true", default=False, - help="Whether to save individual ensemble predictions.", + help="Save individual ensemble predictions.", ) uncertainty_args.add_argument( "--evaluation_methods", type=str, nargs="+", - help="The methods used for evaluating the uncertainty performance if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", + help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", ) uncertainty_args.add_argument( "--evaluation_scores_path", @@ -1353,7 +1353,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: "--calibration_interval_percentile", type=float, default=95, - help="Sets the percentile used in the calibration methods. Must be in the range (1,100).", + help="Percentile used in calibration methods. 
Must be in the range (1,100).", ) uncertainty_args.add_argument( "--regression_calibrator_metric", From 83361ee4463af49184745b18a39873e332f7c323 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Fri, 8 Mar 2024 11:51:20 +0100 Subject: [PATCH 31/48] edited help in args --- dfpl/options.py | 54 ++++++++++++++++++++++++------------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index ae9c9b23..df6a5495 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -19,12 +19,12 @@ class Options: """ configFile: str = None - inputFile: str = "" - outputDir: str = "" # changes according to mode - outputFile: str = "" + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" ecWeightsFile: str = "" - ecModelDir: str = "" - fnnModelDir: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" epochs: int = 100 @@ -274,7 +274,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -283,7 +283,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -292,7 +292,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. 
Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/", ) # TODO CHECK WHAT IS TYPE DOING? @@ -303,7 +303,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp", ) general_args.add_argument( "-thr", @@ -311,29 +311,29 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None, ) general_args.add_argument( "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", + default="topological", ) general_args.add_argument( "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048, ) general_args.add_argument( "--compressFeatures", @@ -356,7 +356,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="", ) autoencoder_args.add_argument( "--ecModelDir", @@ -371,21 +371,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -400,14 +400,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", @@ -463,7 +463,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. 
O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", @@ -522,21 +522,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", From d1656441b3538ef16058a4e0905c84c34e03214e Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:37:18 +0100 Subject: [PATCH 32/48] removed metavar from args with choices --- dfpl/options.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index df6a5495..4415ef6d 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -299,7 +299,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-t", "--type", - metavar="STRING", type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", @@ -323,7 +322,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) general_args.add_argument( "--fpType", - metavar="STR", type=str, choices=["topological", "MACCS"], help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", @@ -367,7 +365,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeType", - metavar="STRING", type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", @@ -389,7 +386,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeActivationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", @@ -411,7 +407,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) autoencoder_args.add_argument( "--aeSplitType", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", @@ -434,7 +429,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: # Training Configuration training_args.add_argument( "--split_type", - metavar="STRING", type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", @@ -458,7 +452,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "-v", "--verbose", - metavar="INT", type=int, choices=[0, 1, 2], help="Verbosity level. O: No additional output, " @@ -503,7 +496,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", - metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", @@ -511,7 +503,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) training_args.add_argument( "--optimizer", - metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', @@ -554,7 +545,6 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: ) training_args.add_argument( "--activationFunction", - metavar="STRING", type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", @@ -637,7 +627,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-t", "--type", - metavar="STR", type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", @@ -646,7 +635,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-k", "--fpType", - metavar="STR", type=str, choices=["topological", "MACCS"], help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", @@ -681,7 +669,6 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: ( general_args.add_argument( "--aeType", - metavar="STRING", type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", @@ -700,7 +687,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: uncertainty_args.add_argument( "--uncertainty_method", type=str, - metavar="STRING", choices=[ "mve", "ensemble", @@ -718,7 +704,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: uncertainty_args.add_argument( "--calibration_method", type=str, - metavar="STRING", choices=[ "zscaling", "tscaling", @@ -940,7 +925,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--dataset_type", type=str, - metavar="STRING", choices=["classification", "regression", "multiclass"], help="Type of dataset, e.g. classification or regression." 
"This determines the loss function used during training.", @@ -956,7 +940,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: data_args.add_argument( "--split_type", type=str, - metavar="STRING", default="random", choices=[ "random", @@ -1064,7 +1047,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: model_args.add_argument( "--activation", type=str, - metavar="STRING", default="ReLU", choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], help="Activation function", @@ -1183,7 +1165,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--loss_function", type=str, - metavar="STRING", choices=[ "mse", "bounded_mse", @@ -1201,7 +1182,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--metric", type=str, - metavar="STRING", default=None, choices=[ "auc", From 3d5ae80affda3f89bb13e1397fa752c2a65d8441 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:42:55 +0100 Subject: [PATCH 33/48] make literals optionals for None --- dfpl/options.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 4415ef6d..780f2582 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -133,14 +133,14 @@ class PredictGnnOptions(PredictArgs): calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 - calibration_method: Literal[ + calibration_method: Optional[Literal[ "zscaling", "tscaling", "zelikman_interval", "mve_weighting", "platt", "isotonic", - ] = None + ]] = None calibration_path: str = None calibration_phase_features_path: str = None drop_extra_columns: bool = False @@ -150,10 +150,10 @@ class PredictGnnOptions(PredictArgs): # no_features_scaling: bool = True individual_ensemble_predictions: bool = False preds_path: str = None - regression_calibrator_metric: Literal["stdev", "interval"] = None + 
regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None test_path: str = None uncertainty_dropout_p: float = 0.1 - uncertainty_method: Literal[ + uncertainty_method: Optional[Literal[ "mve", "ensemble", "evidential_epistemic", @@ -161,7 +161,7 @@ class PredictGnnOptions(PredictArgs): "evidential_total", "classification", "dropout", - ] = None + ]] = None @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): From e2ceb2880a1ce97e5bcfd142eed6c50a42a16e81 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Mon, 11 Mar 2024 13:50:35 +0100 Subject: [PATCH 34/48] applied black --- dfpl/options.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 780f2582..819d45ff 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -133,14 +133,16 @@ class PredictGnnOptions(PredictArgs): calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 - calibration_method: Optional[Literal[ - "zscaling", - "tscaling", - "zelikman_interval", - "mve_weighting", - "platt", - "isotonic", - ]] = None + calibration_method: Optional[ + Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] + ] = None calibration_path: str = None calibration_phase_features_path: str = None drop_extra_columns: bool = False @@ -153,15 +155,17 @@ class PredictGnnOptions(PredictArgs): regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None test_path: str = None uncertainty_dropout_p: float = 0.1 - uncertainty_method: Optional[Literal[ - "mve", - "ensemble", - "evidential_epistemic", - "evidential_aleatoric", - "evidential_total", - "classification", - "dropout", - ]] = None + uncertainty_method: Optional[ + Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + 
"classification", + "dropout", + ] + ] = None @classmethod def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): From f748c2ffe76ce34b606073e7028f2c743ac9fcc6 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 12:38:02 +0200 Subject: [PATCH 35/48] removed paths from default args and fixed creating args from json and cmd lin --- dfpl/options.py | 24 ++++++++++++------------ dfpl/utils.py | 26 ++++++++++++++++++++++---- 2 files changed, 34 insertions(+), 16 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index 819d45ff..5e3519f6 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -19,12 +19,12 @@ class Options: """ configFile: str = None - inputFile: str = "tests/data/smiles.csv" - outputDir: str = "example/results_train/" # changes according to mode - outputFile: str = "results.csv" + inputFile: str = "" + outputDir: str = "" # changes according to mode + outputFile: str = "" ecWeightsFile: str = "" - ecModelDir: str = "example/results_train/AE_encoder/" - fnnModelDir: str = "example/results_train/AR_saved_model/" + ecModelDir: str = "" + fnnModelDir: str = "" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" epochs: int = 100 @@ -85,8 +85,8 @@ class GnnOptions(TrainArgs): total_epochs: int = 30 save: bool = True - configFile: str = "./example/traingnn.json" - data_path: str = "./example/data/tox21.csv" + configFile: str = "" + data_path: str = "" use_compound_names: bool = False save_dir: str = "" no_cache: bool = False @@ -96,13 +96,13 @@ class GnnOptions(TrainArgs): num_lrs: int = 2 minimize_score: bool = False num_tasks: int = 12 - preds_path: str = "./tox21dmpnn.csv" + preds_path: str = "" test_path: str = "" save_preds: bool = True - calibration_method: str = "none" - uncertainty_method: str = "none" + calibration_method: str = "" + uncertainty_method: str = "" calibration_path: str = "" - evaluation_methods: str = "none" + evaluation_methods: str = "" 
evaluation_scores_path: str = "" wabTracking: bool = False split_sizes: List[float] = None @@ -129,7 +129,7 @@ class PredictGnnOptions(PredictArgs): Dataclass to hold all options used for training the graph models """ - configFile: str = "./example/predictgnn.json" + configFile: str = "" calibration_atom_descriptors_path: str = None calibration_features_path: str = None calibration_interval_percentile: float = 95 diff --git a/dfpl/utils.py b/dfpl/utils.py index 506b8f54..f3019084 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -87,7 +87,7 @@ def parse_cli_boolean(cli_args, cli_arg_key): return False -def createArgsFromJson(jsonFile: str): +def createArgsFromJson(jsonFile: str) -> List[str]: arguments = [] ignore_elements = ["py/object"] cli_args = sys.argv[1:] # Skipping the script name itself @@ -105,19 +105,37 @@ def createArgsFromJson(jsonFile: str): arg_index = cli_args.index(cli_arg_key) + 1 if isinstance(value, bool): value = parse_cli_boolean(cli_args, cli_arg_key) - elif arg_index < len(cli_args): + elif arg_index < len(cli_args) and not cli_args[arg_index].startswith('--'): cli_value = cli_args[arg_index] if isinstance(value, list): value = parse_cli_list(cli_value) else: value = cli_value # Override JSON value with command-line value - if isinstance(value, bool) and value: - arguments.append(cli_arg_key) + if isinstance(value, bool): + if value: + arguments.append(cli_arg_key) elif isinstance(value, list): arguments.append(cli_arg_key) arguments.extend(map(str, value)) # Ensure all elements are strings else: arguments.extend([cli_arg_key, str(value)]) + i = 0 + while i < len(cli_args): + arg = cli_args[i] + if arg.startswith("--"): + key = arg.lstrip("--") + if key not in data: + value = True if i + 1 >= len(cli_args) or cli_args[i + 1].startswith("--") else cli_args[i + 1] + if isinstance(value, bool): + if value: + arguments.append(arg) + else: + arguments.extend([arg, str(value)]) + i += 1 if isinstance(value, bool) else 2 + else: + i += 1 + else: + 
i += 1 return arguments From 51453b5993f6d33b37b54b205583c59645ed36c1 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 13:44:41 +0200 Subject: [PATCH 36/48] rebased argparse --- dfpl/__main__.py | 1 + dfpl/callbacks.py | 38 +++++++++++------------ dfpl/options.py | 4 +-- dfpl/utils.py | 10 ++++-- dfpl/vae.py | 77 ++++++++++++++++++++++++++++++++++++---------- example/train.json | 4 +-- 6 files changed, 92 insertions(+), 42 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 39157577..fe66eec8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -15,6 +15,7 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute + def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index fc1f817c..8bf157fd 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -24,33 +24,33 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: # enable this checkpoint to restore the weights of the best performing model if opts.aeType == "deterministic": checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, ) else: checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, - save_weights_only=True + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + save_weights_only=True, ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore early_stop = EarlyStopping( - monitor=target, - mode="min", - patience=settings.ac_train_patience, - min_delta=settings.ac_train_min_delta, - verbose=1, - restore_best_weights=True, + monitor=target, 
+ mode="min", + patience=settings.ac_train_patience, + min_delta=settings.ac_train_min_delta, + verbose=1, + restore_best_weights=True, ) callbacks.append(early_stop) if opts.aeWabTracking and not opts.wabTracking: diff --git a/dfpl/options.py b/dfpl/options.py index 5e3519f6..c266e24b 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -472,7 +472,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: "--trainFNN", action="store_false", help="When called it deactivates the training.", - default=argparse.SUPPRESS + default=argparse.SUPPRESS, ) training_args.add_argument( "--sampleFractionOnes", @@ -1260,7 +1260,7 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: files_args.add_argument( "--calibration_atom_descriptors_path", type=str, - help="Extra atom descriptors file.", + help="Extra atom descriptors file.", ) files_args.add_argument( "--calibration_bond_descriptors_path", diff --git a/dfpl/utils.py b/dfpl/utils.py index f3019084..dd1ab108 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -105,7 +105,9 @@ def createArgsFromJson(jsonFile: str) -> List[str]: arg_index = cli_args.index(cli_arg_key) + 1 if isinstance(value, bool): value = parse_cli_boolean(cli_args, cli_arg_key) - elif arg_index < len(cli_args) and not cli_args[arg_index].startswith('--'): + elif arg_index < len(cli_args) and not cli_args[arg_index].startswith( + "--" + ): cli_value = cli_args[arg_index] if isinstance(value, list): value = parse_cli_list(cli_value) @@ -125,7 +127,11 @@ def createArgsFromJson(jsonFile: str) -> List[str]: if arg.startswith("--"): key = arg.lstrip("--") if key not in data: - value = True if i + 1 >= len(cli_args) or cli_args[i + 1].startswith("--") else cli_args[i + 1] + value = ( + True + if i + 1 >= len(cli_args) or cli_args[i + 1].startswith("--") + else cli_args[i + 1] + ) if isinstance(value, bool): if value: arguments.append(arg) diff --git a/dfpl/vae.py b/dfpl/vae.py index cc61b17d..ebb9957c 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py 
@@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model,load_model +from tensorflow.keras.models import Model, load_model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks @@ -26,7 +26,9 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize # This should be the intended size of your latent space, e.g., 256 + encoding_dim = ( + opts.encFPSize + ) # This should be the intended size of your latent space, e.g., 256 lr_schedule = optimizers.schedules.ExponentialDecay( opts.aeLearningRate, @@ -45,25 +47,56 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mo # 1st hidden layer if opts.aeActivationFunction != "selu": - encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction)(input_vec) + encoded = Dense( + units=int(input_size / 2), activation=opts.aeActivationFunction + )(input_vec) else: - encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(input_vec) + encoded = Dense( + units=int(input_size / 2), + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )(input_vec) # encoding layers - for i in range(1, hidden_layer_count - 1): # Adjust the range to stop before the latent space layers + for i in range( + 1, hidden_layer_count - 1 + ): # Adjust the range to stop before the latent space layers factor_units = 2 ** (i + 1) if opts.aeActivationFunction != "selu": - encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(encoded) + encoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + )(encoded) else: - encoded = Dense(units=int(input_size / factor_units), 
activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) + encoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )(encoded) # latent space layers if opts.aeActivationFunction != "selu": - z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim - z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)( + encoded + ) # Adjusted size to encoding_dim else: - z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim - z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + z_mean = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim + z_log_var = Dense( + units=encoding_dim, + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )( + encoded + ) # Adjusted size to encoding_dim # sampling layer def sampling(args): @@ -78,20 +111,28 @@ def sampling(args): # decoding layers for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2 ** i + factor_units = 2**i if opts.aeActivationFunction != "selu": - decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(decoded) + decoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + )(decoded) else: - decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, 
kernel_initializer="lecun_normal")(decoded) + decoded = Dense( + units=int(input_size / factor_units), + activation=opts.aeActivationFunction, + kernel_initializer="lecun_normal", + )(decoded) # output layer - decoded = Dense(units=input_size, activation="sigmoid", bias_initializer=output_bias)(decoded) + decoded = Dense( + units=input_size, activation="sigmoid", bias_initializer=output_bias + )(decoded) autoencoder = Model(input_vec, decoded) encoder = Model(input_vec, z) autoencoder.summary(print_fn=logging.info) - # KL divergence loss def kl_loss(z_mean, z_log_var): return -0.5 * K.sum( @@ -238,7 +279,9 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) # Train the VAE on the training data - callback_list = callbacks.autoencoder_callback(checkpoint_path=f"{save_path}.h5", opts=opts) + callback_list = callbacks.autoencoder_callback( + checkpoint_path=f"{save_path}.h5", opts=opts + ) vae_hist = vae.fit( x_train, diff --git a/example/train.json b/example/train.json index 53575adc..7b23c4e9 100755 --- a/example/train.json +++ b/example/train.json @@ -6,8 +6,8 @@ "ecWeightsFile": "", "verbose": 2, - "trainAC": false, - "compressFeatures": false, + "trainAC": true, + "compressFeatures": true, "visualizeLatent": false, "encFPSize": 256, From 7348fd02d979f345845aa196a536527e98c5217d Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 14:02:30 +0200 Subject: [PATCH 37/48] blacked and flaked --- dfpl/autoencoder.py | 1 - dfpl/utils.py | 112 +++++++++++++++++++++++++++++++++++++++++++- dfpl/vae.py | 4 +- 3 files changed, 111 insertions(+), 6 deletions(-) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 6909b156..b2b13d76 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,7 +1,6 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt diff --git a/dfpl/utils.py 
b/dfpl/utils.py index 7cfe04af..1b8f1a9a 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,11 +1,15 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random from typing import Dict, List, Set, Tuple, Type, TypeVar, Union + import jsonpickle import numpy as np import pandas as pd @@ -14,7 +18,48 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. + """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -31,10 +76,28 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def parse_cli_list(value: str): + # Simple parser for lists passed as 
comma-separated values + return value.split(",") + + +def parse_cli_boolean(cli_args, cli_arg_key): + # Determines boolean value based on command line presence + if cli_arg_key in cli_args: + return True # Presence of flag implies True + return False + + +def createArgsFromJson(jsonFile: str) -> List[str]: arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + cli_args = sys.argv[1:] # Skipping the script name itself + + with open(jsonFile, "r") as f: data = json.load(f) + + processed_cli_keys = [] # To track which CLI keys have been processed + for key, value in data.items(): if key not in ignore_elements: cli_arg_key = f"--{key}" @@ -175,6 +238,51 @@ def weight_split( return train_df, val_df, test_df +def generate_scaffold( + mol: Union[str, Chem.Mol, Tuple[Chem.Mol, Chem.Mol]], include_chirality: bool = True +) -> str: + """ + Computes the Bemis-Murcko scaffold for a SMILES string, an RDKit molecule, or an InChI string or InChIKey. + + :param mol: A SMILES, RDKit molecule, InChI string, or InChIKey string. + :param include_chirality: Whether to include chirality in the computed scaffold. + :return: The Bemis-Murcko scaffold for the molecule. + """ + if isinstance(mol, str): + if mol.startswith("InChI="): + mol = inchi_to_mol(mol) + else: + mol = make_mol(mol, keep_h=False, add_h=False, keep_atom_map=False) + elif isinstance(mol, tuple): + mol = mol[0] + scaffold = MurckoScaffold.MurckoScaffoldSmiles( + mol=mol, includeChirality=include_chirality + ) + + return scaffold + + +def scaffold_to_smiles( + mols: List[str], use_indices: bool = False +) -> Dict[str, Union[Set[str], Set[int]]]: + """ + Computes the scaffold for each SMILES and returns a mapping from scaffolds to sets of smiles (or indices). + :param mols: A list of SMILES. + :param use_indices: Whether to map to the SMILES's index in :code:`mols` rather than + mapping to the smiles string itself. This is necessary if there are duplicate smiles. 
+ :return: A dictionary mapping each unique scaffold to all SMILES (or indices) which have that scaffold. + """ + scaffolds = defaultdict(set) + for i, mol in tqdm(enumerate(mols), total=len(mols)): + scaffold = generate_scaffold(mol) + if use_indices: + scaffolds[scaffold].add(i) + else: + scaffolds[scaffold].add(mol) + + return scaffolds + + def ae_scaffold_split( data: pd.DataFrame, sizes: Tuple[float, float, float] = (0.8, 0, 0.2), diff --git a/dfpl/vae.py b/dfpl/vae.py index ebb9957c..45cfda7a 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,8 +1,6 @@ -import csv import logging import math import os.path -from os.path import basename from typing import Tuple import numpy as np @@ -13,7 +11,7 @@ from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.models import Model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks From 40be7bb31ecceabe183dc8ef06419bede79e5c9a Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 14:49:56 +0200 Subject: [PATCH 38/48] trying fix for cmd and json args --- example/train.json | 1 - 1 file changed, 1 deletion(-) diff --git a/example/train.json b/example/train.json index 726cfee2..a616cd89 100755 --- a/example/train.json +++ b/example/train.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.Options", "inputFile": "tests/data/tox21.csv", - "inputFile": "tests/data/smiles.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", "ecWeightsFile": "", From 35a63ee5f025ff75bbaa2049ab6aa0bcb53e0641 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 15:00:15 +0200 Subject: [PATCH 39/48] changed path for input file --- example/train.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/train.json b/example/train.json index 
a616cd89..131d800d 100755 --- a/example/train.json +++ b/example/train.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/tox21.csv", + "inputFile": "tests/data/smiles.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", "ecWeightsFile": "", From 1ba017c9d2766052d08b6c54cc053f77df9ad77e Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 15:06:55 +0200 Subject: [PATCH 40/48] changed path for input file --- example/train.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/train.json b/example/train.json index 131d800d..bf57e7e2 100755 --- a/example/train.json +++ b/example/train.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", "ecWeightsFile": "", From 11e808cccef5d06077f1937421a347bfa04fbbb6 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 15:18:39 +0200 Subject: [PATCH 41/48] changed path for input file --- example/predict.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/example/predict.json b/example/predict.json index e3305c7c..645bae78 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,12 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/tox21.csv", + "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", "ecModelDir": "example/results_train/random_split_autoencoder", "ecWeightsFile": "random_split_autoencoder_encoder.h5", - "fnnModelDir": "example/results_train/NR-AR-1_best_saved_model", - "aeType": "variational", + "fnnModelDir": "example/results_train/AR-1_best_saved_model", + "aeType": "deterministic", "compressFeatures": true, "trainFNN": false } From 49894218e31db0b31ee9c4ce4797fae505bb71a2 Mon Sep 17 00:00:00 2001 From: Kyriakos 
Soulios Date: Thu, 11 Jul 2024 15:37:28 +0200 Subject: [PATCH 42/48] changed path for input file --- example/predict.json | 3 +-- example/traingnn.json | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/example/predict.json b/example/predict.json index 645bae78..d96ad803 100755 --- a/example/predict.json +++ b/example/predict.json @@ -3,8 +3,7 @@ "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_split_autoencoder", - "ecWeightsFile": "random_split_autoencoder_encoder.h5", + "ecModelDir": "example/results_train/random_split_autoencoder/encoder_model", "fnnModelDir": "example/results_train/AR-1_best_saved_model", "aeType": "deterministic", "compressFeatures": true, diff --git a/example/traingnn.json b/example/traingnn.json index fa2b714f..5536f700 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -10,5 +10,5 @@ "dataset_type": "classification", "smiles_columns": "smiles", "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","precision"], - "hidden_size": 256 + "hidden_size": 300 } From 8cc13b413692306635f9a0562524c0ac531d4862 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Thu, 11 Jul 2024 15:48:57 +0200 Subject: [PATCH 43/48] changed path for test file in pr.yml --- .github/workflows/pr.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pr.yml b/.github/workflows/pr.yml index 43512abf..d7d4ae76 100644 --- a/.github/workflows/pr.yml +++ b/.github/workflows/pr.yml @@ -95,8 +95,8 @@ jobs: exit 1 fi echo "result lines "$(wc -l preds_dmpnn/DMPNN_preds.csv) - if [ "$(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" -lt "6" ]; then - echo "predict result should have at least 5 lines. But had only $(cat preds_dmpnn/DMPNN_preds.csv | wc -l)" >&2 + if [ "$(cat preds_dmpnn/preds.csv | wc -l)" -lt "6" ]; then + echo "predict result should have at least 5 lines. 
But had only $(cat preds_dmpnn/preds.csv | wc -l)" >&2 exit 1 fi From 8244c26341bb850cb20beb729e5b6a38182f90a3 Mon Sep 17 00:00:00 2001 From: tom-mohr <24864629+tom-mohr@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:02:50 +0200 Subject: [PATCH 44/48] fix --compressFeatures arg for "dfpl train" --- dfpl/options.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dfpl/options.py b/dfpl/options.py index c266e24b..3318ac65 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -338,11 +338,11 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: default=2048, ) general_args.add_argument( + "-c", "--compressFeatures", - metavar="BOOL", - type=bool, + action="store_true", + default=False, help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - default=argparse.SUPPRESS, ) general_args.add_argument( "--enableMultiLabel", From 9de595acce8485d0f6dfab54e68ff1d911565156 Mon Sep 17 00:00:00 2001 From: tom-mohr <24864629+tom-mohr@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:16:26 +0200 Subject: [PATCH 45/48] - added parsing for --fnnType --- dfpl/options.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/dfpl/options.py b/dfpl/options.py index 3318ac65..42ce57e5 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -445,6 +445,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="Fraction of the dataset that should be used for testing. 
Value in [0,1].", default=argparse.SUPPRESS, ) + autoencoder_args.add_argument( + "--fnnType", + type=str, + choices=["FNN", "SNN"], + help="The type of the feedforward neural network.", + default="FNN", + ) training_args.add_argument( "-K", "--kFolds", From af7f2cc629c67fa402b03faa4fbc30e7c0ccc979 Mon Sep 17 00:00:00 2001 From: tom-mohr <24864629+tom-mohr@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:21:01 +0200 Subject: [PATCH 46/48] include missing json file path in error message --- dfpl/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dfpl/utils.py b/dfpl/utils.py index 1b8f1a9a..58ba0965 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -51,7 +51,7 @@ def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: for key, value in vars(content).items(): setattr(result, key, value) else: - raise ValueError("Could not find JSON input file") + raise ValueError(f"Could not find JSON input file {jsonFile}") # Override with user-provided command-line arguments for key in arg_flags: From 225907adb1865fe894a999ba74794c813987890b Mon Sep 17 00:00:00 2001 From: tom-mohr <24864629+tom-mohr@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:22:21 +0200 Subject: [PATCH 47/48] add "dfpl.egg-info" and "build" directory to .gitignore --- .gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.gitignore b/.gitignore index 1d102646..5decebbd 100644 --- a/.gitignore +++ b/.gitignore @@ -18,3 +18,5 @@ example/data/convert.log results release_asset LICENSE.docx +dfpl.egg-info +build \ No newline at end of file From 46f7da38f2d284956c3c21d9f7dee0c68bdfae08 Mon Sep 17 00:00:00 2001 From: tom-mohr <24864629+tom-mohr@users.noreply.github.com> Date: Tue, 6 Aug 2024 16:34:06 +0200 Subject: [PATCH 48/48] correctly load defaults with argparse - unified handling of --configFile - specify default values only in the parsing, nowhere else (but only for "train" and "predict", for the other modes, this still needs to be done) - put modes into separate 
files (train, predict, trainggn, predictgnn, interpretgnn, convert) - parse JSON configs just like CLI args (this will give the user feedback concerning syntax errors etc in the JSON, just like with CLI args) - removed jsonpickle dependency - removed py/object from JSON configs (not needed anymore) - removed aeOptimizer argument (wasn't being used) - added parsing for --fnnType - split train/predict Options into three classes (common, train, predict) - added todo's regarding some confusing parts --- dfpl/__main__.py | 281 +---- dfpl/autoencoder.py | 73 +- dfpl/callbacks.py | 38 +- dfpl/compression.json | 14 + dfpl/convert.py | 35 + dfpl/feedforwardNN.py | 23 +- dfpl/interpretgnn.py | 100 ++ dfpl/options.py | 1442 ------------------------ dfpl/parse.py | 134 +++ dfpl/predict.py | 183 +++ dfpl/predictgnn.py | 190 ++++ dfpl/predictions.py | 12 +- dfpl/single_label_model.py | 35 +- dfpl/train.py | 491 ++++++++ dfpl/traingnn.py | 627 +++++++++++ dfpl/utils.py | 68 +- dfpl/vae.py | 9 +- environment.yml | 1 - example/predict.json | 1 - example/predictgnn.json | 1 - example/train.json | 2 - example/traingnn.json | 1 - setup.py | 1 - singularity_container/environment.yaml | 1 - tests/run_autoencoder.py | 6 +- tests/run_fnntraining.py | 6 +- tests/run_predictgnn.py | 6 +- tests/run_prediction.py | 8 +- tests/run_traingnn.py | 6 +- tests/run_vae.py | 6 +- tests/test_fractional_sampling.py | 2 +- tests/try_fpcomparison.py | 6 +- 32 files changed, 1952 insertions(+), 1857 deletions(-) create mode 100644 dfpl/compression.json create mode 100644 dfpl/convert.py create mode 100644 dfpl/interpretgnn.py delete mode 100644 dfpl/options.py create mode 100644 dfpl/parse.py create mode 100644 dfpl/predict.py create mode 100644 dfpl/predictgnn.py create mode 100644 dfpl/train.py create mode 100644 dfpl/traingnn.py diff --git a/dfpl/__main__.py b/dfpl/__main__.py index fe66eec8..c17950b8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,268 +1,39 @@ -import dataclasses -import 
logging -import os +import sys from argparse import Namespace -from os import path -import chemprop -from keras.models import load_model +from dfpl.parse import parse_dfpl +from dfpl.convert import convert +from dfpl.interpretgnn import interpretdmpnn +from dfpl.predictgnn import predictdmpnn +from dfpl.train import train +from dfpl.predict import predict +from dfpl.traingnn import traindmpnn -from dfpl import autoencoder as ac -from dfpl import feedforwardNN as fNN -from dfpl import fingerprint as fp -from dfpl import options, predictions -from dfpl import single_label_model as sl -from dfpl import vae as vae -from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute +def run_dfpl(args: Namespace): + subprogram_name = args.method -def traindmpnn(opts: options.GnnOptions) -> None: - """ - Train a D-MPNN model using the given options. - Args: - - opts: options.GnnOptions instance containing the details of the training - Returns: - - None - """ - # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson(jsonFile=opts.configFile) - opts = chemprop.args.TrainArgs().parse_args(arguments) - logging.info("Training DMPNN...") - mean_score, std_score = chemprop.train.cross_validate( - args=opts, train_func=chemprop.train.run_training - ) - logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") + # The ".method" attribute is added by the parser, + # specifying which subprogram was chosen by the user. + # However, the subprograms don't expect the ".method" attribute, + # so we remove it here before calling the subprogram. + del args.method - -def predictdmpnn(opts: options.GnnOptions) -> None: - """ - Predict the values using a trained D-MPNN model with the given options. 
- Args: - - opts: options.GnnOptions instance containing the details of the prediction - Returns: - - None - """ - # Load options and additional arguments from a JSON file - arguments = createArgsFromJson(jsonFile=opts.configFile) - opts = chemprop.args.PredictArgs().parse_args(arguments) - - chemprop.train.make_predictions(args=opts) - - -def interpretdmpnn(opts: options.GnnOptions) -> None: - """ - Interpret the predictions of a trained D-MPNN model with the given options. - Args: - - opts: options.GnnOptions instance containing the details of the prediction - Returns: - - None - """ - # Load options and additional arguments from a JSON file - arguments = createArgsFromJson(jsonFile=opts.configFile) - opts = chemprop.args.InterpretArgs().parse_args(arguments) - - chemprop.interpret.interpret(args=opts, save_to_csv=True) - - -def train(opts: options.Options): - """ - Run the main training procedure - :param opts: Options defining the details of the training - """ - # import data from file and create DataFrame - if "tsv" in opts.inputFile: - df = fp.importDataFile( - opts.inputFile, import_function=fp.importDstoxTSV, fp_size=opts.fpSize - ) - else: - df = fp.importDataFile( - opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize - ) - # initialize (auto)encoders to None - encoder = None - autoencoder = None - if opts.trainAC: - if opts.aeType == "deterministic": - encoder, train_indices, test_indices = ac.train_full_ac(df, opts) - elif opts.aeType == "variational": - encoder, train_indices, test_indices = vae.train_full_vae(df, opts) - else: - raise ValueError(f"Unknown autoencoder type: {opts.aeType}") - - # if feature compression is enabled - if opts.compressFeatures: - if not opts.trainAC: - if opts.aeType == "variational": - (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - else: - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) - 
else: - autoencoder.load_weights( - os.path.join(opts.ecModelDir, opts.ecWeightsFile) - ) - # compress the fingerprints using the autoencoder - df = ac.compress_fingerprints(df, encoder) - if opts.visualizeLatent and opts.trainAC: - ac.visualize_fingerprints( - df, - train_indices=train_indices, - test_indices=test_indices, - save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png", - ) - elif opts.visualizeLatent: - logging.info( - "Visualizing latent space is only available if you train the autoencoder. Skipping visualization." - ) - - # train single label models if requested - if opts.trainFNN and not opts.enableMultiLabel: - sl.train_single_label_models(df=df, opts=opts) - - # train multi-label models if requested - if opts.trainFNN and opts.enableMultiLabel: - fNN.train_nn_models_multi(df=df, opts=opts) - - -def predict(opts: options.Options) -> None: - """ - Run prediction given specific options - :param opts: Options defining the details of the prediction - """ - # import data from file and create DataFrame - if "tsv" in opts.inputFile: - df = fp.importDataFile( - opts.inputFile, import_function=fp.importDstoxTSV, fp_size=opts.fpSize - ) - else: - df = fp.importDataFile( - opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize - ) - - if opts.compressFeatures: - # load trained model for autoencoder - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - if opts.aeType == "variational": - (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - # Load trained model for autoencoder - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) - else: - encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) - df = ac.compress_fingerprints(df, encoder) - - # Run predictions on the compressed fingerprints and store the results in a dataframe - df2 = predictions.predict_values(df=df, opts=opts) - - # Extract the column names from the dataframe, excluding 
the 'fp' and 'fpcompressed' columns - names_columns = [c for c in df2.columns if c not in ["fp", "fpcompressed"]] - - # Save the predicted values to a CSV file in the output directory - df2[names_columns].to_csv(path_or_buf=path.join(opts.outputDir, opts.outputFile)) - - # Log successful completion of prediction and the file path where the results were saved - logging.info( - f"Prediction successful. Results written to '{path.join(opts.outputDir, opts.outputFile)}'" - ) - - -def createLogger(filename: str) -> None: - """ - Set up a logger for the main function that also saves to a log file - """ - # get root logger and set its level - logger = logging.getLogger() - logger.setLevel(logging.INFO) - # create file handler which logs info messages - fh = logging.FileHandler(filename, mode="w") - fh.setLevel(logging.INFO) - # create console handler - ch = logging.StreamHandler() - ch.setLevel(logging.INFO) - # create formatter and add it to the handlers - formatterFile = logging.Formatter( - "{asctime} - {name} - {levelname} - {message}", style="{" - ) - formatterConsole = logging.Formatter("{levelname} {message}", style="{") - fh.setFormatter(formatterFile) - ch.setFormatter(formatterConsole) - # add the handlers to the logger - logger.addHandler(fh) - logger.addHandler(ch) + { + "traingnn": traindmpnn, + "predictgnn": predictdmpnn, + "interpretgnn": interpretdmpnn, + "train": train, + "predict": predict, + "convert": convert + }[subprogram_name](args) def main(): - """ - Main function that runs training/prediction defined by command line arguments - """ - - parser = options.createCommandlineParser() - prog_args: Namespace = parser.parse_args() - try: - if prog_args.method == "convert": - directory = makePathAbsolute(prog_args.f) - if path.isdir(directory): - createLogger(path.join(directory, "convert.log")) - logging.info(f"Convert all data files in {directory}") - fp.convert_all(directory) - else: - raise ValueError("Input directory is not a directory") - elif 
prog_args.method == "traingnn": - traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - createLogger("traingnn.log") - traindmpnn(traingnn_opts) - - elif prog_args.method == "predictgnn": - predictgnn_opts = options.PredictGnnOptions.fromCmdArgs(prog_args) - createLogger("predictgnn.log") - predictdmpnn(predictgnn_opts) - elif prog_args.method == "interpretgnn": - interpretgnn_opts = options.InterpretGNNoptions.fromCmdArgs(prog_args) - createLogger("interpretgnn.log") - interpretdmpnn(interpretgnn_opts) - - elif prog_args.method == "train": - if prog_args.configFile is None and prog_args.inputFile is None: - parser.error("Either --configFile or --inputFile must be provided.") + args = parse_dfpl(*sys.argv[1:]) - train_opts = options.Options.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - train_opts, - inputFile=makePathAbsolute(train_opts.inputFile), - outputDir=makePathAbsolute(train_opts.outputDir), - ) - createDirectory(fixed_opts.outputDir) - createLogger(path.join(fixed_opts.outputDir, "train.log")) - logging.info( - f"The following arguments are received or filled with default values:\n{fixed_opts}" - ) - train(fixed_opts) - elif prog_args.method == "predict": - if prog_args.configFile is None and prog_args.inputFile is None: - parser.error("Either --configFile or --inputFile must be provided.") - predict_opts = options.Options.fromCmdArgs(prog_args) - fixed_opts = dataclasses.replace( - predict_opts, - inputFile=makePathAbsolute(predict_opts.inputFile), - outputDir=makePathAbsolute(predict_opts.outputDir), - outputFile=makePathAbsolute( - path.join(predict_opts.outputDir, predict_opts.outputFile) - ), - ecModelDir=makePathAbsolute(predict_opts.ecModelDir), - fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - ) - createDirectory(fixed_opts.outputDir) - createLogger(path.join(fixed_opts.outputDir, "predict.log")) - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - 
predict(fixed_opts) - except AttributeError as e: - print(e) - parser.print_usage() + # dynamic import after parsing was successful (to allow for faster CLI feedback) + run_dfpl(args) if __name__ == "__main__": diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index b2b13d76..df30c75b 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,26 +1,28 @@ import logging import math import os.path +from os.path import basename from typing import Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import umap.umap_ as umap +import umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.models import Model from dfpl import callbacks from dfpl import history as ht -from dfpl import options, settings +from dfpl import settings +from dfpl.train import TrainOptions from dfpl.utils import ae_scaffold_split, weight_split -def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def define_ac_model(opts: TrainOptions, output_bias=None) -> Tuple[Model, Model]: """ This function provides an autoencoder model to reduce a certain input to a compressed version. 
@@ -31,13 +33,9 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - lr_schedule = optimizers.schedules.ExponentialDecay( - opts.aeLearningRate, - decay_steps=1000, - decay_rate=opts.aeLearningRateDecay, - staircase=True, + ac_optimizer = optimizers.Adam( + learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay ) - ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -107,6 +105,7 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer + # to either 0 or 1 and hence we use sigmoid activation function. decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -133,12 +132,12 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod return autoencoder, encoder -def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: +def train_full_ac(df: pd.DataFrame, opts: TrainOptions) -> Model: """ Trains an autoencoder on the given feature matrix X. The response matrix is only used to split the data into meaningful test and train sets. 
- :param opts: Command line arguments as defined in options.py + :param opts: Command line arguments :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder :return: The encoder model of the trained autoencoder """ @@ -147,8 +146,37 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") + # Define output files for autoencoder and encoder weights + if opts.ecWeightsFile == "": + # If no encoder weights file is specified, use the input file name to generate a default file name + logging.info("No AE encoder weights file specified") + base_file_name = ( + os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType + ) + logging.info( + f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" + ) + ac_weights_file = os.path.join( + opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" + ) + # ec_weights_file = os.path.join( + # opts.outputDir, base_file_name + ".encoder.weights.hdf5" + # ) + else: + # If an encoder weights file is specified, use it as the encoder weights file name + logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") + base_file_name = ( + os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType + ) + ac_weights_file = os.path.join( + opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" + ) + # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) + # Collect the callbacks for training + callback_list = callbacks.autoencoder_callback( + checkpoint_path=ac_weights_file, opts=opts + ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -259,35 +287,30 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. 
the input size and the dimension of the bottle neck (z!) (autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) + # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=[callback_list], + callbacks=callback_list, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) + logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=save_path, + base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), hist=auto_hist, ) # Save the autoencoder callback model to disk + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") if opts.testSize > 0.0: - # Re-define autoencoder and encoder using your function - callback_autoencoder = load_model(filepath=save_path) - _, callback_encoder = define_ac_model(opts) - for i, layer in enumerate(callback_encoder.layers): - layer.set_weights(callback_autoencoder.layers[i].get_weights()) - - # Save the encoder model - encoder_save_path = os.path.join(save_path, "encoder_model") - callback_encoder.save(filepath=encoder_save_path) + (callback_autoencoder, callback_encoder) = define_ac_model(opts) + callback_encoder.save(filepath=save_path) else: encoder.save(filepath=save_path) # Return the encoder model of the trained autoencoder diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 8bf157fd..7a935f49 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -4,10 +4,11 @@ # for testing in Weights & Biases from wandb.keras import WandbCallback -from dfpl import options, settings +from dfpl import settings +from dfpl.train import TrainOptions -def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: +def 
autoencoder_callback(checkpoint_path: str, opts: TrainOptions) -> list: """ Callbacks for fitting the autoencoder @@ -22,25 +23,15 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - if opts.aeType == "deterministic": - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, - ) - else: - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, - save_weights_only=True, - ) + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + period=settings.ac_train_check_period, + save_best_only=True, + save_weights_only=True, + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore @@ -53,12 +44,13 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) + if opts.aeWabTracking and not opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks -def nn_callback(checkpoint_path: str, opts: options.Options) -> list: +def nn_callback(checkpoint_path: str, opts: TrainOptions) -> list: """ Callbacks for fitting the feed forward network (FNN) @@ -74,7 +66,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - save_freq="epoch", + period=settings.nn_train_check_period, save_best_only=True, monitor="val_loss", mode="min", diff --git a/dfpl/compression.json b/dfpl/compression.json new file mode 100644 index 00000000..0fb9520d --- /dev/null +++ b/dfpl/compression.json @@ -0,0 +1,14 @@ +{ + "fpSize": 2048, + "encFPSize": 256, + "aeLearningRate": 0.001, + "aeLearningRateDecay": 0.96, + "aeActivationFunction": "selu", + "ecWeightsFile": 
"", + "ecModelDir": "", + "aeType": "deterministic", + "aeEpochs": 100, + "aeBatchSize": 512, + "aeSplitType": "random", + "visualizeLatent": false +} diff --git a/dfpl/convert.py b/dfpl/convert.py new file mode 100644 index 00000000..0cc49e5e --- /dev/null +++ b/dfpl/convert.py @@ -0,0 +1,35 @@ +from __future__ import annotations + +import argparse +import logging +from argparse import Namespace +from os import path + +from dfpl import fingerprint as fp +from dfpl.utils import makePathAbsolute, createLogger + + +def parseInputConvert(parser_convert: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + parser_convert.add_argument( + "-f", + metavar="FILE", + type=str, + help="Input directory where your CSV/TSV files are stored.", + required=True, + default="", + ) + + +def convert(args: Namespace): + directory = makePathAbsolute(args.f) + if path.isdir(directory): + createLogger(path.join(directory, "convert.log")) + logging.info(f"Convert all data files in {directory}") + fp.convert_all(directory) + else: + raise ValueError("Input directory is not a directory") diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index bf4241aa..cb22433b 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -16,9 +16,10 @@ # for NN model functions from tensorflow.keras.models import Model, Sequential +from dfpl.train import TrainOptions from dfpl import callbacks as cb from dfpl import history as ht -from dfpl import options, settings +from dfpl import settings def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tuple: @@ -67,18 +68,12 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( - input_size: int, output_size: int, opts: options.Options + input_size: int, output_size: int, opts: TrainOptions ) -> Model: - lr_schedule = optimizers.schedules.ExponentialDecay( - 
opts.aeLearningRate, - decay_steps=1000, - decay_rate=opts.aeLearningRateDecay, - staircase=True, - ) if opts.optimizer == "Adam": - my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) + my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) elif opts.optimizer == "SGD": - my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) + my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -138,9 +133,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -237,7 +232,7 @@ def validate_multi_model_on_test_data( return [f1_random, f1_trained] -def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: +def train_nn_models_multi(df: pd.DataFrame, opts: TrainOptions) -> None: # find target columns names_y = [ c @@ -300,8 +295,6 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, - model_hist_csv_path, - model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/interpretgnn.py b/dfpl/interpretgnn.py new file mode 100644 index 00000000..a1c36756 --- /dev/null +++ b/dfpl/interpretgnn.py @@ -0,0 +1,100 @@ +from __future__ import annotations + +import argparse +from argparse import Namespace +from typing import List + +import chemprop +from chemprop.args import InterpretArgs + +from dfpl.utils import createLogger + + +class InterpretGNNoptions(InterpretArgs): + """ + Dataclass to hold all options used for 
training the graph models + """ + + # configFile: str = "./example/interpret.json" + data_path: str = "./example/data/smiles.csv" + batch_size: int = 500 + c_puct: float = 10.0 + max_atoms: int = 20 + min_atoms: int = 8 + prop_delta: float = 0.5 + property_id: List[int] = None + rollout: int = 20 + + +def parseInterpretGnn(parser_interpret_gnn: argparse.ArgumentParser) -> None: + interpret_gnn_files_args = parser_interpret_gnn.add_argument_group("Files") + interpret_gnn_interpret_args = parser_interpret_gnn.add_argument_group("Interpretation Configuration") + interpret_gnn_files_args.add_argument( + "--preds_path", + type=str, + metavar="FILE", + help="Path to CSV file where predictions will be saved", + default="", + ) + interpret_gnn_files_args.add_argument( + "--checkpoint_dir", + type=str, + metavar="DIR", + help="Directory from which to load model checkpoints" + "(walks directory and ensembles all models that are found)", + default="./ckpt", + ) + interpret_gnn_files_args.add_argument( + "--checkpoint_path", + type=str, + metavar="DIR", + help="Path to model checkpoint (.pt file)", + ) + interpret_gnn_files_args.add_argument( + "--data_path", + type=str, + metavar="FILE", + help="Path to CSV file for which predictions will be made", + ) + interpret_gnn_interpret_args.add_argument( + "--max_atoms", + type=int, + metavar="INT", + help="Maximum number of atoms to use for interpretation", + ) + + interpret_gnn_interpret_args.add_argument( + "--min_atoms", + type=int, + metavar="INT", + help="Minimum number of atoms to use for interpretation", + ) + + interpret_gnn_interpret_args.add_argument( + "--prop_delta", + type=float, + metavar="FLOAT", + help="The minimum change in the property of interest that is considered significant", + ) + interpret_gnn_interpret_args.add_argument( + "--property_id", + type=int, + metavar="INT", + help="The index of the property of interest", + ) + # write the argument for rollouts + interpret_gnn_interpret_args.add_argument( + 
"--rollout", + type=int, + metavar="INT", + help="The number of rollouts to use for interpretation", + ) + + +def interpretdmpnn(args: Namespace) -> None: + """ + Interpret the predictions of a trained D-MPNN model with the given options. + """ + createLogger("interpretgnn.log") + opts = InterpretGNNoptions(**vars(args)) + chemprop.interpret.interpret(args=opts, save_to_csv=True) diff --git a/dfpl/options.py b/dfpl/options.py deleted file mode 100644 index 42ce57e5..00000000 --- a/dfpl/options.py +++ /dev/null @@ -1,1442 +0,0 @@ -from __future__ import annotations - -import argparse -from dataclasses import dataclass -from pathlib import Path -from typing import List, Literal, Optional - -import jsonpickle -import torch -from chemprop.args import InterpretArgs, PredictArgs, TrainArgs - -from dfpl.utils import parseCmdArgs - - -@dataclass -class Options: - """ - Dataclass for all options necessary for training the neural nets - """ - - configFile: str = None - inputFile: str = "" - outputDir: str = "" # changes according to mode - outputFile: str = "" - ecWeightsFile: str = "" - ecModelDir: str = "" - fnnModelDir: str = "" - type: str = "smiles" - fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 100 - fpSize: int = 2048 - encFPSize: int = 256 - kFolds: int = 1 - testSize: float = 0.2 - enableMultiLabel: bool = False - verbose: int = 2 - trainAC: bool = False - trainFNN: bool = True - compressFeatures: bool = False - sampleFractionOnes: float = 0.5 - sampleDown: bool = False - split_type: str = "random" - aeSplitType: str = "random" - aeType: str = "deterministic" - aeEpochs: int = 100 - aeBatchSize: int = 512 - aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.96 - aeActivationFunction: str = "selu" - aeOptimizer: str = "Adam" - fnnType: str = "FNN" - batchSize: int = 128 - optimizer: str = "Adam" - learningRate: float = 0.001 - learningRateDecay: float = 0.96 - lossFunction: str = "bce" - activationFunction: str = "relu" - l2reg: 
float = 0.001 - dropout: float = 0.2 - threshold: float = 0.5 - visualizeLatent: bool = False # only if autoencoder is trained or loaded - gpu: int = None - aeWabTracking: bool = False # Wand & Biases autoencoder tracking - wabTracking: bool = False # Wand & Biases FNN tracking - wabTarget: str = "AR" # Wand & Biases target used for showing training progress - - def saveToFile(self, file: str) -> None: - """ - Saves an instance to a JSON file - """ - jsonFile = Path(file) - with jsonFile.open("w") as f: - f.write(jsonpickle.encode(self)) - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": - return parseCmdArgs(cls, args) - - -@dataclass -class GnnOptions(TrainArgs): - """ - Dataclass to hold all options used for training the graph models - """ - - total_epochs: int = 30 - save: bool = True - configFile: str = "" - data_path: str = "" - use_compound_names: bool = False - save_dir: str = "" - no_cache: bool = False - features_scaling: bool = True - use_input_features: str = "" - cuda: bool = False - num_lrs: int = 2 - minimize_score: bool = False - num_tasks: int = 12 - preds_path: str = "" - test_path: str = "" - save_preds: bool = True - calibration_method: str = "" - uncertainty_method: str = "" - calibration_path: str = "" - evaluation_methods: str = "" - evaluation_scores_path: str = "" - wabTracking: bool = False - split_sizes: List[float] = None - # save_smiles_splits: bool = False - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): - # Initialize with JSON config if provided - if json_config: - opts = cls(**json_config) - else: - opts = cls() - - # Update with command-line arguments - for key, value in vars(args).items(): - if value is not None: - setattr(opts, key, value) - - return opts - - -class PredictGnnOptions(PredictArgs): - """ - Dataclass to hold all options used for training the graph models - """ - - configFile: str = "" - calibration_atom_descriptors_path: str = None - 
calibration_features_path: str = None - calibration_interval_percentile: float = 95 - calibration_method: Optional[ - Literal[ - "zscaling", - "tscaling", - "zelikman_interval", - "mve_weighting", - "platt", - "isotonic", - ] - ] = None - calibration_path: str = None - calibration_phase_features_path: str = None - drop_extra_columns: bool = False - dropout_sampling_size: int = 10 - evaluation_methods: List[str] = None - evaluation_scores_path: str = None - # no_features_scaling: bool = True - individual_ensemble_predictions: bool = False - preds_path: str = None - regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None - test_path: str = None - uncertainty_dropout_p: float = 0.1 - uncertainty_method: Optional[ - Literal[ - "mve", - "ensemble", - "evidential_epistemic", - "evidential_aleatoric", - "evidential_total", - "classification", - "dropout", - ] - ] = None - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): - # Initialize with JSON config if provided - if json_config: - opts = cls(**json_config) - else: - opts = cls() - - # Update with command-line arguments - for key, value in vars(args).items(): - if value is not None: - setattr(opts, key, value) - - return opts - - -class InterpretGNNoptions(InterpretArgs): - """ - Dataclass to hold all options used for training the graph models - """ - - configFile: str = "./example/interpret.json" - data_path: str = "./example/data/smiles.csv" - batch_size: int = 500 - c_puct: float = 10.0 - max_atoms: int = 20 - min_atoms: int = 8 - prop_delta: float = 0.5 - property_id: List[int] = None - rollout: int = 20 - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): - # Initialize with JSON config if provided - if json_config: - opts = cls(**json_config) - else: - opts = cls() - - # Update with command-line arguments - for key, value in vars(args).items(): - if value is not None: - setattr(opts, key, value) 
- - return opts - - -def createCommandlineParser() -> argparse.ArgumentParser: - """ - Build the parser for arguments with its two subparsers - """ - parser = argparse.ArgumentParser(prog="deepFPlearn") - subparsers = parser.add_subparsers(help="Sub programs of deepFPlearn") - - parser_train_gnn = subparsers.add_parser( - "traingnn", help="Train new GNN models with your data" - ) - parser_train_gnn.set_defaults(method="traingnn") - parseTrainGnn(parser_train_gnn) - - parser_predict_gnn = subparsers.add_parser( - "predictgnn", help="Predict with your GNN models" - ) - parser_predict_gnn.set_defaults(method="predictgnn") - parsePredictGnn(parser_predict_gnn) - - parser_interpret_gnn = subparsers.add_parser( - "interpretgnn", help="Interpret your GNN models" - ) - parser_interpret_gnn.set_defaults(method="interpretgnn") - parseInterpretGnn(parser_interpret_gnn) - - parser_train = subparsers.add_parser( - "train", help="Train new models with your data" - ) - parser_train.set_defaults(method="train") - parseInputTrain(parser_train) - - parser_predict = subparsers.add_parser( - "predict", help="Predict your data with existing models" - ) - parser_predict.set_defaults(method="predict") - parseInputPredict(parser_predict) - - parser_convert = subparsers.add_parser( - "convert", help="Convert known data files to pickle serialization files" - ) - parser_convert.set_defaults(method="convert") - parseInputConvert(parser_convert) - return parser - - -def parseInputTrain(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. 
- """ - # Create argument groups - general_args = parser.add_argument_group("Model Configuration") - autoencoder_args = parser.add_argument_group("Autoencoder Configuration") - training_args = parser.add_argument_group("Training Configuration") - tracking_args = parser.add_argument_group("Tracking Configuration") - - # Model Configuration - general_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - default="example/train.json", - ) - general_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for training in " - "comma separated CSV format.The first column should be smiles.", - default="tests/data/smiles.csv", - ) - general_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output file name. Trained model and " - "respective stats will be returned in this directory.", - default="example/results_train/", - ) - - # TODO CHECK WHAT IS TYPE DOING? - general_args.add_argument( - "-t", - "--type", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default="fp", - ) - general_args.add_argument( - "-thr", - "--threshold", - type=float, - metavar="FLOAT", - help="Threshold for binary classification.", - default=0.5, - ) - general_args.add_argument( - "-gpu", - "--gpu", - metavar="INT", - type=int, - help="Select which gpu to use by index. If not available, leave empty", - default=None, - ) - general_args.add_argument( - "--fpType", - type=str, - choices=["topological", "MACCS"], - help="The type of fingerprint to be generated/used in input file. 
MACCS or topological are available.", - default="topological", - ) - general_args.add_argument( - "--fpSize", - type=int, - help="Length of the fingerprint that should be generated.", - default=2048, - ) - general_args.add_argument( - "-c", - "--compressFeatures", - action="store_true", - default=False, - help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - ) - general_args.add_argument( - "--enableMultiLabel", - metavar="BOOL", - type=bool, - help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, - ) - # Autoencoder Configuration - autoencoder_args.add_argument( - "-a", - "--ecWeightsFile", - type=str, - metavar="FILE", - help="The .hdf5 file of a trained encoder", - default="", - ) - autoencoder_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, - ) - autoencoder_args.add_argument( - "--aeType", - type=str, - choices=["variational", "deterministic"], - help="Autoencoder type, variational or deterministic.", - default="deterministic", - ) - autoencoder_args.add_argument( - "--aeEpochs", - metavar="INT", - type=int, - help="Number of epochs for autoencoder training.", - default=100, - ) - autoencoder_args.add_argument( - "--aeBatchSize", - metavar="INT", - type=int, - help="Batch size in autoencoder training.", - default=512, - ) - autoencoder_args.add_argument( - "--aeActivationFunction", - type=str, - choices=["relu", "selu"], - help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, - ) - autoencoder_args.add_argument( - "--aeLearningRate", - metavar="FLOAT", - type=float, - help="Learning rate for autoencoder training.", - default=0.001, - ) - autoencoder_args.add_argument( - "--aeLearningRateDecay", - metavar="FLOAT", - type=float, - help="Learning rate decay for autoencoder training.", - default=0.96, - ) - 
autoencoder_args.add_argument( - "--aeSplitType", - type=str, - choices=["scaffold_balanced", "random", "molecular_weight"], - help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, - ) - autoencoder_args.add_argument( - "-d", - "--encFPSize", - metavar="INT", - type=int, - help="Size of encoded fingerprint (z-layer of autoencoder).", - default=256, - ) - autoencoder_args.add_argument( - "--visualizeLatent", - action="store_true", - help="UMAP the latent space for exploration", - default=False, - ) - # Training Configuration - training_args.add_argument( - "--split_type", - type=str, - choices=["scaffold_balanced", "random", "molecular_weight"], - help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--testSize", - metavar="FLOAT", - type=float, - help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=argparse.SUPPRESS, - ) - autoencoder_args.add_argument( - "--fnnType", - type=str, - choices=["FNN", "SNN"], - help="The type of the feedforward neural network.", - default="FNN", - ) - training_args.add_argument( - "-K", - "--kFolds", - metavar="INT", - type=int, - help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "-v", - "--verbose", - type=int, - choices=[0, 1, 2], - help="Verbosity level. 
O: No additional output, " - + "1: Some additional output, 2: full additional output", - default=2, - ) - training_args.add_argument( - "--trainAC", - action="store_true", - help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--trainFNN", - action="store_false", - help="When called it deactivates the training.", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--sampleFractionOnes", - metavar="FLOAT", - type=float, - help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." - "only works if --sampleDown is enabled", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--sampleDown", - metavar="BOOL", - type=bool, - help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "-e", - "--epochs", - metavar="INT", - type=int, - help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, - ) - # TODO CHECK IF ALL LOSSES MAKE SENSE HERE - training_args.add_argument( - "--lossFunction", - type=str, - choices=["mse", "bce", "focal"], - help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--optimizer", - type=str, - choices=["Adam", "SGD"], - help='Optimizer to use for backpropagation in the FNN. 
Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, - ) - training_args.add_argument( - "--batchSize", - metavar="INT", - type=int, - help="Batch size in FNN training.", - default=128, - ) - training_args.add_argument( - "--l2reg", - metavar="FLOAT", - type=float, - help="Value for l2 kernel regularizer.", - default=0.001, - ) - training_args.add_argument( - "--dropout", - metavar="FLOAT", - type=float, - help="The fraction of data that is dropped out in each dropout layer.", - default=0.2, - ) - training_args.add_argument( - "--learningRate", - metavar="FLOAT", - type=float, - help="Learning rate size in FNN training.", - default=0.000022, - ) - training_args.add_argument( - "--learningRateDecay", - metavar="FLOAT", - type=float, - help="Learning rate size in FNN training.", - default=0.96, - ) - training_args.add_argument( - "--activationFunction", - type=str, - choices=["relu", "selu"], - help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, - ) - # Tracking Configuration - tracking_args.add_argument( - "--aeWabTracking", - metavar="BOOL", - type=bool, - help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, - ) - tracking_args.add_argument( - "--wabTracking", - metavar="BOOL", - type=bool, - help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, - ) - tracking_args.add_argument( - "--wabTarget", - metavar="STRING", - type=str, - choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], - help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default="AR", - ) - - -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. 
- """ - - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default="tests/data/smiles.csv", - ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default="example/results_predict/", - ) - files_args.add_argument( - "--outputFile", - metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default="results.csv", - ) - # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? - general_args.add_argument( - "-t", - "--type", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default="fp", - ) - general_args.add_argument( - "-k", - "--fpType", - type=str, - choices=["topological", "MACCS"], - help="The type of fingerprint to be generated/used in input file. 
Should be the same as the type of the fps that the model was trained upon.", - default="topological", - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default="", - ) - files_args.add_argument( - "--ecWeightsFile", - type=str, - metavar="STR", - help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", - default="", - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default="example/results_train/AR_saved_model", - ) - general_args.add_argument( - "-c", "--compressFeatures", action="store_true", default=False - ) - ( - general_args.add_argument( - "--aeType", - type=str, - choices=["variational", "deterministic"], - help="Autoencoder type, variational or deterministic.", - default="deterministic", - ) - ) - - -def parseTrainGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - data_args = parser.add_argument_group("Data Configuration") - files_args = parser.add_argument_group("Files") - model_args = parser.add_argument_group("Model arguments") - training_args = parser.add_argument_group("Training Configuration") - uncertainty_args = parser.add_argument_group("Uncertainty Configuration") - uncertainty_args.add_argument( - "--uncertainty_method", - type=str, - choices=[ - "mve", - "ensemble", - "evidential_epistemic", - "evidential_aleatoric", - "evidential_total", - "classification", - "dropout", - "dirichlet", - ], - help="Method to use for uncertainty estimation", - default="none", - ) - # Uncertainty arguments - uncertainty_args.add_argument( - "--calibration_method", - type=str, - choices=[ - "zscaling", - 
"tscaling", - "zelikman_interval", - "mve_weighting", - "platt", - "isotonic", - ], - help="Method to use for calibration", - default="none", - ) - uncertainty_args.add_argument( - "--calibration_path", - type=str, - metavar="FILE", - help="Path to file with calibration data", - ) - - # General arguments - general_args.add_argument("--split_key_molecule", type=int) - general_args.add_argument("--pytorch_seed", type=int) - general_args.add_argument("--cache_cutoff", type=float) - general_args.add_argument("--save_preds", type=bool) - general_args.add_argument("--wabTracking", action="store_true", default=False) - general_args.add_argument( - "--cuda", action="store_true", default=False, help="Turn on cuda" - ) - # general_args.add_argument( - # "--save_smiles_splits", - # action="store_true", - # default=False, - # help="Save smiles for each train/val/test splits for prediction convenience later", - # ) - general_args.add_argument( - "--test", - action="store_true", - default=False, - help="Whether to skip training and only test the model", - ) - general_args.add_argument( - "--gpu", - type=int, - choices=list(range(torch.cuda.device_count())), - help="Which GPU to use", - ) - general_args.add_argument("--save", type=bool) - general_args.add_argument( - "--quiet", - action="store_true", - default=False, - help="Skip non-essential print statements", - ) - general_args.add_argument( - "--log_frequency", - type=int, - metavar="INT", - default=10, - help="The number of batches between each logging of the training loss", - ) - general_args.add_argument( - "--no_cache", - action="store_true", - default=False, - help="Turn off caching mol2graph computation", - ) - - # FILES ARGUMENTS - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - ) - files_args.add_argument( - "--save_dir", - type=str, - metavar="DIR", - default="./ckpt/", - help="Directory where model 
checkpoints will be saved", - ) - files_args.add_argument( - "--checkpoint_dir", - type=str, - metavar="DIR", - default=None, - help="Directory from which to load model checkpoints" - "(walks directory and ensembles all models that are found)", - ) - files_args.add_argument( - "--checkpoint_path", - type=str, - metavar="FILE", - default=None, - help="Path to model checkpoint (.pt file)", - ) - files_args.add_argument( - "--checkpoint_paths", - type=str, - metavar="FILE", - nargs="*", - default=None, - help="Path to model checkpoint (.pt file)", - ) - files_args.add_argument( - "--separate_val_path", - type=str, - metavar="FILE", - help="Path to separate val set, optional", - ) - files_args.add_argument( - "--separate_val_features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to file with features for separate val set", - ) - files_args.add_argument( - "--separate_test_path", - type=str, - metavar="FILE", - help="Path to separate test set, optional", - ) - files_args.add_argument( - "--separate_test_features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to file with features for separate test set", - ) - files_args.add_argument( - "--folds_file", - type=str, - metavar="FILE", - default=None, - help="Optional file of fold labels", - ) - files_args.add_argument( - "--val_fold_index", - type=int, - metavar="INT", - default=None, - help="Which fold to use as val for cross val", - ) - files_args.add_argument( - "--test_fold_index", - type=int, - metavar="INT", - default=None, - help="Which fold to use as test for cross val", - ) - files_args.add_argument( - "--crossval_index_dir", - type=str, - metavar="DIR", - help="Directory in which to find cross validation index files", - ) - files_args.add_argument( - "--crossval_index_file", - type=str, - metavar="FILE", - help="Indices of files to use as train/val/test" - "Overrides --num_folds and --seed.", - ) - files_args.add_argument( - "--data_weights_path", - type=str, - metavar="FILE", - 
help="Path where the data weight are saved", - ) - files_args.add_argument( - "--features_path", - type=str, - metavar="FILE", - nargs="*", - help="Path to features to use in FNN (instead of features_generator)", - ) - - files_args.add_argument( - "--separate_val_phase_features_path", type=str, metavar="FILE" - ) - files_args.add_argument( - "--separate_test_phase_features_path", type=str, metavar="FILE" - ) - - files_args.add_argument( - "--separate_val_atom_descriptors_path", type=str, metavar="FILE" - ) - files_args.add_argument( - "--separate_test_atom_descriptors_path", type=str, metavar="FILE" - ) - # Data related arguments - data_args.add_argument( - "--data_path", - type=str, - metavar="FILE", - help="Path to data CSV file", - default="", - ) - data_args.add_argument( - "--use_compound_names", - action="store_true", - default=False, - help="Use when test data file contains compound names in addition to SMILES strings", - ) - data_args.add_argument( - "--max_data_size", - type=int, - metavar="INT", - help="Maximum number of data points to load", - ) - - data_args.add_argument( - "--features_only", - action="store_true", - default=False, - help="Use only the additional features in an FFN, no graph network", - ) - data_args.add_argument( - "--dataset_type", - type=str, - choices=["classification", "regression", "multiclass"], - help="Type of dataset, e.g. classification or regression." 
- "This determines the loss function used during training.", - default="regression", - ) # classification - data_args.add_argument( - "--multiclass_num_classes", - type=int, - metavar="INT", - default=3, - help="Number of classes when running multiclass classification", - ) - data_args.add_argument( - "--split_type", - type=str, - default="random", - choices=[ - "random", - "scaffold_balanced", - "predetermined", - "crossval", - "index_predetermined", - ], - help="Method of splitting the data into train/val/test", - ) - data_args.add_argument( - "--split_sizes", - type=float, - metavar="FLOAT", - nargs=3, - default=[0.8, 0.2, 0.0], - help="Split proportions for train/validation/test sets", - ) - - data_args.add_argument( - "--seed", - type=int, - default=0, - help="Random seed to use when splitting data into train/val/test sets." - "When `num_folds` > 1, the first fold uses this seed and all" - "subsequent folds add 1 to the seed.", - ) - data_args.add_argument( - "--smiles_columns", - type=str, - metavar="STRING", - help="Name of the smiles columns", - ) - - data_args.add_argument( - "--target_columns", - type=str, - metavar="STRING", - help="Name of the target columns", - ) - - data_args.add_argument( - "--ignore_columns", - type=str, - metavar="STRING", - help="Names of the columns to ignore", - ) - data_args.add_argument( - "--num_tasks", type=int, metavar="INT", help="NUmber of tasks" - ) - data_args.add_argument( - "--no_features_scaling", - action="store_true", - default=False, - help="Turn off scaling of features", - ) - data_args.add_argument( - "--features_scaling", - action="store_true", - default=False, - help="Turn on scaling of features", - ) - data_args.add_argument( - "--use_input_features", - type=str, - metavar="STRING", - help="Turn on scaling of features", - ) - - # Model arguments - model_args.add_argument( - "--ensemble_size", - type=int, - metavar="INT", - default=1, - help="Number of models in ensemble", - ) - model_args.add_argument( - 
"--hidden_size", - type=int, - metavar="INT", - default=300, - help="Dimensionality of hidden layers in MPN", - ) - model_args.add_argument( - "--bias", - action="store_true", - default=False, - help="Whether to add bias to linear layers", - ) - model_args.add_argument( - "--depth", - type=int, - metavar="INT", - default=3, - help="Number of message passing steps", - ) - model_args.add_argument( - "--dropout", - type=float, - metavar="FLOAT", - default=0.0, - help="Dropout probability", - ) - model_args.add_argument( - "--activation", - type=str, - default="ReLU", - choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], - help="Activation function", - ) - model_args.add_argument( - "--undirected", - action="store_true", - default=False, - help="Undirected edges (always sum the two relevant bond vectors)", - ) - model_args.add_argument( - "--ffn_hidden_size", - type=int, - metavar="INT", - default=2, - help="Hidden dim for higher-capacity FFN (defaults to hidden_size)", - ) - model_args.add_argument( - "--ffn_num_layers", - type=int, - metavar="INT", - default=2, - help="Number of layers in FFN after MPN encoding", - ) - model_args.add_argument( - "--atom_messages", - action="store_true", - default=False, - help="Use messages on atoms instead of messages on bonds", - ) - - model_args.add_argument( - "--num_lrs", - type=int, - metavar="INT", - default=2, - help="Number of layers in FFN after MPN encoding", - ) - model_args.add_argument("--checkpoint_frzn", type=str, metavar="STRING") - - # Model arguments - model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") - model_args.add_argument( - "--show_individual_scores", - action="store_true", - default=True, - help="Show all scores for individual targets, not just average, at the end", - ) - model_args.add_argument("--aggregation", choices=["mean", "sum", "norm"]) - model_args.add_argument("--aggregation_norm", type=int) - model_args.add_argument("--explicit_h", type=bool, metavar="BOOL") - 
model_args.add_argument("--adding_h", type=bool, metavar="BOOL") - # Training arguments - model_args.add_argument("--class_balance", type=bool, metavar="BOOL") - model_args.add_argument("--evidential_regularization", type=float, metavar="FLOAT") - model_args.add_argument( - "--overwrite_default_atom_features", type=bool, metavar="BOOL" - ) - model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") - model_args.add_argument( - "--overwrite_default_bond_features", type=bool, metavar="BOOL" - ) - model_args.add_argument("--frzn_ffn_layers", type=int, metavar="INT") - model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") - # Training arguments - training_args.add_argument( - "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" - ) - training_args.add_argument( - "--total_epochs", - type=int, - metavar="INT", - default=30, - help="Number of total epochs to run", - ) - training_args.add_argument( - "--batch_size", type=int, metavar="INT", default=50, help="Batch size" - ) - training_args.add_argument( - "--warmup_epochs", - type=int, - metavar="INT", - default=2, - help="Number of epochs during which learning rate increases linearly from" - "init_lr to max_lr. 
Afterwards, learning rate decreases exponentially" - "from max_lr to final_lr.", - ) - training_args.add_argument( - "--init_lr", - type=float, - metavar="FLOAT", - default=1e-4, - help="Initial learning rate", - ) - training_args.add_argument( - "--max_lr", - type=float, - metavar="FLOAT", - default=1e-3, - help="Maximum learning rate", - ) - training_args.add_argument( - "--final_lr", - type=float, - metavar="FLOAT", - default=1e-4, - help="Final learning rate", - ) - training_args.add_argument( - "--extra_metrics", - type=str, - metavar="STRING", - nargs="*", - help="Extra metrics to use", - ) - training_args.add_argument( - "--loss_function", - type=str, - choices=[ - "mse", - "bounded_mse", - "binary_cross_entropy", - "cross_entropy", - "mcc", - "sid", - "wasserstein", - "mve", - "evidential", - "dirichlet", - ], - ) - training_args.add_argument("--grad_clip", type=float) - training_args.add_argument( - "--metric", - type=str, - default=None, - choices=[ - "auc", - "prc-auc", - "rmse", - "mae", - "mse", - "r2", - "accuracy", - "cross_entropy", - ], - help="Metric to use during evaluation." - "Note: Does NOT affect loss function used during training" - "(loss is determined by the `dataset_type` argument)." 
- 'Note: Defaults to "auc" for classification and "rmse" for regression.', - ) - training_args.add_argument( - "--num_folds", - type=int, - metavar="INT", - default=1, - help="Number of folds when performing cross validation", - ) - - -def parsePredictGnn(parser: argparse.ArgumentParser) -> None: - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - uncertainty_args = parser.add_argument_group("Uncertainty Configuration") - - general_args.add_argument( - "--checkpoint_path", - type=str, - metavar="FILE", - help="Path to model checkpoint (.pt file)", - ) - # general_args.add_argument( - # "--no_features_scaling", - # action="store_true", - # help="Turn on scaling of features", - # ) - files_args.add_argument( - "-f", - "--configFile", - type=str, - metavar="FILE", - help="Path to a .json file containing arguments. Any arguments present in the config" - "file will override arguments specified via the command line or by the defaults.", - ) - files_args.add_argument( - "--test_path", - type=str, - help="Path to CSV file containing testing data for which predictions will be made.", - ) - files_args.add_argument( - "--preds_path", - type=str, - help="Predictions output file. CSV or PICKLE file where predictions will be saved.", - ) - files_args.add_argument( - "--calibration_path", - type=str, - help="Data file to be used for uncertainty calibration.", - ) - files_args.add_argument( - "--calibration_features_path", - type=str, - nargs="+", - help="Feature data file to be used with the uncertainty calibration dataset.", - ) - files_args.add_argument("--calibration_phase_features_path", type=str, help="") - files_args.add_argument( - "--calibration_atom_descriptors_path", - type=str, - help="Extra atom descriptors file.", - ) - files_args.add_argument( - "--calibration_bond_descriptors_path", - type=str, - help="Extra bond descriptors file. 
Path to the extra bond descriptors that will be used as bond features to featurize a given molecule.", - ) - - general_args.add_argument( - "--drop_extra_columns", - action="store_true", - help="Keep only SMILES and new prediction columns in the test data files.", - ) - - uncertainty_args.add_argument( - "--uncertainty_method", - type=str, - choices=[ - "mve", - "ensemble", - "evidential_epistemic", - "evidential_aleatoric", - "evidential_total", - "classification", - "dropout", - "spectra_roundrobin", - "dirichlet", - ], - help="The method of calculating uncertainty.", - ) - uncertainty_args.add_argument( - "--calibration_method", - type=str, - nargs="+", - choices=[ - "zscaling", - "tscaling", - "zelikman_interval", - "mve_weighting", - "platt", - "isotonic", - ], - help="Methods used for calibrating the uncertainty calculated with uncertainty method.", - ) - uncertainty_args.add_argument( - "--individual_ensemble_predictions", - action="store_true", - default=False, - help="Save individual ensemble predictions.", - ) - uncertainty_args.add_argument( - "--evaluation_methods", - type=str, - nargs="+", - help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available classification or multiclass metric.", - ) - uncertainty_args.add_argument( - "--evaluation_scores_path", - type=str, - help="Location to save the results of uncertainty evaluations.", - ) - uncertainty_args.add_argument( - "--uncertainty_dropout_p", - type=float, - default=0.1, - help="The probability to use for Monte Carlo dropout uncertainty estimation.", - ) - uncertainty_args.add_argument( - "--dropout_sampling_size", - type=int, - default=10, - help="The number of samples to use for Monte Carlo dropout uncertainty estimation. 
Distinct from the dropout used during training.", - ) - uncertainty_args.add_argument( - "--calibration_interval_percentile", - type=float, - default=95, - help="Percentile used in calibration methods. Must be in the range (1,100).", - ) - uncertainty_args.add_argument( - "--regression_calibrator_metric", - type=str, - choices=["stdev", "interval"], - help="Regression calibrator output metric. Regression calibrators can output either a stdev or an inverval.", - ) - - -def parseInterpretGnn(parser: argparse.ArgumentParser) -> None: - files_args = parser.add_argument_group("Files") - interpret_args = parser.add_argument_group("Interpretation Configuration") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for interpretation.", - ) - files_args.add_argument( - "--preds_path", - type=str, - metavar="FILE", - help="Path to CSV file where predictions will be saved", - default="", - ) - files_args.add_argument( - "--checkpoint_dir", - type=str, - metavar="DIR", - help="Directory from which to load model checkpoints" - "(walks directory and ensembles all models that are found)", - default="./ckpt", - ) - files_args.add_argument( - "--checkpoint_path", - type=str, - metavar="DIR", - help="Path to model checkpoint (.pt file)", - ) - files_args.add_argument( - "--data_path", - type=str, - metavar="FILE", - help="Path to CSV file containing testing data for which predictions will be made", - ) - interpret_args.add_argument( - "--max_atoms", - type=int, - metavar="INT", - help="Maximum number of atoms to use for interpretation", - ) - - interpret_args.add_argument( - "--min_atoms", - type=int, - metavar="INT", - help="Minimum number of atoms to use for interpretation", - ) - - interpret_args.add_argument( - "--prop_delta", - type=float, - metavar="FLOAT", - help="The minimum change in the property of interest that is considered significant", - ) - interpret_args.add_argument( - 
"--property_id", - type=int, - metavar="INT", - help="The index of the property of interest", - ) - # write the argument for rollouts - interpret_args.add_argument( - "--rollout", - type=int, - metavar="INT", - help="The number of rollouts to use for interpretation", - ) - - -def parseInputConvert(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - parser.add_argument( - "-f", - metavar="FILE", - type=str, - help="Input directory where your CSV/TSV files are stored.", - required=True, - default="", - ) diff --git a/dfpl/parse.py b/dfpl/parse.py new file mode 100644 index 00000000..0b479567 --- /dev/null +++ b/dfpl/parse.py @@ -0,0 +1,134 @@ +import argparse +import json +from argparse import Namespace +from pathlib import Path + +from dfpl.convert import parseInputConvert +from dfpl.interpretgnn import parseInterpretGnn +from dfpl.predictgnn import parsePredictGnn +from dfpl.traingnn import parseTrainGnn +from dfpl.train import parseInputTrain +from dfpl.predict import parseInputPredict + + +def parse_dfpl(*cli_args: str, **kwargs) -> Namespace: + """ + Main function that runs training/prediction defined by command line arguments + """ + parser = argparse.ArgumentParser(prog="deepFPlearn") + + # all subcommands might have common arguments + # -> use this parent parser to register them + common_args = argparse.ArgumentParser(add_help=False) + + subparsers = parser.add_subparsers( + dest="method", # this allows to check the name of the subparser that was invoked via the .method attribute + help="Sub programs of deepFPlearn") + + parseTrainGnn(subparsers.add_parser("traingnn", + help="Train new GNN models with your data", + parents=[common_args])) + parsePredictGnn(subparsers.add_parser("predictgnn", + help="Predict with your GNN models", + parents=[common_args])) + parseInterpretGnn(subparsers.add_parser("interpretgnn", + help="Interpret your GNN 
models", + parents=[common_args])) + parseInputTrain(subparsers.add_parser("train", + help="Train new models with your data", + parents=[common_args])) + parseInputPredict(subparsers.add_parser("predict", + help="Predict your data with existing models", + parents=[common_args])) + parseInputConvert(subparsers.add_parser("convert", + help="Convert known data files to pickle serialization files", + parents=[common_args])) + + if len(cli_args) == 0: + parser.error("Need at least one argument") + + # handle the --configFile argument with a separate parser + # and extract additional args from config file + config_arg_parser = argparse.ArgumentParser(add_help=False) + config_arg_parser.add_argument( + "-f", + "--configFile", + type=str, + metavar="FILE", + help="Path to a JSON file containing arguments. CLI arguments will override these.", + default=None, + ) + # extract AFTER the first CLI argument (name of the method / sub program) + cli_args = [cli_args[0], + *extract_config_args(list(cli_args)[1:], config_arg_parser), + *dict_to_cli_args(kwargs)] + + return parser.parse_args(cli_args) + + +def dict_to_cli_args(d: dict) -> list[str]: + """ + Takes a dict (e.g. parsed from a JSON file) + and converts it into a list of arguments + that argparse can interpret as CLI arguments. 
+ + Example: + { + "inputFile": "/home/hello world.txt", + "myBool": true, + "myInt": 123, + "myList": [1, "abc", 3] + } + becomes + [ + "--inputFile", "/home/hello world.txt", + "--myBool", + "--myInt", "123", + "--myList", "1", "abc", "3" + ] + """ + args = [] + for key, value in d.items(): + key = f"--{key}" + if isinstance(value, bool): + if value: + args.append(key) + elif isinstance(value, list): + args.append(key) + args.extend(map(str, value)) # simply append all list elements as strings + else: + # regular case + args.append(key) + args.append(str(value)) + return args + + +def extract_config_args(args: list[str], + config_arg_parser: argparse.ArgumentParser) -> list[str]: + """ + Looks for the "configFile" argument, + extracts the CLI arguments from the file and puts them in front. + This is done recursively. + + Args: + args: List of raw CLI arguments (might contain --configFile ...) + config_arg_parser: A parser that only has a single configFile argument registered. + + Returns: + A list of CLI arguments with all arguments from the config files extracted. + This list is guaranteed to no longer contain any --configFile arguments. 
+ """ + config_arg_result, remaining_args = config_arg_parser.parse_known_args(args) + if config_arg_result.configFile: + json_file = Path(config_arg_result.configFile) + if json_file.exists() and json_file.is_file(): + with json_file.open() as f: + d = json.load(f) + extracted_args = dict_to_cli_args(d) + + # insert extracted arguments BEFORE the other CLI arguments + # (give original CLI arguments priority over the extracted ones) + remaining_args[0:] = extract_config_args(extracted_args, config_arg_parser) # recursive extraction + else: + raise ValueError("Could not find JSON config file", config_arg_result.configFile) + return remaining_args diff --git a/dfpl/predict.py b/dfpl/predict.py new file mode 100644 index 00000000..1a521b71 --- /dev/null +++ b/dfpl/predict.py @@ -0,0 +1,183 @@ +from __future__ import annotations + +import argparse +import dataclasses +from argparse import Namespace +from dataclasses import dataclass + +from dfpl.train import TrainPredictCommonOptions, load_compression_options + + +@dataclass +class PredictOptions(TrainPredictCommonOptions): + """ + Dataclass for all options necessary for inferring the neural nets. + Corresponds to `dfpl predict`. + """ + outputFile: str + fnnModelDir: str + + +def parseInputPredict(parser_input_predict: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + input_predict_general_args = parser_input_predict.add_argument_group("General Configuration") + input_predict_files_args = parser_input_predict.add_argument_group("Files") + input_predict_files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." 
+ "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + required=True, + ) + input_predict_files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified with --outputFile.", + default="example/results_predict/", + ) + input_predict_files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output csv file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", # todo: This doesn't look like it will actually become the prefix of the input file name + ) + input_predict_general_args.add_argument( + "-t", + "--type", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + input_predict_general_args.add_argument( + "-k", + "--fpType", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file.", + default="topological", + ) + input_predict_files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The encoder dir where it is saved (if trainAE=True) or " + "it is loaded from (if trainAE=False). 
Provide a full path here.", + default="", + ) + input_predict_files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The encoder file where it is loaded from, to compress the fingerprints.", + default="", + ) + input_predict_files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from.", + default="example/results_train/AR_saved_model", + ) + input_predict_general_args.add_argument( + "-c", + "--compressFeatures", + action="store_true", + help="Compresses the fingerprints if encoder dir/file is provided", + default=False, + ) + input_predict_general_args.add_argument( + "--aeType", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) + + +def predict(args: Namespace) -> None: + """ + Run prediction given specific options + """ + import logging + import os + from os import path + + from keras.saving.save import load_model + + from dfpl import fingerprint as fp, autoencoder as ac, vae as vae + from dfpl.predictions import predict_values + + from dfpl.utils import makePathAbsolute, createDirectory, createLogger + + predict_opts = PredictOptions(**vars(args)) + opts = dataclasses.replace( + predict_opts, + inputFile=makePathAbsolute(predict_opts.inputFile), + outputDir=makePathAbsolute(predict_opts.outputDir), + outputFile=makePathAbsolute( + path.join(predict_opts.outputDir, predict_opts.outputFile) + ), + ecModelDir=makePathAbsolute(predict_opts.ecModelDir), + fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), + ) + createDirectory(opts.outputDir) + createLogger(path.join(opts.outputDir, "predict.log")) + logging.info( + f"The following arguments are received or filled with default values:\n{args}" + ) + + # import data from file and create DataFrame + if "tsv" in opts.inputFile: + df = fp.importDataFile( + opts.inputFile, import_function=fp.importDstoxTSV, 
fp_size=opts.fpSize + ) + else: + df = fp.importDataFile( + opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize + ) + + if opts.compressFeatures: + # load trained model for autoencoder + compression_options = load_compression_options() + if opts.aeType == "deterministic": + (autoencoder, encoder) = ac.define_ac_model(opts=compression_options) + if opts.aeType == "variational": + (autoencoder, encoder) = vae.define_vae_model(opts=compression_options) + # Load trained model for autoencoder + if opts.ecWeightsFile == "": + encoder = load_model(opts.ecModelDir) + else: + encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) + df = ac.compress_fingerprints(df, encoder) + + # Run predictions on the compressed fingerprints and store the results in a dataframe + df2 = predict_values(df=df, opts=opts) + + # Extract the column names from the dataframe, excluding the 'fp' and 'fpcompressed' columns + names_columns = [c for c in df2.columns if c not in ["fp", "fpcompressed"]] + + # Save the predicted values to a CSV file in the output directory + df2[names_columns].to_csv(path_or_buf=path.join(opts.outputDir, opts.outputFile)) + + # Log successful completion of prediction and the file path where the results were saved + logging.info( + f"Prediction successful. 
Results written to '{path.join(opts.outputDir, opts.outputFile)}'" + ) diff --git a/dfpl/predictgnn.py b/dfpl/predictgnn.py new file mode 100644 index 00000000..e11e3c3a --- /dev/null +++ b/dfpl/predictgnn.py @@ -0,0 +1,190 @@ +from __future__ import annotations + +import argparse +from argparse import Namespace +from typing import Optional, Literal, List + +import chemprop +from chemprop.args import PredictArgs + +from dfpl.utils import createLogger + + +class PredictGnnOptions(PredictArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + # configFile: str = "./example/predictgnn.json" + calibration_atom_descriptors_path: str = None + calibration_features_path: str = None + calibration_interval_percentile: float = 95 + calibration_method: Optional[ + Literal[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ] + ] = None + calibration_path: str = None + calibration_phase_features_path: str = None + drop_extra_columns: bool = False + dropout_sampling_size: int = 10 + evaluation_methods: List[str] = None + evaluation_scores_path: str = None + # no_features_scaling: bool = True + individual_ensemble_predictions: bool = False + preds_path: str = None + regression_calibrator_metric: Optional[Literal["stdev", "interval"]] = None + test_path: str = None + uncertainty_dropout_p: float = 0.1 + uncertainty_method: Optional[ + Literal[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + ] + ] = None + + +def parsePredictGnn(parser_predict_gnn: argparse.ArgumentParser) -> None: + predict_gnn_general_args = parser_predict_gnn.add_argument_group("General Configuration") + predict_gnn_files_args = parser_predict_gnn.add_argument_group("Files") + predict_gnn_uncertainty_args = parser_predict_gnn.add_argument_group("Uncertainty Configuration") + + predict_gnn_general_args.add_argument( + "--checkpoint_path", + type=str, + 
metavar="FILE", + help="Path to model checkpoint (.pt file)", + ) + predict_gnn_files_args.add_argument( + "--test_path", + type=str, + help="Path to CSV file for which predictions will be made.", + ) + predict_gnn_files_args.add_argument( + "--preds_path", + type=str, + help="Predictions output file. CSV or PICKLE file where predictions will be saved.", + ) + predict_gnn_files_args.add_argument( + "--calibration_path", + type=str, + help="Data file to be used for uncertainty calibration.", + ) + predict_gnn_files_args.add_argument( + "--calibration_features_path", + type=str, + nargs="+", + help="Feature data file to be used with the uncertainty calibration dataset.", + ) + predict_gnn_files_args.add_argument("--calibration_phase_features_path", type=str, help="") + predict_gnn_files_args.add_argument( + "--calibration_atom_descriptors_path", + type=str, + help="Extra atom descriptors file.", + ) + predict_gnn_files_args.add_argument( + "--calibration_bond_descriptors_path", + type=str, + help="Extra bond descriptors file. 
Path to the extra bond descriptors that will be used as bond features to " + "featurize a given molecule.", + ) + + predict_gnn_general_args.add_argument( + "--drop_extra_columns", + action="store_true", + help="Keep only SMILES and new prediction columns in the test data files.", + ) + + predict_gnn_uncertainty_args.add_argument( + "--uncertainty_method", + type=str, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "spectra_roundrobin", + "dirichlet", + ], + help="The method of calculating uncertainty.", + ) + predict_gnn_uncertainty_args.add_argument( + "--calibration_method", + type=str, + nargs="+", + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Methods used for calibrating the uncertainty.", + ) + predict_gnn_uncertainty_args.add_argument( + "--individual_ensemble_predictions", + action="store_true", + default=False, + help="Save individual ensemble predictions.", + ) + predict_gnn_uncertainty_args.add_argument( + "--evaluation_methods", + type=str, + nargs="+", + help="Methods used for evaluating the uncertainty performance. Only used if the test data provided includes " + "targets. Available methods are [nll, miscalibration_area, ence, spearman] or any available " + "classification or multiclass metric.", + ) + predict_gnn_uncertainty_args.add_argument( + "--evaluation_scores_path", + type=str, + help="Location to save the results of uncertainty evaluations.", + ) + predict_gnn_uncertainty_args.add_argument( + "--uncertainty_dropout_p", + type=float, + default=0.1, + help="The probability to use for Monte Carlo dropout uncertainty estimation.", + ) + predict_gnn_uncertainty_args.add_argument( + "--dropout_sampling_size", + type=int, + default=10, + help="The number of samples to use for Monte Carlo dropout uncertainty estimation. 
Distinct from the dropout " + "used during training.", + ) + predict_gnn_uncertainty_args.add_argument( + "--calibration_interval_percentile", + type=float, + default=95, + help="Percentile used in calibration methods. Must be in the range (1,100).", + ) + predict_gnn_uncertainty_args.add_argument( + "--regression_calibrator_metric", + type=str, + choices=["stdev", "interval"], + help="Regression calibrator output metric. Regression calibrators can output either a stdev or an inverval.", + ) + + +def predictdmpnn(args: Namespace) -> None: + """ + Predict the values using a trained D-MPNN model with the given options. + """ + createLogger("predictgnn.log") + opts = PredictGnnOptions(**vars(args)) + chemprop.train.make_predictions(args=opts) diff --git a/dfpl/predictions.py b/dfpl/predictions.py index 29e73142..6ed7b526 100644 --- a/dfpl/predictions.py +++ b/dfpl/predictions.py @@ -4,10 +4,12 @@ import pandas as pd import tensorflow.keras.models -from dfpl import options, settings +import dfpl.predict +import dfpl.train +from dfpl import settings -def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: +def predict_values(df: pd.DataFrame, opts: dfpl.predict.PredictOptions) -> pd.DataFrame: """ Predict a set of chemicals using a selected model. @@ -16,7 +18,7 @@ def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: :return: """ model = tensorflow.keras.models.load_model(opts.fnnModelDir, compile=False) - model.compile(loss=opts.lossFunction, optimizer=opts.optimizer) + model.compile(loss=opts.lossFunction, optimizer=opts.optimizer) # todo: Why are these parameters needed here? 
if opts.compressFeatures: sub_df = df[df["fpcompressed"].notnull()] x = np.array( @@ -25,7 +27,7 @@ def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: copy=settings.numpy_copy_values, ) logging.info(f"Compressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) + sub_df["predicted"] = pd.DataFrame(dfpl.predict.predict(x), columns=["predicted"]) return sub_df else: sub_df = df[df["fp"].notnull()] @@ -35,5 +37,5 @@ def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: copy=settings.numpy_copy_values, ) logging.info(f"Uncompressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) + sub_df["predicted"] = pd.DataFrame(dfpl.predict.predict(x), columns=["predicted"]) return sub_df diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 191690ba..76ab0022 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -29,15 +29,15 @@ ) from tensorflow.keras.models import Model, Sequential +from dfpl.train import TrainOptions from dfpl import callbacks as cb -from dfpl import options from dfpl import plot as pl from dfpl import settings from dfpl.utils import ae_scaffold_split, weight_split def prepare_nn_training_data( - df: pd.DataFrame, target: str, opts: options.Options, return_dataframe: bool = False + df: pd.DataFrame, target: str, opts: TrainOptions, return_dataframe: bool = False ) -> Union[Tuple[np.ndarray, np.ndarray], pd.DataFrame]: # Check the value counts and abort if too imbalanced allowed_imbalance = 0.1 @@ -180,7 +180,7 @@ def prepare_nn_training_data( # This function defines a feedforward neural network (FNN) with the given input size, options, and output bias def build_fnn_network( - input_size: int, opts: options.Options, output_bias=None + input_size: int, opts: TrainOptions, output_bias=None ) -> Model: # Set the 
output bias if it is provided if output_bias is not None: @@ -256,7 +256,7 @@ def build_fnn_network( # This function defines a shallow neural network (SNN) with the given input size, options, and output bias def build_snn_network( - input_size: int, opts: options.Options, output_bias=None + input_size: int, opts: TrainOptions, output_bias=None ) -> Model: # Set the output bias if it is provided if output_bias is not None: @@ -310,14 +310,14 @@ def balanced_accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> np.float64: def define_single_label_model( - input_size: int, opts: options.Options, output_bias=None + input_size: int, opts: TrainOptions, output_bias=None ) -> Model: """ Defines and compiles the single-label neural network model. Args: input_size (int): The size of the input layer. - opts (options.Options): The options used in the model. + opts (dfpl.train.TrainOptions): The options used in the model. output_bias (float): The initial bias for the last sigmoid layer of the model. Returns: @@ -333,17 +333,12 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - lr_schedule = optimizers.schedules.ExponentialDecay( - opts.learningRate, - decay_steps=1000, - decay_rate=opts.learningRateDecay, - staircase=True, - ) + # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) + my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) elif opts.optimizer == "SGD": - my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) + my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -494,7 +489,7 @@ def fit_and_evaluate_model( y_test: np.ndarray, fold: int, target: str, - opts: options.Options, + opts: TrainOptions, ) -> pd.DataFrame: # 
Print info about training logging.info(f"Training of fold number: {fold}") @@ -566,7 +561,7 @@ def get_x_y( target: str, train_set: pd.DataFrame, test_set: pd.DataFrame, - opts: options.Options, + opts: TrainOptions, ): train_indices = train_set.index test_indices = test_set.index @@ -589,7 +584,7 @@ def get_x_y( return x_train, y_train, x_test, y_test -def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: +def train_single_label_models(df: pd.DataFrame, opts: TrainOptions) -> None: """ Train individual models for all targets (columns) present in the provided target data (y) and a multi-label model that classifies all targets at once. For each individual target the data is first subset to exclude NA @@ -601,7 +596,11 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] + targets = [ + c + for c in df.columns + if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] + ] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available diff --git a/dfpl/train.py b/dfpl/train.py new file mode 100644 index 00000000..353fa924 --- /dev/null +++ b/dfpl/train.py @@ -0,0 +1,491 @@ +from __future__ import annotations + +import argparse +import pathlib +from argparse import Namespace +from dataclasses import dataclass + +from dfpl.utils import makePathAbsolute + + +@dataclass +class TrainPredictCommonOptions: + """ + Dataclass for all options needed both for training and inferring the neural nets. + Corresponds to `dfpl train` and `dfpl predict`. 
+ """ + inputFile: str + outputDir: str + ecWeightsFile: str + ecModelDir: str + type: str + fpType: str + compressFeatures: bool + aeType: str + fnnType: str + fpSize: int + + +@dataclass +class TrainOptions(TrainPredictCommonOptions): + """ + Dataclass for all options necessary for training the neural nets. + Corresponds to `dfpl train`. + """ + epochs: int + encFPSize: int + kFolds: int + testSize: float + enableMultiLabel: bool + verbose: int + trainAC: bool + trainFNN: bool + sampleFractionOnes: float + sampleDown: bool + split_type: str + aeSplitType: str + aeEpochs: int + aeBatchSize: int + aeLearningRate: float + aeLearningRateDecay: float + aeActivationFunction: str + batchSize: int + optimizer: str + learningRate: float + learningRateDecay: float + lossFunction: str + activationFunction: str + l2reg: float + dropout: float + threshold: float + visualizeLatent: bool + gpu: int + aeWabTracking: bool + wabTracking: bool + wabTarget: str + + +def parseInputTrain(parser_train: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + # Create argument groups + input_tain_general_args = parser_train.add_argument_group("Model Configuration") + input_tain_autoencoder_args = parser_train.add_argument_group("Autoencoder Configuration") + input_tain_training_args = parser_train.add_argument_group("Training Configuration") + input_tain_tracking_args = parser_train.add_argument_group("Tracking Configuration") + + input_tain_general_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for training in " + "comma separated CSV format.The first column should be smiles.", + required=True, + ) + input_tain_general_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output file name. 
Trained model and " + "respective stats will be returned in this directory.", + default="example/results_train/", # changes according to mode + ) + + # TODO CHECK WHAT IS TYPE DOING? + input_tain_general_args.add_argument( + "-t", + "--type", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + input_tain_general_args.add_argument( + "-thr", + "--threshold", + type=float, + metavar="FLOAT", + help="Threshold for binary classification.", + default=0.5, + ) + input_tain_general_args.add_argument( + "-gpu", + "--gpu", + metavar="INT", + type=int, + help="Select which gpu to use by index. If not available, leave empty", + default=None, + ) + input_tain_general_args.add_argument( + "--fpType", + type=str, + # todo: A previous comment in class "Options" listed an additional option "atompairs". + # Either add this option here or remove this comment. + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological", + ) + input_tain_general_args.add_argument( + "--fpSize", + type=int, + help="Length of the fingerprint that should be generated.", + default=2048, + ) + input_tain_general_args.add_argument( + "--compressFeatures", + action="store_true", + help="Compresses the fingerprints. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False, + ) + input_tain_general_args.add_argument( + "--enableMultiLabel", + action="store_true", + help="Train multi-label classification model. 
individual models.", + default=False, + ) + # Autoencoder Configuration + input_tain_autoencoder_args.add_argument( + "-a", + "--ecWeightsFile", + type=str, + metavar="FILE", + help="The .hdf5 file of a trained encoder", + default="", + ) + input_tain_autoencoder_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full encoder will be saved", + default="example/results_train/AE_encoder/", + ) + input_tain_autoencoder_args.add_argument( + "--aeType", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) + input_tain_autoencoder_args.add_argument( + "--aeEpochs", + metavar="INT", + type=int, + help="Number of epochs for autoencoder training.", + default=100, + ) + input_tain_autoencoder_args.add_argument( + "--aeBatchSize", + metavar="INT", + type=int, + help="Batch size in autoencoder training.", + default=512, + ) + input_tain_autoencoder_args.add_argument( + "--aeActivationFunction", + type=str, + choices=["relu", "selu"], + help="The activation function of the autoencoder.", + default="relu", + ) + input_tain_autoencoder_args.add_argument( + "--aeLearningRate", + metavar="FLOAT", + type=float, + help="Learning rate for autoencoder training.", + default=0.001, + ) + input_tain_autoencoder_args.add_argument( + "--aeLearningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate decay for autoencoder training.", + default=0.96, + ) + input_tain_autoencoder_args.add_argument( + "--aeSplitType", + type=str, + choices=["scaffold_balanced", "random", "molecular_weight"], + help="Set how the data is split for the autoencoder", + default="random", + ) + input_tain_autoencoder_args.add_argument( + "-d", + "--encFPSize", + metavar="INT", + type=int, + help="Size of encoded fingerprint (z-layer of autoencoder).", + default=256, + ) + # only if autoencoder is trained or loaded + input_tain_autoencoder_args.add_argument( + 
"--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, + ) + # Training Configuration + input_tain_training_args.add_argument( + "--split_type", + type=str, + choices=["scaffold_balanced", "random", "molecular_weight"], + help="Set how the data is split for the feedforward neural network", + default="random", + ) + input_tain_training_args.add_argument( + "--testSize", + metavar="FLOAT", + type=float, + help="Fraction[0,1] of the dataset that should be used for testing", + default=0.2, + ) + input_tain_autoencoder_args.add_argument( + "--fnnType", + type=str, + choices=["FNN", "SNN"], + help="The type of the feedforward neural network.", + default="FNN", + ) + input_tain_training_args.add_argument( + "-K", + "--kFolds", + metavar="INT", + type=int, + help="Number of folds for cross-validation.", + default=1, + ) + input_tain_training_args.add_argument( + "-v", + "--verbose", + type=int, + choices=[0, 1, 2], + help="Verbosity level. O: No additional output, " + + "1: Some additional output, 2: full additional output", + default=2, + ) + input_tain_training_args.add_argument( + "--trainAC", + action="store_true", + help="Trains the autoencoder.", + default=False, + ) + input_tain_training_args.add_argument( + "--trainFNN", + action="store_false", + # todo: This argument is confusing. + # Users would expect this flag to be called "--no-trainFNN" or something similar. + # Proposal: Rename the flag to "--no-trainFNN", + # but use the parameter dest="trainFNN", so that it + # still appears as "trainFNN" attribute in the resulting arg Namespace + # (set to False, if --no-trainFNN is provided). 
+ help="Deactivates the FNN training.", + default=True, + ) + input_tain_training_args.add_argument( + "--sampleFractionOnes", + metavar="FLOAT", + type=float, + help="This is the desired fraction 1s/0s.only works if --sampleDown is enabled", + default=0.5, + ) + input_tain_training_args.add_argument( + "--sampleDown", + metavar="BOOL", + type=bool, + help="Down sampling of the 0 valued samples.", + default=False, + ) + input_tain_training_args.add_argument( + "-e", + "--epochs", + metavar="INT", + type=int, + help="Number of epochs for the FNN training", + default=100, + ) + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE + input_tain_training_args.add_argument( + "--lossFunction", + type=str, + choices=["mse", "bce", "focal"], + help="Loss function for FNN training. mse - mean squared error, bce - binary cross entropy.", + default="bce", + ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? + input_tain_training_args.add_argument( + "--optimizer", + type=str, + choices=["Adam", "SGD"], + help="Optimizer of the FNN.", + default="Adam", + ) + input_tain_training_args.add_argument( + "--batchSize", + metavar="INT", + type=int, + help="Batch size in FNN training.", + default=128, + ) + input_tain_training_args.add_argument( + "--l2reg", + metavar="FLOAT", + type=float, + help="Value for l2 kernel regularizer.", + default=0.001, + ) + input_tain_training_args.add_argument( + "--dropout", + metavar="FLOAT", + type=float, + help="The fraction of data that is dropped out in each dropout layer.", + default=0.2, + ) + input_tain_training_args.add_argument( + "--learningRate", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.000022, + ) + input_tain_training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, + ) + input_tain_training_args.add_argument( + "--activationFunction", + type=str, + 
choices=["relu", "selu"], + help="The activation function of the FNN.", + default="relu", + ) + # Tracking Configuration + # Wand & Biases autoencoder tracking + input_tain_tracking_args.add_argument( + "--aeWabTracking", + metavar="BOOL", + type=bool, + help="Track autoencoder performance via Weights & Biases.", + default=False, + ) + # Wand & Biases FNN tracking + input_tain_tracking_args.add_argument( + "--wabTracking", + metavar="BOOL", + type=bool, + help="Track FNN performance via Weights & Biases", + default=False, + ) + # Wand & Biases target used for showing training progress + input_tain_tracking_args.add_argument( + "--wabTarget", + metavar="STRING", + type=str, + help="Which endpoint to use for tracking performance via Weights & Biases. Should match the column name.", + default=None, + ) + + +def load_compression_options() -> TrainOptions: + """ + This is a utility function that is needed both by `train` and `predict`. + It loads options from a JSON file + that are used to instantiate the autoencoder. 
+ """ + from dfpl.parse import parse_dfpl + + project_directory = pathlib.Path(__file__).parent.absolute() + args = parse_dfpl("train", + configFile=makePathAbsolute(f"{project_directory}/compression.json")) + return TrainOptions(**vars(args)) + + +def train(args: Namespace): + """ + Run the main training procedure + """ + import dataclasses + import logging + import os + + from os import path + + from keras.saving.save import load_model + + from dfpl import fingerprint as fp, autoencoder as ac, vae as vae, single_label_model as sl, feedforwardNN as fNN + from dfpl.utils import makePathAbsolute, createDirectory, createLogger + + train_opts = TrainOptions(**vars(args)) + opts = dataclasses.replace( + train_opts, + inputFile=makePathAbsolute(train_opts.inputFile), + outputDir=makePathAbsolute(train_opts.outputDir), + ) + createDirectory(opts.outputDir) + createLogger(path.join(opts.outputDir, "train.log")) + logging.info( + f"The following arguments are received or filled with default values:\n{opts}" + ) + # import data from file and create DataFrame + if "tsv" in opts.inputFile: + df = fp.importDataFile( + opts.inputFile, import_function=fp.importDstoxTSV, fp_size=opts.fpSize + ) + else: + df = fp.importDataFile( + opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize + ) + # initialize (auto)encoders to None + encoder = None + autoencoder = None + if opts.trainAC: + if opts.aeType == "deterministic": + encoder, train_indices, test_indices = ac.train_full_ac(df, opts) + elif opts.aeType == "variational": + encoder, train_indices, test_indices = vae.train_full_vae(df, opts) + else: + raise ValueError(f"Unknown autoencoder type: {opts.aeType}") + + # if feature compression is enabled + if opts.compressFeatures: + if not opts.trainAC: + # load default options for autoencoder from config file + compression_options = load_compression_options() + if opts.aeType == "variational": + (autoencoder, encoder) = vae.define_vae_model(opts=compression_options) + 
else: + (autoencoder, encoder) = ac.define_ac_model(opts=compression_options) + + if opts.ecWeightsFile == "": + encoder = load_model(opts.ecModelDir) + else: + autoencoder.load_weights( + os.path.join(opts.ecModelDir, opts.ecWeightsFile) + ) + # compress the fingerprints using the autoencoder + df = ac.compress_fingerprints(df, encoder) + if opts.visualizeLatent and opts.trainAC: + ac.visualize_fingerprints( + df, + train_indices=train_indices, + test_indices=test_indices, + save_as=f"{opts.ecModelDir}/UMAP_{opts.aeSplitType}.png", + ) + elif opts.visualizeLatent: + logging.info( + "Visualizing latent space is only available if you train the autoencoder. Skipping visualization." + ) + + # train single label models if requested + if opts.trainFNN and not opts.enableMultiLabel: + sl.train_single_label_models(df=df, opts=opts) + + # train multi-label models if requested + if opts.trainFNN and opts.enableMultiLabel: + fNN.train_nn_models_multi(df=df, opts=opts) diff --git a/dfpl/traingnn.py b/dfpl/traingnn.py new file mode 100644 index 00000000..b346df99 --- /dev/null +++ b/dfpl/traingnn.py @@ -0,0 +1,627 @@ +from __future__ import annotations + +import argparse +import logging +from argparse import Namespace +from dataclasses import dataclass +from typing import List + +import chemprop +from chemprop.args import TrainArgs + +from dfpl.utils import createLogger + + +@dataclass +class GnnOptions(TrainArgs): + """ + Dataclass to hold all options used for training the graph models + """ + + total_epochs: int = 30 + save: bool = True + # configFile: str = "./example/traingnn.json" + data_path: str = "./example/data/tox21.csv" + use_compound_names: bool = False + save_dir: str = "" + no_cache: bool = False + features_scaling: bool = True + use_input_features: str = "" + cuda: bool = False + num_lrs: int = 2 + minimize_score: bool = False + num_tasks: int = 12 + preds_path: str = "./tox21dmpnn.csv" + test_path: str = "" + save_preds: bool = True + calibration_method: str = 
"none" + uncertainty_method: str = "none" + calibration_path: str = "" + evaluation_methods: str = "none" + evaluation_scores_path: str = "" + wabTracking: bool = False + split_sizes: List[float] = None + + # save_smiles_splits: bool = False + + +def parseTrainGnn(parser_train_gnn: argparse.ArgumentParser) -> None: + train_gnn_general_args = parser_train_gnn.add_argument_group("General Configuration") + train_gnn_data_args = parser_train_gnn.add_argument_group("Data Configuration") + train_gnn_files_args = parser_train_gnn.add_argument_group("Files") + train_gnn_model_args = parser_train_gnn.add_argument_group("Model arguments") + train_gnn_training_args = parser_train_gnn.add_argument_group("Training Configuration") + train_gnn_uncertainty_args = parser_train_gnn.add_argument_group("Uncertainty Configuration") + train_gnn_uncertainty_args.add_argument( + "--uncertainty_method", + type=str, + choices=[ + "mve", + "ensemble", + "evidential_epistemic", + "evidential_aleatoric", + "evidential_total", + "classification", + "dropout", + "dirichlet", + ], + help="Method to use for uncertainty estimation", + default="none", + ) + # Uncertainty arguments + train_gnn_uncertainty_args.add_argument( + "--calibration_method", + type=str, + choices=[ + "zscaling", + "tscaling", + "zelikman_interval", + "mve_weighting", + "platt", + "isotonic", + ], + help="Method to use for calibration", + default="none", + ) + train_gnn_uncertainty_args.add_argument( + "--calibration_path", + type=str, + metavar="FILE", + help="Path to file with calibration data", + ) + + # General arguments + train_gnn_general_args.add_argument( + "--split_key_molecule", + type=int, + help="The index of the key molecule used for splitting", + ) + train_gnn_general_args.add_argument("--pytorch_seed", type=int, help="Seed for pytorch") + train_gnn_general_args.add_argument( + "--cache_cutoff", + type=float, + help="Maximum number of molecules in dataset to allow caching.", + ) + 
train_gnn_general_args.add_argument( + "--save_preds", help="Saves test split predictions during training", type=bool + ) + train_gnn_general_args.add_argument("--wabTracking", action="store_true", default=False) + train_gnn_general_args.add_argument( + "--cuda", action="store_true", default=False, help="Turn on cuda" + ) + train_gnn_general_args.add_argument( + "--save_smiles_splits", + action="store_true", + default=False, + help="Save smiles for each train/val/test splits", + ) + train_gnn_general_args.add_argument( + "--test", + action="store_true", + default=False, + help="Whether to skip training and only test the model", + ) + train_gnn_general_args.add_argument( + "--gpu", + type=int, + # choices=list(range(torch.cuda.device_count())), + help="Which GPU to use", + ) + train_gnn_general_args.add_argument("--save", type=bool) + train_gnn_general_args.add_argument( + "--quiet", + action="store_true", + default=False, + help="Skip non-essential print statements", + ) + train_gnn_general_args.add_argument( + "--log_frequency", + type=int, + metavar="INT", + default=10, + help="The number of batches between each log", + ) + train_gnn_general_args.add_argument( + "--no_cache_mol", + action="store_true", + default=False, + help="If raised, Turn off caching rdkit mols", + ) + + # FILES ARGUMENTS + train_gnn_files_args.add_argument( + "--save_dir", + type=str, + metavar="DIR", + default="./ckpt/", + help="Directory where model checkpoints will be saved", + ) + train_gnn_files_args.add_argument( + "--checkpoint_dir", + type=str, + metavar="DIR", + default=None, + help="Directory from which to load model checkpoints" + "(walks directory and ensembles all models that are found)", + ) + train_gnn_files_args.add_argument( + "--checkpoint_path", + type=str, + metavar="FILE", + default=None, + help="Path to model checkpoint (.pt file)", + ) + train_gnn_files_args.add_argument( + "--checkpoint_paths", + type=str, + metavar="FILE", + nargs="*", + default=None, + help="Path to 
model checkpoint (.pt file)", + ) + train_gnn_files_args.add_argument( + "--separate_val_path", + type=str, + metavar="FILE", + help="Path to separate val set, optional", + ) + train_gnn_files_args.add_argument( + "--separate_val_features_path", + type=str, + metavar="FILE", + nargs="*", + help="Path to file with features for separate val set", + ) + train_gnn_files_args.add_argument( + "--separate_test_path", + type=str, + metavar="FILE", + help="Path to separate test set, optional", + ) + train_gnn_files_args.add_argument( + "--separate_test_features_path", + type=str, + metavar="FILE", + nargs="*", + help="Path to file with features for separate test set", + ) + train_gnn_files_args.add_argument( + "--folds_file", + type=str, + metavar="FILE", + default=None, + help="Optional file of fold labels", + ) + train_gnn_files_args.add_argument( + "--val_fold_index", + type=int, + metavar="INT", + default=None, + help="Which fold to use as val for cross val", + ) + train_gnn_files_args.add_argument( + "--test_fold_index", + type=int, + metavar="INT", + default=None, + help="Which fold to use as test for cross val", + ) + train_gnn_files_args.add_argument( + "--crossval_index_dir", + type=str, + metavar="DIR", + help="Directory in which to find cross validation index files", + ) + train_gnn_files_args.add_argument( + "--crossval_index_file", + type=str, + metavar="FILE", + help="Indices of files to use as train/val/test" + "Overrides --num_folds and --seed.", + ) + train_gnn_files_args.add_argument( + "--data_weights_path", + type=str, + metavar="FILE", + help="Path where the data weight are saved", + ) + train_gnn_files_args.add_argument( + "--features_path", + type=str, + metavar="FILE", + nargs="*", + help="Path to features to use in FNN (instead of features_generator)", + ) + + train_gnn_files_args.add_argument( + "--separate_val_phase_features_path", type=str, metavar="FILE" + ) + train_gnn_files_args.add_argument( + "--separate_test_phase_features_path", type=str, 
metavar="FILE" + ) + + train_gnn_files_args.add_argument( + "--separate_val_atom_descriptors_path", type=str, metavar="FILE" + ) + train_gnn_files_args.add_argument( + "--separate_test_atom_descriptors_path", type=str, metavar="FILE" + ) + # Data related arguments + train_gnn_data_args.add_argument( + "--data_path", + type=str, + metavar="FILE", + help="Path to data CSV file", + default="", + ) + train_gnn_data_args.add_argument( + "--use_compound_names", + action="store_true", + default=False, + help="Use when test data file contains compound names in addition to SMILES strings", + ) + train_gnn_data_args.add_argument( + "--max_data_size", + type=int, + metavar="INT", + help="Maximum number of data points to load", + ) + + train_gnn_data_args.add_argument( + "--features_only", + action="store_true", + default=False, + help="Use only the additional features in an FFN, no graph network", + ) + train_gnn_data_args.add_argument( + "--dataset_type", + type=str, + choices=["classification", "regression", "multiclass"], + help="Type of dataset, e.g. classification or regression." + "This determines the loss function used during training.", + default="regression", + ) # classification + train_gnn_data_args.add_argument( + "--multiclass_num_classes", + type=int, + metavar="INT", + default=3, + help="Number of classes in multiclass classification", + ) + train_gnn_data_args.add_argument( + "--split_type", + type=str, + default="random", + choices=[ + "random", + "scaffold_balanced", + "predetermined", + "crossval", + "index_predetermined", + ], + help="Method of splitting the data into train/val/test", + ) + train_gnn_data_args.add_argument( + "--split_sizes", + type=float, + metavar="FLOAT", + nargs=3, + default=[0.8, 0.2, 0.0], + help="Split proportions for train/validation/test sets", + ) + + train_gnn_data_args.add_argument( + "--seed", + type=int, + default=0, + help="Random seed to use when splitting data into train/val/test sets." 
+ "When `num_folds` > 1, the first fold uses this seed and all" + "subsequent folds add 1 to the seed.", + ) + train_gnn_data_args.add_argument( + "--smiles_columns", + type=str, + metavar="STRING", + help="Name of the smiles columns", + ) + + train_gnn_data_args.add_argument( + "--target_columns", + type=str, + nargs="*", + metavar="STRING", + help="Name of the target columns", + ) + + train_gnn_data_args.add_argument( + "--ignore_columns", + type=str, + nargs="*", + metavar="STRING", + help="Names of the columns to ignore", + ) + train_gnn_data_args.add_argument( + "--num_tasks", type=int, metavar="INT", help="Number of tasks" + ) + train_gnn_data_args.add_argument( + "--no_features_scaling", + action="store_true", + default=False, + help="Turn off scaling of features", + ) + train_gnn_data_args.add_argument( + "--features_scaling", + action="store_true", + default=False, + help="Turn on scaling of features", + ) + train_gnn_data_args.add_argument( + "--use_input_features", + type=str, + metavar="STRING", + help="Turn on scaling of features", + ) + + # Model arguments + train_gnn_model_args.add_argument( + "--ensemble_size", + type=int, + metavar="INT", + default=1, + help="Number of models in ensemble", + ) + train_gnn_model_args.add_argument( + "--hidden_size", + type=int, + metavar="INT", + default=300, + help="Dimensionality of hidden layers in MPN", + ) + train_gnn_model_args.add_argument( + "--bias", + action="store_true", + default=False, + help="Whether to add bias to linear layers", + ) + train_gnn_model_args.add_argument( + "--depth", + type=int, + metavar="INT", + default=3, + help="Number of message passing steps", + ) + train_gnn_model_args.add_argument( + "--dropout", + type=float, + metavar="FLOAT", + default=0.0, + help="Dropout probability", + ) + train_gnn_model_args.add_argument( + "--activation", + type=str, + default="ReLU", + choices=["ReLU", "LeakyReLU", "PReLU", "tanh", "SELU", "ELU"], + help="Activation function", + ) + 
train_gnn_model_args.add_argument( + "--undirected", + action="store_true", + default=False, + help="Undirected edges (always sum the two relevant bond vectors)", + ) + train_gnn_model_args.add_argument( + "--ffn_hidden_size", + type=int, + metavar="INT", + default=2, + help="Hidden dim for higher-capacity FFN (defaults to hidden_size)", + ) + train_gnn_model_args.add_argument( + "--ffn_num_layers", + type=int, + metavar="INT", + default=2, + help="Number of layers in FFN after MPN encoding", + ) + train_gnn_model_args.add_argument( + "--atom_messages", + action="store_true", + default=False, + help="Use messages on atoms instead of messages on bonds", + ) + + train_gnn_model_args.add_argument( + "--num_lrs", + type=int, + metavar="INT", + default=2, + help="Number of layers in FFN after MPN encoding", + ) + train_gnn_model_args.add_argument( + "--checkpoint_frzn", type=str, metavar="STRING", help="Freeze the loaded model" + ) + # Model arguments + # model_args.add_argument("--mpn_shared", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument( + "--show_individual_scores", + action="store_true", + default=True, + help="Show all scores for individual targets, not just average, at the end", + ) + train_gnn_model_args.add_argument( + "--aggregation", + choices=["mean", "sum", "norm"], + help="Aggregation scheme for atomic vectors into molecular vectors", + ) + train_gnn_model_args.add_argument( + "--aggregation_norm", + type=int, + help="For norm aggregation, number by which to divide summed up atomic features", + ) + # model_args.add_argument("--explicit_h", type=bool, metavar="BOOL",help="A explicit hydrogen") + train_gnn_model_args.add_argument( + "--adding_h", type=bool, metavar="BOOL", help="Adding hydrogen" + ) + # Training arguments + train_gnn_model_args.add_argument( + "--class_balance", + type=bool, + metavar="BOOL", + help="Balances the classes across batches", + ) + train_gnn_model_args.add_argument( + "--evidential_regularization", + type=float, 
+ metavar="FLOAT", + help="Regularization parameter for evidential loss", + ) + train_gnn_model_args.add_argument( + "--overwrite_default_atom_features", + type=bool, + metavar="BOOL", + help="Overwrites default atom features instead of concatenating", + ) + train_gnn_model_args.add_argument("--no_atom_descriptor_scaling", type=bool, metavar="BOOL") + train_gnn_model_args.add_argument( + "--overwrite_default_bond_features", + type=bool, + metavar="BOOL", + help="Overwrites default bond features instead of concatenating", + ) + train_gnn_model_args.add_argument( + "--frzn_ffn_layers", + type=int, + metavar="INT", + help="Number of layers in FFN to freeze", + ) + # model_args.add_argument("--freeze_first_only", type=bool, metavar="BOOL") + # Training arguments + train_gnn_training_args.add_argument( + "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" + ) + train_gnn_training_args.add_argument( + "--total_epochs", + type=int, + metavar="INT", + default=30, + help="Number of total epochs to run", + ) + train_gnn_training_args.add_argument( + "--batch_size", type=int, metavar="INT", default=50, help="Batch size" + ) + train_gnn_training_args.add_argument( + "--warmup_epochs", + type=int, + metavar="INT", + default=2, + help="Number of epochs during which learning rate increases linearly from" + "init_lr to max_lr. 
Afterwards, learning rate decreases exponentially" + "from max_lr to final_lr.", + ) + train_gnn_training_args.add_argument( + "--init_lr", + type=float, + metavar="FLOAT", + default=1e-4, + help="Initial learning rate", + ) + train_gnn_training_args.add_argument( + "--max_lr", + type=float, + metavar="FLOAT", + default=1e-3, + help="Maximum learning rate", + ) + train_gnn_training_args.add_argument( + "--final_lr", + type=float, + metavar="FLOAT", + default=1e-4, + help="Final learning rate", + ) + train_gnn_training_args.add_argument( + "--extra_metrics", + type=str, + metavar="STRING", + nargs="*", + help="Extra metrics to use", + ) + train_gnn_training_args.add_argument( + "--loss_function", + type=str, + choices=[ + "mse", + "bounded_mse", + "binary_cross_entropy", + "cross_entropy", + "mcc", + "sid", + "wasserstein", + "mve", + "evidential", + "dirichlet", + ], + ) + train_gnn_training_args.add_argument( + "--grad_clip", type=float, metavar="FLOAT", help="Gradient clipping value" + ) + train_gnn_training_args.add_argument( + "--metric", + type=str, + default=None, + choices=[ + "auc", + "prc-auc", + "rmse", + "mae", + "mse", + "r2", + "accuracy", + "cross_entropy", + ], + help="Metric to use during evaluation." + "Note: Does NOT affect loss function used during training" + "(loss is determined by the `dataset_type` argument)." + 'Note: Defaults to "auc" for classification and "rmse" for regression.', + ) + train_gnn_training_args.add_argument( + "--num_folds", + type=int, + metavar="INT", + default=1, + help="Number of folds when performing cross validation", + ) + + +def traindmpnn(args: Namespace) -> None: + """ + Train a D-MPNN model using the given options. 
+ """ + createLogger("traingnn.log") + opts = GnnOptions(**vars(args)) + logging.info("Training DMPNN...") + mean_score, std_score = chemprop.train.cross_validate( + args=opts, train_func=chemprop.train.run_training + ) + logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") diff --git a/dfpl/utils.py b/dfpl/utils.py index 58ba0965..7dc8573c 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,4 +1,4 @@ -import argparse +import json import json import logging import os @@ -6,11 +6,9 @@ import sys import warnings from collections import defaultdict -from pathlib import Path from random import Random -from typing import Dict, List, Set, Tuple, Type, TypeVar, Union +from typing import Dict, List, Set, Tuple, TypeVar, Union -import jsonpickle import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -25,43 +23,6 @@ T = TypeVar("T") -def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: - """ - Parses command-line arguments to create an instance of the given class. - - Args: - cls: The class to create an instance of. - args: argparse.Namespace containing the command-line arguments. - - Returns: - An instance of cls populated with values from the command-line arguments. 
- """ - # Extract argument flags from sys.argv - arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} - - # Create the result instance, which will be modified and returned - result = cls() - - # Load JSON file if specified - if hasattr(args, "configFile") and args.configFile: - jsonFile = Path(args.configFile) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = jsonpickle.decode(f.read()) - for key, value in vars(content).items(): - setattr(result, key, value) - else: - raise ValueError(f"Could not find JSON input file {jsonFile}") - - # Override with user-provided command-line arguments - for key in arg_flags: - if hasattr(args, key): - user_value = getattr(args, key, None) - setattr(result, key, user_value) - - return result - - def makePathAbsolute(p: str) -> str: path = pathlib.Path(p) if path.is_absolute(): @@ -425,3 +386,28 @@ def log_scaffold_stats( ) logging.info("\n") return stats + + +def createLogger(filename: str) -> None: + """ + Set up a logger for the main function that also saves to a log file + """ + # get root logger and set its level + logger = logging.getLogger() + logger.setLevel(logging.INFO) + # create file handler which logs info messages + fh = logging.FileHandler(filename, mode="w") + fh.setLevel(logging.INFO) + # create console handler + ch = logging.StreamHandler() + ch.setLevel(logging.INFO) + # create formatter and add it to the handlers + formatterFile = logging.Formatter( + "{asctime} - {name} - {levelname} - {message}", style="{" + ) + formatterConsole = logging.Formatter("{levelname} {message}", style="{") + fh.setFormatter(formatterFile) + ch.setFormatter(formatterConsole) + # add the handlers to the logger + logger.addHandler(fh) + logger.addHandler(ch) diff --git a/dfpl/vae.py b/dfpl/vae.py index 45cfda7a..d9df6f2b 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -14,15 +14,16 @@ from tensorflow.keras.models import Model from tensorflow.python.framework.ops import 
disable_eager_execution +from dfpl.train import TrainOptions from dfpl import callbacks from dfpl import history as ht -from dfpl import options, settings +from dfpl import settings from dfpl.utils import ae_scaffold_split, weight_split disable_eager_execution() -def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def define_vae_model(opts: TrainOptions, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize encoding_dim = ( opts.encFPSize @@ -154,12 +155,12 @@ def vae_loss(y_true, y_pred): return autoencoder, encoder -def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: +def train_full_vae(df: pd.DataFrame, opts: TrainOptions) -> Model: """ Trains an autoencoder on the given feature matrix X. The response matrix is only used to split the data into meaningful test and train sets. - :param opts: Command line arguments as defined in options.py + :param opts: Command line arguments :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder :return: The encoder model of the trained autoencoder """ diff --git a/environment.yml b/environment.yml index 164db6bc..bdee2fe6 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,6 @@ channels: - defaults dependencies: # application requirements - - jsonpickle=2.1 - matplotlib=3.5.1 - numpy=1.22.0 - pandas=1.4.2 diff --git a/example/predict.json b/example/predict.json index d96ad803..c3c7a335 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,5 +1,4 @@ { - "py/object": "dfpl.options.Options", "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", diff --git a/example/predictgnn.json b/example/predictgnn.json index dfdd6a8d..5137a6f7 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,5 +1,4 @@ { - "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", "preds_path": "preds_dmpnn/preds.csv", "checkpoint_path": 
"dmpnn-random/fold_0/model_0/model.pt" diff --git a/example/train.json b/example/train.json index bf57e7e2..11361d46 100755 --- a/example/train.json +++ b/example/train.json @@ -1,5 +1,4 @@ { - "py/object": "dfpl.options.Options", "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/", @@ -14,7 +13,6 @@ "aeSplitType": "random", "aeEpochs": 2, "aeBatchSize": 351, - "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, "aeLearningRateDecay": 0.96, diff --git a/example/traingnn.json b/example/traingnn.json index 5536f700..9b87d9db 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -1,5 +1,4 @@ { - "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", "epochs": 4, diff --git a/setup.py b/setup.py index 42c22c0c..702d614b 100644 --- a/setup.py +++ b/setup.py @@ -20,7 +20,6 @@ # all packages need for the final usage # for additional packages during development, use requirements.txt install_requires=[ - "jsonpickle~=2.1.0", "matplotlib==3.5.1", "numpy==1.22.0", "pandas==1.4.2", diff --git a/singularity_container/environment.yaml b/singularity_container/environment.yaml index 55898aa2..ff5128c1 100644 --- a/singularity_container/environment.yaml +++ b/singularity_container/environment.yaml @@ -5,7 +5,6 @@ channels: dependencies: - conda-build=3.21.8 - conda=4.12.0 - - jsonpickle=2.1.0 - matplotlib=3.5.1 - numpy=1.19.5 - pandas=1.4.2 diff --git a/tests/run_autoencoder.py b/tests/run_autoencoder.py index d61fcfae..fc5e5624 100644 --- a/tests/run_autoencoder.py +++ b/tests/run_autoencoder.py @@ -3,11 +3,11 @@ import dfpl.autoencoder as ac import dfpl.fingerprint as fp -import dfpl.options as opt +from dfpl.train import TrainOptions import dfpl.utils as utils project_directory = pathlib.Path(__file__).parent.absolute() -test_train_args = opt.Options( +test_train_args = TrainOptions( 
inputFile=utils.makePathAbsolute(f"{project_directory}/data/S_dataset.csv"), ecModelDir=utils.makePathAbsolute(f"{project_directory}/data"), outputDir=utils.makePathAbsolute(f"{project_directory}/output"), @@ -26,7 +26,7 @@ ) -def runAutoencoder(opts: opt.Options) -> None: +def runAutoencoder(opts: TrainOptions) -> None: """ Run and test auto-encoder """ diff --git a/tests/run_fnntraining.py b/tests/run_fnntraining.py index 4146ad4a..0b445e0c 100644 --- a/tests/run_fnntraining.py +++ b/tests/run_fnntraining.py @@ -3,12 +3,12 @@ import dfpl.autoencoder as ac import dfpl.fingerprint as fp -import dfpl.options as opt import dfpl.single_label_model as fNN import dfpl.utils as utils +from dfpl.train import TrainOptions project_directory = pathlib.Path(__file__).parent.absolute() -test_train_args = opt.Options( +test_train_args = TrainOptions( inputFile=utils.makePathAbsolute(f"{project_directory}/data/S_dataset.csv"), ecModelDir=utils.makePathAbsolute( f"{project_directory}/output/fnnTrainingCompressed/" @@ -29,7 +29,7 @@ ) -def run_single_label_training(opts: opt.Options) -> None: +def run_single_label_training(opts: TrainOptions) -> None: logging.basicConfig( format="DFPL-{levelname}: {message}", style="{", level=logging.INFO ) diff --git a/tests/run_predictgnn.py b/tests/run_predictgnn.py index 979c2868..b54b0b3b 100644 --- a/tests/run_predictgnn.py +++ b/tests/run_predictgnn.py @@ -5,7 +5,7 @@ import pandas as pd from chemprop import args, train -import dfpl.options as opt +from dfpl.traingnn import GnnOptions import dfpl.utils as utils project_directory = pathlib.Path(__file__).parent.absolute() @@ -13,13 +13,13 @@ project_directory.parent / "example" ) # Path to the example directory -test_predict_args = opt.GnnOptions( +test_predict_args = GnnOptions( configFile=utils.makePathAbsolute(f"{example_directory}/predictgnn.json"), save_dir=utils.makePathAbsolute(f"{project_directory}/output"), ) -def test_predictdmpnn(opts: opt.GnnOptions) -> None: +def 
test_predictdmpnn(opts: GnnOptions) -> None: print("Running predictdmpnn test...") logging.basicConfig( format="DFPL-{levelname}: {message}", style="{", level=logging.INFO diff --git a/tests/run_prediction.py b/tests/run_prediction.py index cb7d1fea..3baa7295 100644 --- a/tests/run_prediction.py +++ b/tests/run_prediction.py @@ -4,12 +4,14 @@ import dfpl.autoencoder as ac import dfpl.fingerprint as fp -import dfpl.options as opt +import dfpl.convert as opt +import dfpl.predict +import dfpl.train import dfpl.predictions as p import dfpl.utils as utils project_directory = pathlib.Path(__file__).parent.absolute() -test_predict_args = opt.Options( +test_predict_args = dfpl.predict.PredictOptions( inputFile=f"{project_directory}/data/smiles.csv", outputDir=f"{project_directory}/preds/", ecModelDir=utils.makePathAbsolute(f"{project_directory}/output/"), @@ -23,7 +25,7 @@ ) -def test_predictions(opts: opt.Options): +def test_predictions(opts: dfpl.predict.PredictOptions): opts = test_predict_args logging.basicConfig( diff --git a/tests/run_traingnn.py b/tests/run_traingnn.py index 582d4627..c81f363e 100644 --- a/tests/run_traingnn.py +++ b/tests/run_traingnn.py @@ -2,19 +2,19 @@ import pathlib import dfpl.__main__ as main -import dfpl.options as opt import dfpl.utils as utils +from dfpl.traingnn import GnnOptions project_directory = pathlib.Path(__file__).parent.absolute() example_directory = project_directory.parent / "example" -test_train_args = opt.GnnOptions( +test_train_args = GnnOptions( configFile=utils.makePathAbsolute(f"{example_directory}/traingnn.json"), save_dir=utils.makePathAbsolute(f"{project_directory}/output"), total_epochs=1, ) -def test_traindmpnn(opts: opt.GnnOptions) -> None: +def test_traindmpnn(opts: GnnOptions) -> None: print("Running traindmpnn test...") logging.basicConfig( format="DFPL-{levelname}: {message}", style="{", level=logging.INFO diff --git a/tests/run_vae.py b/tests/run_vae.py index d6095d96..c809eebb 100644 --- a/tests/run_vae.py +++ 
b/tests/run_vae.py @@ -2,12 +2,12 @@ import pathlib import dfpl.fingerprint as fp -import dfpl.options as opt +from dfpl.train import TrainOptions import dfpl.utils as utils import dfpl.vae as vae project_directory = pathlib.Path(__file__).parent.absolute() -test_train_args = opt.Options( +test_train_args = TrainOptions( inputFile=utils.makePathAbsolute(f"{project_directory}/data/S_dataset.csv"), ecModelDir=utils.makePathAbsolute(f"{project_directory}/data"), outputDir=utils.makePathAbsolute(f"{project_directory}/output"), @@ -29,7 +29,7 @@ ) -def runVae(opts: opt.Options) -> None: +def runVae(opts: TrainOptions) -> None: """ Run and test auto-encoder """ diff --git a/tests/test_fractional_sampling.py b/tests/test_fractional_sampling.py index bf27f2e6..48550ec4 100644 --- a/tests/test_fractional_sampling.py +++ b/tests/test_fractional_sampling.py @@ -4,7 +4,7 @@ import numpy as np from dfpl import fingerprint as fp -from dfpl import options as opts +from dfpl import convert as opts from dfpl import single_label_model as fNN diff --git a/tests/try_fpcomparison.py b/tests/try_fpcomparison.py index 6e7b90ad..a2507371 100644 --- a/tests/try_fpcomparison.py +++ b/tests/try_fpcomparison.py @@ -4,8 +4,10 @@ from rdkit.Chem import Draw import dfpl.fingerprint as fp +import dfpl.predict +import dfpl.train from dfpl import autoencoder as ac -from dfpl import options as opt +from dfpl import convert as opt from dfpl import predictions # read both datasets @@ -80,7 +82,7 @@ img.show() project_directory = "" -opts = opt.PredictOptions( +opts = dfpl.predict.PredictOptions( inputFile=f"", outputDir=f"/home/hertelj/tmp/", model=f"/home/hertelj/git-hertelj/deepFPlearn_CODE/validation/case_03/results/ER_compressed-True_sampled-None.best.FNN.model.hdf5",