From e376b7eac6912de3773cf4589e5c7c1fa45c42d5 Mon Sep 17 00:00:00 2001 From: soulios Date: Mon, 1 Jan 2024 20:51:55 +0100 Subject: [PATCH 01/10] update options and saving for encoders --- dfpl/__main__.py | 115 ++-------- dfpl/autoencoder.py | 63 ++---- dfpl/callbacks.py | 33 ++- dfpl/feedforwardNN.py | 16 +- dfpl/options.py | 442 +++++++++++++++++-------------------- dfpl/single_label_model.py | 17 +- dfpl/utils.py | 73 +++++- dfpl/vae.py | 244 ++++++++------------ example/predict.json | 10 +- example/predictgnn.json | 5 +- example/train.json | 18 +- example/traingnn.json | 6 +- 12 files changed, 463 insertions(+), 579 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..aada91a3 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -17,43 +17,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. Args: @@ -61,54 +26,29 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) + arguments = createArgsFromJson(jsonFile = opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation mean_score, std_score = cp.train.cross_validate( args=opts, train_func=cp.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options + arguments = createArgsFromJson(jsonFile = opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + + cp.train.make_predictions(args=opts) def train(opts: options.Options): @@ -116,9 +56,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,7 +65,7 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + # initialize (auto)encoders to None encoder = None autoencoder = None if opts.trainAC: @@ -142,11 +79,12 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": + else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile == "": encoder = load_model(opts.ecModelDir) else: autoencoder.load_weights( @@ -154,14 +92,15 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent: + ac.visualize_fingerprints( + df, + before_col="fp", + after_col="fpcompressed", + train_indices=train_indices, + test_indices=test_indices, + save_as=f"UMAP_{opts.aeSplitType}.png", + ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -257,7 +196,7 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - + createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": @@ -267,12 +206,8 @@ def main(): test_path=makePathAbsolute(predictgnn_opts.test_path), preds_path=makePathAbsolute(predictgnn_opts.preds_path), ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + createLogger("predictgnn.log") + predictdmpnn(fixed_opts) elif prog_args.method == "train": train_opts = options.Options.fromCmdArgs(prog_args) @@ -298,8 +233,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..6909b156 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -8,12 +8,12 @@ import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model, load_model from dfpl import callbacks from dfpl import history as ht @@ -32,9 +32,13 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) @@ -104,7 +108,6 @@ def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mod )(decoded) # output layer - # to either 0 or 1 and hence we use sigmoid activation function. decoded = Dense( units=input_size, activation="sigmoid", bias_initializer=output_bias )(decoded) @@ -145,37 +148,8 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"AE_{opts.aeSplitType}") - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( @@ -286,30 +260,35 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) (autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) - + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) + # Re-define autoencoder and encoder using your function + callback_autoencoder = load_model(filepath=save_path) + _, callback_encoder = define_ac_model(opts) + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = os.path.join(save_path, "encoder_model") + callback_encoder.save(filepath=encoder_save_path) else: encoder.save(filepath=save_path) # Return the encoder model of the trained autoencoder diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 6eae7965..fc1f817c 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -22,28 +22,37 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - checkpoint = ModelCheckpoint( + if opts.aeType == "deterministic": + checkpoint = ModelCheckpoint( checkpoint_path, monitor=target, mode="min", verbose=1, - period=settings.ac_train_check_period, + save_freq="epoch", save_best_only=True, - save_weights_only=True, - ) + ) + else: + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_freq="epoch", + save_best_only=True, + save_weights_only=True + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore early_stop = EarlyStopping( - monitor=target, - mode="min", - patience=settings.ac_train_patience, - min_delta=settings.ac_train_min_delta, - verbose=1, - restore_best_weights=True, + monitor=target, + mode="min", + patience=settings.ac_train_patience, + min_delta=settings.ac_train_min_delta, + verbose=1, + restore_best_weights=True, ) callbacks.append(early_stop) - if opts.aeWabTracking and not opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -65,7 +74,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - period=settings.nn_train_check_period, + save_freq="epoch", save_best_only=True, monitor="val_loss", mode="min", diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..d1d657aa 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,12 @@ import argparse from dataclasses import dataclass from pathlib import Path - +from typing import Optional import jsonpickle import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import makePathAbsolute, parseCmdArgs @dataclass @@ -17,51 +17,51 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." - outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + configFile: str = None + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" + ecWeightsFile: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! + verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False #only if autoencoder is trained or loaded + gpu: int = None + aeWabTracking: bool = False # Wand & Biases autoencoder tracking + wabTracking: bool = False # Wand & Biases FNN tracking + wabTarget: str = "AR" # Wand & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +72,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -134,37 +100,19 @@ class GnnOptions(TrainArgs): save_preds: bool = True @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) - @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -225,7 +173,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +182,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv" ) general_args.add_argument( "-o", @@ -243,8 +191,10 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/" ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -252,7 +202,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp" ) general_args.add_argument( "-thr", @@ -260,47 +210,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5 ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological" ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048 ) general_args.add_argument( - "-c", "--compressFeatures", - metavar="BOOL", - type=bool, - help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - default=argparse.SUPPRESS, + action="store_true", + help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False ) general_args.add_argument( - "-m", "--enableMultiLabel", - metavar="BOOL", - type=bool, + action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, + default=False ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -309,14 +253,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="" ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, + default="example/results_train/AE_encoder/" ) autoencoder_args.add_argument( "--aeType", @@ -324,21 +268,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic" ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100 ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512 ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -346,21 +290,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, + default="relu" ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001 ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96 ) autoencoder_args.add_argument( "--aeSplitType", @@ -368,7 +312,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, + default="random" ) autoencoder_args.add_argument( "-d", @@ -376,7 +320,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256 + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False ) # Training Configuration training_args.add_argument( @@ -385,15 +335,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, + default="random" ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=argparse.SUPPRESS, + default=0.2 ) training_args.add_argument( "-K", @@ -401,7 +350,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, + default=1 ) training_args.add_argument( "-v", @@ -411,21 +360,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=True, ) training_args.add_argument( "--sampleFractionOnes", @@ -433,14 +380,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." "only works if --sampleDown is enabled", - default=argparse.SUPPRESS, + default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "-e", @@ -448,52 +395,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, + default=100, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, + default="bce", ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, + default="Adam", ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -501,7 +456,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, + default="relu", ) # Tracking Configuration tracking_args.add_argument( @@ -509,14 +464,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTarget", @@ -524,8 +479,108 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting." + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + (general_args.add_argument( + "--aeType", metavar="STRING", type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic")) def parseTrainGnn(parser: argparse.ArgumentParser) -> None: @@ -575,9 +630,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -1034,91 +1086,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--outputFile", - metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", - type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, - ) - - def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") data_args = parser.add_argument_group("Data Configuration") @@ -1139,9 +1106,6 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: choices=list(range(torch.cuda.device_count())), help="Which GPU to use", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off cuda" - ) general_args.add_argument( "--num_workers", type=int, diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..191690ba 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -333,12 +333,17 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -596,11 +601,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: """ # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..15fd018b 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -5,8 +5,14 @@ import warnings from collections import defaultdict from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Union, Type, TypeVar, Any +# Define a type variable + +from pathlib import Path +import argparse +import jsonpickle +import sys import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -15,7 +21,44 @@ from tqdm import tqdm RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. + """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip('-') for arg in sys.argv if arg.startswith('-')} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + return result def makePathAbsolute(p: str) -> str: path = pathlib.Path(p) @@ -31,20 +74,34 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + + with open(jsonFile, "r") as f: data = json.load(f) + + # Check each key in the JSON file against command-line arguments for key, value in data.items(): if key not in ignore_elements: + # Prepare the command-line argument format + cli_arg_key = f"--{key}" + + # Check if this argument is provided in the command line + if cli_arg_key in sys.argv: + # Find the index of the argument in sys.argv and get its value + arg_index = sys.argv.index(cli_arg_key) + 1 + if arg_index < len(sys.argv): + cli_value = sys.argv[arg_index] + value = cli_value # Override JSON value with command-line value + + # Append the argument and its value to the list if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") + arguments.append(cli_arg_key) arguments.extend(value) else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + return arguments diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..cc61b17d 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -13,7 +13,7 @@ from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model +from tensorflow.keras.models import Model,load_model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks @@ -26,114 +26,71 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + encoding_dim = opts.encFPSize # This should be the intended size of your latent space, e.g., 256 + + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) if output_bias is not None: output_bias = initializers.Constant(output_bias) - # get the number of meaningful hidden layers (latent space included) hidden_layer_count = round(math.log2(input_size / encoding_dim)) - # the input placeholder input_vec = Input(shape=(input_size,)) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! + # 1st hidden layer if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / 2), activation=opts.aeActivationFunction - )(input_vec) + encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction)(input_vec) else: - encoded = Dense( - units=int(input_size / 2), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(input_vec) - - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - # print(f'{factor_units}: {int(input_size / factor_units)}') - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # latent space layers - factor_units = 2 ** (hidden_layer_count - 1) + encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(input_vec) + + # encoding layers + for i in range(1, hidden_layer_count - 1): # Adjust the range to stop before the latent space layers + factor_units = 2 ** (i + 1) if opts.aeActivationFunction != "selu": - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) + encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(encoded) else: - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # sampling layer - def sampling(args): - z_mean, z_log_var = args - batch = K.shape(z_mean)[0] - dim = K.int_shape(z_mean)[1] - epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon - - # sample from latent space - z = Lambda(sampling, output_shape=(int(input_size / factor_units),))( - [z_mean, z_log_var] - ) - decoded = z - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - # print(f'{factor_units}: {int(input_size/factor_units)}') - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) + encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) + # latent space layers + if opts.aeActivationFunction != "selu": + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + + # sampling layer + def sampling(args): + z_mean, z_log_var = args + batch = K.shape(z_mean)[0] + dim = K.int_shape(z_mean)[1] + epsilon = K.random_normal(shape=(batch, dim)) + return z_mean + K.exp(0.5 * z_log_var) * epsilon + + z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) + decoded = z + + # decoding layers + for i in range(hidden_layer_count - 2, 0, -1): + factor_units = 2 ** i + if opts.aeActivationFunction != "selu": + decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(decoded) + else: + decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(decoded) + + # output layer + decoded = Dense(units=input_size, activation="sigmoid", bias_initializer=output_bias)(decoded) autoencoder = Model(input_vec, decoded) + encoder = Model(input_vec, z) + autoencoder.summary(print_fn=logging.info) + # KL divergence loss def kl_loss(z_mean, z_log_var): @@ -155,9 +112,6 @@ def vae_loss(y_true, y_pred): optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] ) - # build encoder model - encoder = Model(input_vec, z_mean) - return autoencoder, encoder @@ -175,39 +129,9 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: if opts.aeWabTracking and not opts.wabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - # Define output files for VAE and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No VAE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] - + opts.aeType - + opts.aeSplitType - ) - logging.info( - f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - + save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=vae_weights_file, opts=opts - ) + # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), @@ -219,17 +143,17 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: ) assert 0.0 <= opts.testSize <= 0.5 if opts.aeSplitType == "random": - logging.info("Training VAE using random split") - train_indices = np.arange(fp_matrix.shape[0]) + logging.info("Training autoencoder using random split") + initial_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: # Split data into test and training data if opts.aeWabTracking: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) else: x_train = fp_matrix @@ -255,6 +179,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() else: x_train = fp_matrix x_test = None @@ -262,7 +192,6 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: logging.info("Training autoencoder using molecular weight split") train_indices = np.arange(fp_matrix.shape[0]) if opts.testSize > 0.0: - # if opts.aeWabTracking: train_data, val_data, test_data = weight_split( df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" ) @@ -276,16 +205,21 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + # Get the sorted indices from the sorted DataFrame + sorted_indices = df_sorted.index.to_numpy() + + # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame + train_indices = sorted_indices[df.index.isin(train_data.index)] + # val_indices = sorted_indices[df.index.isin(val_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] else: x_train = fp_matrix x_test = None else: raise ValueError(f"Invalid split type: {opts.split_type}") - if opts.testSize > 0.0: - train_indices = train_indices[train_indices < x_train.shape[0]] - test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0]) - else: - test_indices = None + + # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints ids, counts = np.unique(x_train.flatten(), return_counts=True) count_dict = dict(zip(ids, counts)) if count_dict[0] == 0: @@ -304,34 +238,32 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) # Train the VAE on the training data + callback_list = callbacks.autoencoder_callback(checkpoint_path=f"{save_path}.h5", opts=opts) + vae_hist = vae.fit( x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, - callbacks=callback_list, + callbacks=[callback_list], validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) # Save the VAE weights - logging.info(f"VAE weights stored in file: {vae_weights_file}") ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"), + base_file_name=save_path, hist=vae_hist, ) - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5") - if opts.testSize > 0.0: - (callback_vae, callback_encoder) = define_vae_model(opts) - callback_vae.load_weights(filepath=vae_weights_file) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - latent_space = encoder.predict(fp_matrix) - latent_space_file = os.path.join( - opts.outputDir, base_file_name + ".latent_space.csv" - ) - with open(latent_space_file, "w", newline="") as file: - writer = csv.writer(file) - writer.writerows(latent_space) + # Re-define autoencoder and encoder using your function + callback_autoencoder, callback_encoder = define_vae_model(opts) + callback_autoencoder.load_weights(filepath=f"{save_path}.h5") + + for i, layer in enumerate(callback_encoder.layers): + layer.set_weights(callback_autoencoder.layers[i].get_weights()) + + # Save the encoder model + encoder_save_path = f"{save_path}_encoder.h5" + callback_encoder.save_weights(filepath=encoder_save_path) + return encoder, train_indices, test_indices diff --git a/example/predict.json b/example/predict.json index 252965e3..e3305c7c 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,12 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_autoencoder/", - "ecWeightsFile": "", - "fnnModelDir": "example/results_train/AR_saved_model", + "ecModelDir": "example/results_train/random_split_autoencoder", + "ecWeightsFile": "random_split_autoencoder_encoder.h5", + "fnnModelDir": "example/results_train/NR-AR-1_best_saved_model", + "aeType": "variational", "compressFeatures": true, - "trainAC": false, "trainFNN": false } diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..813cf0c5 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/smiles.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" + "preds_path": "example/results_gnn.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/train.json b/example/train.json index 62f2abb4..53575adc 100755 --- a/example/train.json +++ b/example/train.json @@ -1,22 +1,23 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/S_dataset.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_train/", - "ecModelDir": "example/results_train/", - "ecWeightsFile": "random_autoencoder.hdf5", + "ecModelDir": "example/results_train/random_split_autoencoder/", + "ecWeightsFile": "", "verbose": 2, - "trainAC": true, - "compressFeatures": true, + "trainAC": false, + "compressFeatures": false, + "visualizeLatent": false, "encFPSize": 256, "aeSplitType": "random", - "aeEpochs": 2, + "aeEpochs": 4, "aeBatchSize": 351, "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, - "aeLearningRateDecay": 0.0001, + "aeLearningRateDecay": 0.96, "aeType": "deterministic", "type": "smiles", @@ -29,7 +30,7 @@ "gpu": "", "trainFNN": true, - "kFolds": 1, + "kFolds": 2, "threshold": 0.5, "testSize": 0.2, "fnnType": "FNN", @@ -40,6 +41,7 @@ "activationFunction": "selu", "dropout": 0.0107, "learningRate": 0.0000022, + "learningRateDecay": 0.96, "l2reg": 0.001, "aeWabTracking": false, diff --git a/example/traingnn.json b/example/traingnn.json index 7a5a0712..1ca58a30 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -2,13 +2,13 @@ "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", - "epochs": 2, - "num_folds": 2, + "epochs": 4, + "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", "split_type": "random", "dataset_type": "classification", "smiles_columns": "smiles", - "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","specificity","precision"], + "extra_metrics": ["balanced_accuracy","auc","f1","mcc","recall","precision"], "hidden_size": 256 } \ No newline at end of file From fdc8efdef911ecbb28c0e239583811715af37b4e Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 30 Oct 2024 14:03:23 +0100 Subject: [PATCH 02/10] changed output format for galaxy --- dfpl/__main__.py | 30 ++-- dfpl/autoencoder.py | 338 ++++++++++++++----------------------- dfpl/callbacks.py | 36 ++-- dfpl/dfplmodule.py | 3 - dfpl/fingerprint.py | 23 +-- dfpl/history.py | 2 +- dfpl/options.py | 10 +- dfpl/plot.py | 220 ++++++++---------------- dfpl/predictions.py | 66 +++++--- dfpl/single_label_model.py | 253 +++++++++++---------------- dfpl/utils.py | 4 +- dfpl/vae.py | 266 ++++++++--------------------- example/predict.json | 9 +- example/train.json | 15 +- 14 files changed, 451 insertions(+), 824 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index aada91a3..3f2ecd8a 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,14 +1,10 @@ import dataclasses import logging import os.path -import pathlib from argparse import Namespace from os import path import chemprop as cp -import pandas as pd -from keras.models import load_model - from dfpl import autoencoder as ac from dfpl import feedforwardNN as fNN from dfpl import fingerprint as fp @@ -65,9 +61,9 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) + # initialize (auto)encoders to None encoder = None - autoencoder = None if opts.trainAC: if opts.aeType == "deterministic": encoder, train_indices, test_indices = ac.train_full_ac(df, opts) @@ -84,22 +80,19 @@ def train(opts: options.Options): else: (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) - else: + if opts.ecWeightsFile != "": autoencoder.load_weights( - os.path.join(opts.ecModelDir, opts.ecWeightsFile) + os.path.join(opts.outputDir, opts.ecWeightsFile) ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - if opts.visualizeLatent: + if opts.visualizeLatent: ## visualize latent space only if you train the autoencoder ac.visualize_fingerprints( df, - before_col="fp", - after_col="fpcompressed", + comressed_col="fpcompressed", train_indices=train_indices, test_indices=test_indices, - save_as=f"UMAP_{opts.aeSplitType}.png", + save_as=f"UMAP_{opts.aeType}.png", ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: @@ -129,12 +122,13 @@ def predict(opts: options.Options) -> None: # load trained model for autoencoder if opts.aeType == "deterministic": (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - if opts.aeType == "variational": + elif opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - # Load trained model for autoencoder - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) else: + raise ValueError(f"Unknown autoencoder type: {opts.aeType}") + + # Load trained model for autoencoder + if opts.ecWeightsFile != "": encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) df = ac.compress_fingerprints(df, encoder) @@ -196,7 +190,6 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - createLogger("traingnn.log") traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": @@ -206,7 +199,6 @@ def main(): test_path=makePathAbsolute(predictgnn_opts.test_path), preds_path=makePathAbsolute(predictgnn_opts.preds_path), ) - createLogger("predictgnn.log") predictdmpnn(fixed_opts) elif prog_args.method == "train": diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 6909b156..083ba325 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,7 +1,6 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt @@ -11,9 +10,9 @@ import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split -from tensorflow.keras import initializers, losses, optimizers +from tensorflow.keras import losses, optimizers from tensorflow.keras.layers import Dense, Input -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.models import Model from dfpl import callbacks from dfpl import history as ht @@ -21,235 +20,167 @@ from dfpl.utils import ae_scaffold_split, weight_split -def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def create_dense_layer(inputs, units, activation): + """Create a Dense layer with optional SELU initialization.""" + return Dense( + units=units, + activation=activation, + kernel_initializer="lecun_normal" if activation == "selu" else 'glorot_uniform' + )(inputs) + + +def define_ac_model(opts: options.Options) -> Tuple[Model, Model]: """ - This function provides an autoencoder model to reduce a certain input to a compressed version. + Define and compile an autoencoder model with the specified encoding dimension, with a mirrored decoder. - :param opts: Training options that provide values for adjusting the neural net - :param output_bias: Bias used to initialize the last layer. It gives the net a head start in training on - imbalanced data (which the fingerprints are, because they have many more 0's than 1's in them). - :return: a tuple of autoencoder and encoder models + :param opts: Training options containing model parameters. + :param output_bias: Bias for the output layer, used for initializing the last layer. + :return: Tuple containing the autoencoder and encoder models. """ input_size = opts.fpSize - encoding_dim = opts.encFPSize + encoding_dim = opts.encFPSize # Desired encoding dimension lr_schedule = optimizers.schedules.ExponentialDecay( opts.aeLearningRate, - decay_steps=1000, + decay_steps=math.ceil(7000 / opts.batchSize) * 3, decay_rate=opts.aeLearningRateDecay, staircase=True, ) ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) - if output_bias is not None: - output_bias = initializers.Constant(output_bias) + input_vec = Input(shape=(input_size,)) + initial_layer_size = int(input_size / 2) + encoded = create_dense_layer(input_vec, initial_layer_size, opts.aeActivationFunction) - # get the number of meaningful hidden layers (latent space included) - hidden_layer_count = round(math.log2(input_size / encoding_dim)) + # Start `layer_sizes` with the initial layer size (1024) + layer_sizes = [initial_layer_size] - # the input placeholder - input_vec = Input(shape=(input_size,)) + # Build intermediate encoding layers and store their sizes + hidden_layer_count = round(math.log2(input_size / encoding_dim)) + for i in range(1, hidden_layer_count): + factor_units = 2 ** (i + 1) + layer_size = int(input_size / factor_units) + layer_sizes.append(layer_size) + encoded = create_dense_layer(encoded, layer_size, opts.aeActivationFunction) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / 2), activation=opts.aeActivationFunction - )(input_vec) - else: - encoded = Dense( - units=int(input_size / 2), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(input_vec) - - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # 1st decoding layer - factor_units = 2 ** (hidden_layer_count - 1) - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) + # Build decoder layers in exact reverse order, excluding the first layer size + decoded = encoded + for layer_size in reversed(layer_sizes[:-1]): + decoded = Dense(units=layer_size, activation=opts.aeActivationFunction)(decoded) - else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + # Final output layer to reconstruct input + decoded = Dense(units=input_size, activation="sigmoid")(decoded) + # Define autoencoder and encoder models autoencoder = Model(input_vec, decoded) encoder = Model(input_vec, encoded) + + # Compile autoencoder + autoencoder.compile(optimizer=ac_optimizer, loss=losses.BinaryCrossentropy()) autoencoder.summary(print_fn=logging.info) - autoencoder.compile( - optimizer=ac_optimizer, - loss=losses.BinaryCrossentropy(), - # metrics=[ - # metrics.AUC(), - # metrics.Precision(), - # metrics.Recall() - # ] - ) return autoencoder, encoder -def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: + +def setup_train_test_split(df: pd.DataFrame, opts: options.Options) -> Tuple[ + np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ - Trains an autoencoder on the given feature matrix X. The response matrix is only used to - split the data into meaningful test and train sets. + Sets up the training and test split based on the provided options. :param opts: Command line arguments as defined in options.py :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder - :return: The encoder model of the trained autoencoder + :return: Tuple containing training data, test data, training indices, test indices, and initial bias. """ - # If wandb tracking is enabled for autoencoder weights but not for the main program, initialize a new wandb run - if opts.aeWabTracking and not opts.wabTracking: - wandb.init(project=f"AE_{opts.aeSplitType}") - - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") - # Collect the callbacks for training - # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) - logging.info( - f"Training AC on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" - ) - # When training the final AE, we don't want any test data. We want to train it on all available fingerprints. + logging.info(f"Setting up train/test split on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}") + + # Validate test size assert 0.0 <= opts.testSize <= 0.5 + initial_indices = np.arange(fp_matrix.shape[0]) + if opts.aeSplitType == "random": - logging.info("Training autoencoder using random split") - initial_indices = np.arange(fp_matrix.shape[0]) + logging.info("Using random split for training.") if opts.testSize > 0.0: - # Split data into test and training data - if opts.aeWabTracking: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "scaffold_balanced": - logging.info("Training autoencoder using scaffold split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - train_data, val_data, test_data = ae_scaffold_split( - df, - sizes=(1 - opts.testSize, 0.0, opts.testSize), - balanced=True, - seed=42, - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) - train_indices = df[ - df.index.isin(train_data[train_data["fp"].notnull()].index) - ].index.to_numpy() - test_indices = df[ - df.index.isin(test_data[test_data["fp"].notnull()].index) - ].index.to_numpy() else: x_train = fp_matrix x_test = None + train_indices = initial_indices + test_indices = None + + elif opts.aeSplitType == "scaffold_balanced": + logging.info("Using scaffold split for training.") + train_data, val_data, test_data = ae_scaffold_split( + df, sizes=(1 - opts.testSize, 0.0, opts.testSize), balanced=True, seed=42 + ) + x_train = np.array( + train_data[train_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + x_test = np.array( + test_data[test_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + train_indices = df[df.index.isin(train_data[train_data["fp"].notnull()].index)].index.to_numpy() + test_indices = df[df.index.isin(test_data[test_data["fp"].notnull()].index)].index.to_numpy() + elif opts.aeSplitType == "molecular_weight": - logging.info("Training autoencoder using molecular weight split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - train_data, val_data, test_data = weight_split( - df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - df_sorted = df.sort_values(by="mol_weight", ascending=True) - # Get the sorted indices from the sorted DataFrame - sorted_indices = df_sorted.index.to_numpy() - - # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame - train_indices = sorted_indices[df.index.isin(train_data.index)] - # val_indices = sorted_indices[df.index.isin(val_data.index)] - test_indices = sorted_indices[df.index.isin(test_data.index)] - else: - x_train = fp_matrix - x_test = None - else: - raise ValueError(f"Invalid split type: {opts.split_type}") - - # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints - ids, counts = np.unique(x_train.flatten(), return_counts=True) - count_dict = dict(zip(ids, counts)) - if count_dict[0] == 0: - initial_bias = None - logging.info("No zeroes in training labels. Setting initial_bias to None.") + logging.info("Using molecular weight split for training.") + train_data, val_data, test_data = weight_split( + df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" + ) + x_train = np.array( + train_data[train_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + x_test = np.array( + test_data[test_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + sorted_indices = df_sorted.index.to_numpy() + train_indices = sorted_indices[df.index.isin(train_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] + else: - initial_bias = np.log([count_dict[1] / count_dict[0]]) - logging.info(f"Initial bias for last sigmoid layer: {initial_bias[0]}") + raise ValueError(f"Invalid split type: {opts.aeSplitType}") - # Check if we're doing training/testing mode or full training mode + return x_train, x_test, train_indices, test_indices + + +def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: + """ + Trains an autoencoder on the given feature matrix X. The response matrix is only used to + split the data into meaningful test and train sets. + + :param opts: Command line arguments as defined in options.py + :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder + :return: The encoder model of the trained autoencoder + """ + + # If wandb tracking is enabled for autoencoder weights but not for the main program, initialize a new wandb run + if opts.aeWabTracking: + wandb.init(project=f"AE_{opts.aeSplitType}") + + save_path = os.path.join(opts.ecModelDir, f"autoencoder_weights.h5") + + # Set up train/test split + x_train, x_test, train_indices, test_indices = setup_train_test_split(df, opts) + + # Log training mode if opts.testSize > 0.0: logging.info(f"AE training/testing mode with train- and test-samples") logging.info(f"AC train data shape {x_train.shape} with type {x_train.dtype}") @@ -258,9 +189,10 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: logging.info(f"AE full train mode without test-samples") logging.info(f"AC train data shape {x_train.shape} with type {x_train.dtype}") - # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) - (autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) + # Set up the model of the AC + (autoencoder, encoder) = define_ac_model(opts) callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) + # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, @@ -277,21 +209,12 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: base_file_name=save_path, hist=auto_hist, ) + # load the model with the best weights + autoencoder.load_weights(save_path) + # Save the encoder weights + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) + - # Save the autoencoder callback model to disk - if opts.testSize > 0.0: - # Re-define autoencoder and encoder using your function - callback_autoencoder = load_model(filepath=save_path) - _, callback_encoder = define_ac_model(opts) - for i, layer in enumerate(callback_encoder.layers): - layer.set_weights(callback_autoencoder.layers[i].get_weights()) - - # Save the encoder model - encoder_save_path = os.path.join(save_path, "encoder_model") - callback_encoder.save(filepath=encoder_save_path) - else: - encoder.save(filepath=save_path) - # Return the encoder model of the trained autoencoder return encoder, train_indices, test_indices @@ -322,14 +245,13 @@ def compress_fingerprints(dataframe: pd.DataFrame, encoder: Model) -> pd.DataFra def visualize_fingerprints( df: pd.DataFrame, - before_col: str, - after_col: str, + comressed_col: str, train_indices: np.ndarray, test_indices: np.ndarray, save_as: str, ): # Calculate the number of samples to be taken from each set - num_samples = 1000 + num_samples = 10 train_samples = int(num_samples * len(train_indices) / len(df)) test_samples = num_samples - train_samples @@ -345,7 +267,7 @@ def visualize_fingerprints( df_sampled = pd.concat([train_data_sampled, test_data_sampled]) # Convert the boolean values in the after_col column to floats - df_sampled[after_col] = df_sampled[after_col].apply( + df_sampled[comressed_col] = df_sampled[comressed_col].apply( lambda x: np.array(x, dtype=float) ) @@ -356,7 +278,7 @@ def visualize_fingerprints( n_neighbors=15, min_dist=0.1, metric="euclidean", random_state=42 ) # Filter out the rows with invalid arrays - umap_results = umap_model.fit_transform(df_sampled[after_col].tolist()) + umap_results = umap_model.fit_transform(df_sampled[comressed_col].tolist()) # Add UMAP results to the DataFrame df_sampled["umap_x"] = umap_results[:, 0] df_sampled["umap_y"] = umap_results[:, 1] @@ -364,8 +286,6 @@ def visualize_fingerprints( # Define custom color palette palette = {"train": "blue", "test": "red"} - # Create the scatter plot - sns.set(style="white") fig, ax = plt.subplots(figsize=(10, 8)) split = save_as.split("_", 1) part_after_underscore = split[1] diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index fc1f817c..4cabc230 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -22,25 +22,15 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: else: target = "loss" # enable this checkpoint to restore the weights of the best performing model - if opts.aeType == "deterministic": - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, - ) - else: - checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_freq="epoch", - save_best_only=True, - save_weights_only=True - ) + checkpoint = ModelCheckpoint( + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_best_only=True, + save_weights_only=True, + period=settings.ac_train_check_period + ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore @@ -53,7 +43,7 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) - if opts.aeWabTracking and not opts.wabTracking: + if opts.aeWabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -74,11 +64,12 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - save_freq="epoch", save_best_only=True, monitor="val_loss", mode="min", save_weights_only=True, + period=settings.nn_train_check_period + ) callbacks.append(checkpoint) @@ -89,10 +80,9 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: mode="min", min_delta=settings.nn_train_min_delta, verbose=1, - restore_best_weights=True, + restore_best_weights=True ) callbacks.append(early_stop) - if opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks diff --git a/dfpl/dfplmodule.py b/dfpl/dfplmodule.py index d0838ebf..67cd3a02 100755 --- a/dfpl/dfplmodule.py +++ b/dfpl/dfplmodule.py @@ -118,7 +118,6 @@ def smi2fp(smile, fptype, size=2048): # lengths. After all paths have been identified, the fingerprint is typically # folded down until a particular density of set bits is obtained. try: - # fp = Chem.RDKFingerprint(mol, fpSize=size) return Chem.RDKFingerprint(mol, fpSize=size) except Exception: print("SMILES not convertable to topological fingerprint:") @@ -134,7 +133,6 @@ def smi2fp(smile, fptype, size=2048): # things looked pretty good. try: - # fp = MACCSkeys.GenMACCSKeys(mol) return MACCSkeys.GenMACCSKeys(mol) except Exception: print("SMILES not convertable to MACSS fingerprint:") @@ -166,7 +164,6 @@ def smi2fp(smile, fptype, size=2048): # GetTopologicalTorsionFingerprintAsBitVect function. try: - # fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) except Exception: print("SMILES not convertable to torsions fingerprint:") diff --git a/dfpl/fingerprint.py b/dfpl/fingerprint.py index 16c0c87b..e2016896 100644 --- a/dfpl/fingerprint.py +++ b/dfpl/fingerprint.py @@ -18,7 +18,7 @@ default_fp_size = 2048 -def addFPColumn(data_frame: pd.DataFrame, fp_size: int) -> pd.DataFrame: +def addFPColumn(data_frame: pd.DataFrame,fp_size: int) -> pd.DataFrame: """ Adds a fingerprint to each row in the dataframe. This function works on parallel chunks of the original dataframe. @@ -35,15 +35,6 @@ def smile2fp(smile: str) -> Any: None otherwise """ - # generate morgan fp (circular, ecfp) - # smile = df['smiles'][1] - # mol = Chem.MolFromSmiles(smile) - # from rdkit.Chem import AllChem - # morgan = AllChem.GetMorganFingerprintAsBitVect(mol, 2) - # npa = np.zeros((0,), dtype=np.bool) - # from rdkit import DataStructs - # DataStructs.ConvertToNumpyArray(morgan, npa) - npa = np.zeros((0,), dtype=np.bool_) try: DataStructs.ConvertToNumpyArray( @@ -56,13 +47,6 @@ def smile2fp(smile: str) -> Any: except Exception: return None - # try: - # return np.array( - # Chem.RDKFingerprint(Chem.MolFromSmiles(smile), fpSize=fp_size), - # dtype=settings.df_fp_numpy_type, copy=settings.numpy_copy_values) - # except: - # # Note: We don't need to log here since rdkit already logs - # return None def inchi2fp(inchi: str) -> Any: """ @@ -139,11 +123,6 @@ def importDstoxTSV(tsvfilename: str) -> pd.DataFrame: "S_dataset.csv": importSmilesCSV, "smiles.csv": importSmilesCSV, "inchi.tsv": importDstoxTSV - # "S_dataset_extended.csv": importSmilesCSV, - # "D_dataset.tsv": importDstoxTSV, - # "train_data.csv": importSmilesCSV, - # "predict_data.csv": importDstoxTSV, - # "B_data_ER.csv": importDstoxTSV } diff --git a/dfpl/history.py b/dfpl/history.py index 1f16c2a9..6dcf5374 100644 --- a/dfpl/history.py +++ b/dfpl/history.py @@ -5,7 +5,7 @@ import matplotlib as mpl import matplotlib.pyplot as plt import pandas as pd -from tensorflow.keras.callbacks import History +from tensorflow.python.keras.callbacks import History mpl.use("Agg") diff --git a/dfpl/options.py b/dfpl/options.py index d1d657aa..08494c3d 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -8,7 +8,7 @@ import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute, parseCmdArgs +from dfpl.utils import parseCmdArgs @dataclass @@ -47,6 +47,7 @@ class Options: aeLearningRateDecay: float = 0.96 aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" + vaeBeta: float = 0.5 fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" @@ -328,6 +329,13 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: help="UMAP the latent space for exploration", default=False ) + autoencoder_args.add_argument( + "--vaeBeta", + metavar="FLOAT", + type=float, + help="Beta parameter for the Beta VAE", + default=1.0 + ) # Training Configuration training_args.add_argument( "--split_type", diff --git a/dfpl/plot.py b/dfpl/plot.py index 30d9503f..75c7fce4 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -2,12 +2,11 @@ import matplotlib.pyplot as plt import numpy as np -import pandas as pd import wandb from matplotlib.axes import Axes - +from sklearn.metrics import precision_recall_curve, auc # for NN model functions -from tensorflow.keras.callbacks import History +from tensorflow.python.keras.callbacks import History def get_max_validation_accuracy(history: History) -> str: @@ -45,8 +44,7 @@ def get_max_training_accuracy(history: History) -> str: y_max: float = max(training) return "Max training accuracy ≈ " + str(round(y_max, 3) * 100) + "%" - -def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: +def smooth_curve(points: np.ndarray, factor: float = 0.8) -> List[float]: smoothed_points: List[float] = [] for point in points: if smoothed_points: @@ -56,173 +54,89 @@ def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: smoothed_points.append(point) return smoothed_points - +# Plot the accuracy and loss data with enhanced visuals def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None: - (train, valid) = (None, None) - - if which_graph == "acc": - train = smooth_curve(history.history["accuracy"]) - valid = smooth_curve(history.history["val_accuracy"]) - - if which_graph == "loss": - train = smooth_curve(history.history["loss"]) - valid = smooth_curve(history.history["val_loss"]) - - # plt.xkcd() # make plots look like xkcd + if which_graph == "balanced_acc": + # Plot balanced accuracy when "acc" is specified + train = smooth_curve(np.array(history.history["balanced_accuracy"])) + valid = smooth_curve(np.array(history.history["val_balanced_accuracy"])) + label = "Balanced Accuracy" + elif which_graph == "loss": + train = smooth_curve(np.array(history.history["loss"])) + valid = smooth_curve(np.array(history.history["val_loss"])) + label = "Loss" + else: + return epochs = range(1, len(train) + 1) - trim = 0 # remove first 5 epochs - # when graphing loss the first few epochs may skew the (loss) graph - - ax.plot(epochs[trim:], train[trim:], "dodgerblue", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], train[trim:], "dodgerblue", label="Training") - - ax.plot(epochs[trim:], valid[trim:], "g", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], valid[trim:], "g", label="Validation") - + # Plot training and validation data with styles + ax.plot(epochs, train, color="dodgerblue", linewidth=2, label=f"Training {label}") + ax.plot(epochs, valid, color="green", linestyle="--", linewidth=2, label=f"Validation {label}") + ax.set_ylabel(label) + ax.legend(loc="best") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) def plot_history(history: History, file: str) -> None: - fig, (ax1, ax2) = plt.subplots( - nrows=2, - ncols=1, - figsize=(10, 6), - sharex="all", - gridspec_kw={"height_ratios": [5, 2]}, - ) - - set_plot_history_data(ax1, history, "acc") + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex="all") + set_plot_history_data(ax1, history, "balanced_acc") set_plot_history_data(ax2, history, "loss") - # Accuracy graph - ax1.set_ylabel("Accuracy") - ax1.set_ylim(bottom=0.5, top=1) - ax1.legend(loc="lower right") - ax1.spines["top"].set_visible(False) - ax1.spines["right"].set_visible(False) - ax1.xaxis.set_ticks_position("none") - ax1.spines["bottom"].set_visible(False) - - # max accuracy text - plt.text( - 0.5, - 0.6, - get_max_validation_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - plt.text( - 0.5, - 0.8, - get_max_training_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - - # Loss graph - ax2.set_ylabel("Loss") - ax2.set_yticks([]) - ax2.plot(legend=False) + # Set shared x-axis label and save the plot ax2.set_xlabel("Epochs") - ax2.spines["top"].set_visible(False) - ax2.spines["right"].set_visible(False) - plt.tight_layout() plt.savefig(fname=file, format="svg") plt.close() - -def plot_train_history(hist, target, file_accuracy, file_loss): - """ - Plot the training performance in terms of accuracy and loss values for each epoch. - :param hist: The history returned by model.fit function - :param target: The name of the target of the model - :param file_accuracy: The filename for plotting accuracy values - :param file_loss: The filename for plotting loss values - :return: none - """ - - # plot accuracy - plt.figure() - plt.plot(hist.history["accuracy"]) - if "val_accuracy" in hist.history.keys(): - plt.plot(hist.history["val_accuracy"]) - plt.title("Model accuracy - " + target) - plt.ylabel("Accuracy") - plt.xlabel("Epoch") - if "val_accuracy" in hist.history.keys(): - plt.legend(["Train", "Test"], loc="upper left") - else: - plt.legend(["Train"], loc="upper_left") - plt.savefig(fname=file_accuracy, format="svg") - - # Plot training & validation loss values - plt.figure() - plt.plot(hist.history["loss"]) - plt.plot(hist.history["val_loss"]) - plt.title("Model loss - " + target) - plt.ylabel("Loss") - plt.xlabel("Epoch") - plt.legend(["Train", "Test"], loc="upper left") - # plt.show() - plt.savefig(fname=file_loss, format="svg") +# Enhanced AUC plot +def plot_auc(fpr: np.ndarray, tpr: np.ndarray, auc_value: float, target: str, filename: str, wandb_logging: bool = False) -> None: + plt.figure(figsize=(8, 6)) + plt.plot([0, 1], [0, 1], "k--", linewidth=1) + plt.plot(fpr, tpr, color="darkorange", linewidth=2, label=f"AUC = {auc_value:.3f}") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title(f"ROC Curve - {target}") + plt.legend(loc="lower right") + plt.grid(True, linestyle="--", alpha=0.5) + plt.savefig(fname=filename, format="png") + if wandb_logging: + wandb.log({"roc_plot": plt}) plt.close() - -def plot_history_vis( - hist: History, - model_hist_plot_path: str, - model_hist_csv_path: str, - model_hist_plot_path_a: str, - model_hist_plot_path_l: str, - target: str, -) -> None: - plot_history(history=hist, file=model_hist_plot_path) - histDF = pd.DataFrame(hist.history) - histDF.to_csv(model_hist_csv_path) - - # plot accuracy and loss for the training and validation during training - plot_train_history( - hist=hist, - target=target, - file_accuracy=model_hist_plot_path_a, - file_loss=model_hist_plot_path_l, - ) - - -def plot_auc( - fpr: np.ndarray, - tpr: np.ndarray, - auc_value: float, - target: str, - filename: str, - wandb_logging: bool = False, +def plot_prc( + y_true: np.ndarray, + y_scores: np.ndarray, + target: str, + filename: str, + wandb_logging: bool = False ) -> None: """ - Plot the area under the curve to the provided file - - :param fpr: An array containing the false positives - :param tpr: An array containing the true positives - :param auc_value: The value of the area under the curve - :param target: The name of the training target - :param filename: The filename to which the plot should be stored - :param wandb_logging: Whether to log the plot to wandb + Plot the Precision-Recall Curve (PRC) with AUC. + + :param y_true: True binary labels + :param y_scores: Target scores, typically predicted probabilities + :param target: The name of the model or target being evaluated + :param filename: The filename to save the plot + :param wandb_logging: Whether to log the plot to Weights & Biases :rtype: None """ - # Create a boolean mask to filter out zero values - plt.figure() - plt.plot([0, 1], [0, 1], "k--") - plt.plot(fpr, tpr, label=f"Keras (area = {auc_value:.3f})") - plt.xlabel("False positive rate") - plt.ylabel("True positive rate") - plt.title("ROC curve " + target) - plt.legend(loc="best") + # Calculate precision-recall curve and AUC + precision, recall, _ = precision_recall_curve(y_true, y_scores) + prc_auc_value = auc(recall, precision) + + # Plot PRC curve + plt.figure(figsize=(8, 6)) + plt.plot(recall, precision, color="purple", linewidth=2, label=f"PRC-AUC = {prc_auc_value:.3f}") + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"Precision-Recall Curve - {target}") + plt.legend(loc="lower left") + plt.grid(True, linestyle="--", alpha=0.5) + + # Save plot plt.savefig(fname=filename, format="png") if wandb_logging: - wandb.log({"roc_plot": plt}) - plt.close() + wandb.log({"prc_plot": plt}) + plt.close() \ No newline at end of file diff --git a/dfpl/predictions.py b/dfpl/predictions.py index 29e73142..ef1000d8 100644 --- a/dfpl/predictions.py +++ b/dfpl/predictions.py @@ -1,39 +1,51 @@ import logging - +import os import numpy as np import pandas as pd -import tensorflow.keras.models from dfpl import options, settings +from dfpl import single_label_model as sl def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: """ Predict a set of chemicals using a selected model. - :param df: - :param opts: - :return: + :param df: Input DataFrame containing the features (either compressed or uncompressed). + :param opts: Model options including paths, feature types, and prediction preferences. + :return: DataFrame with predictions. """ - model = tensorflow.keras.models.load_model(opts.fnnModelDir, compile=False) - model.compile(loss=opts.lossFunction, optimizer=opts.optimizer) - if opts.compressFeatures: - sub_df = df[df["fpcompressed"].notnull()] - x = np.array( - sub_df["fpcompressed"].to_list(), - dtype=settings.nn_fp_compressed_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Compressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df - else: - sub_df = df[df["fp"].notnull()] - x = np.array( - sub_df["fp"].to_list(), - dtype=settings.nn_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Uncompressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df + + # Determine the correct feature column and input size + feature_column = "fpcompressed" if opts.compressFeatures else "fp" + sub_df = df[df[feature_column].notnull()] + + if sub_df.empty: + logging.warning(f"No valid features found in column '{feature_column}'") + return pd.DataFrame() + + # Prepare the feature matrix for prediction + x = np.array( + sub_df[feature_column].to_list(), + dtype=settings.nn_fp_compressed_numpy_type if opts.compressFeatures else settings.nn_fp_numpy_type, + copy=settings.numpy_copy_values + ) + logging.info( + f"{'Compressed' if opts.compressFeatures else 'Uncompressed'} FP matrix with shape {x.shape} and type {x.dtype}") + + # Define the model architecture based on the feature size + feature_input_size = x.shape[1] + model = sl.define_single_label_model(input_size=feature_input_size, opts=opts) + + # Load the model weights + weights_path = os.path.join(opts.fnnModelDir, 'model_weights.hdf5') + model.load_weights(weights_path) + logging.info(f"Model weights loaded from {weights_path}") + + # Make predictions + predictions = model.predict(x) + + # Add predictions to the DataFrame + sub_df["predicted"] = predictions + + return sub_df diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 191690ba..0f64ace2 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd import tensorflow as tf -import tensorflow.keras.backend as K +import tensorflow.python.keras.backend as K import wandb from sklearn.metrics import ( auc, @@ -187,7 +187,7 @@ def build_fnn_network( output_bias = tf.keras.initializers.Constant(output_bias) # Define the number of hidden layers based on the input size - my_hidden_layers = {"2048": 6, "1024": 5, "999": 5, "512": 4, "256": 3} + my_hidden_layers = {"2048": 6, "1024": 5, "999": 5, "512": 4, "256": 3, "167": 3} if not str(input_size) in my_hidden_layers.keys(): raise ValueError("Wrong input-size. Must be in {2048, 1024, 999, 512, 256}.") nhl = int(math.log2(input_size) / 2 - 1) @@ -403,7 +403,7 @@ def evaluate_model( "target": target, "fold": fold, } - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.csv") + ).to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.csv") ) # Compute the confusion matrix @@ -414,9 +414,16 @@ def evaluate_model( y_test_int, y_predict_int, output_dict=True ) prf = pd.DataFrame.from_dict(precision_recall)[["0", "1"]] - + # plot the precision-recall curve + pl.plot_prc( + y_true=y_test, + y_scores=y_predict, + target=target, + filename=f"{file_prefix}/prc.png", + wandb_logging=False + ) # Add balanced accuracy to the computed metrics - prf.to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.prec_rec_f1.csv") + prf.to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.prec_rec_f1.csv") # Evaluate the model on the validation set and log the results loss, acc, auc_value, precision, recall, balanced_acc = tuple( @@ -455,17 +462,16 @@ def evaluate_model( ) ), columns=["fpr", "tpr", "auc_value", "target", "fold"], - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.aucdata.csv") + ).to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.aucdata.csv") # Generate and save AUC-ROC curve plot pl.plot_auc( fpr=FPR, tpr=TPR, target=target, auc_value=AUC, - filename=f"{file_prefix}_auc_data.png", + filename=f"{file_prefix}/auc_data.png", wandb_logging=False, ) - # Return a DataFrame containing the computed metrics return pd.DataFrame.from_dict( { @@ -490,19 +496,27 @@ def evaluate_model( def fit_and_evaluate_model( x_train: np.ndarray, x_test: np.ndarray, + x_val: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, + y_val: np.ndarray, fold: int, target: str, opts: options.Options, ) -> pd.DataFrame: # Print info about training logging.info(f"Training of fold number: {fold}") + target_suffix = f"{target}_compressed" if opts.compressFeatures else target # Define file name prefix for saving models - model_file_prefix = path.join( - opts.outputDir, f"{target}_{opts.split_type}_single-labeled_Fold-{fold}" - ) + if fold > 1: + model_file_prefix = path.join( + "tmp", f"{target_suffix}/fold-{fold}" + ) + else: + model_file_prefix = path.join( + opts.outputDir, target_suffix + ) # Compute class imbalance ids, counts = np.unique(y_train, return_counts=True) @@ -522,7 +536,7 @@ def fit_and_evaluate_model( ) # Define checkpoint to save model weights during training - checkpoint_model_weights_path = f"{model_file_prefix}.model.weights.hdf5" + checkpoint_model_weights_path = f"{model_file_prefix}/model_weights.hdf5" callback_list = cb.nn_callback( checkpoint_path=checkpoint_model_weights_path, opts=opts ) @@ -536,7 +550,7 @@ def fit_and_evaluate_model( epochs=opts.epochs, batch_size=opts.batchSize, verbose=opts.verbose, - validation_data=(x_test, y_test), + validation_data=(x_val, y_val), ) trainTime = str(round((time() - start) / 60, ndigits=2)) logging.info( @@ -544,8 +558,8 @@ def fit_and_evaluate_model( ) # Save and plot model history - pd.DataFrame(hist.history).to_csv(path_or_buf=f"{model_file_prefix}.history.csv") - pl.plot_history(history=hist, file=f"{model_file_prefix}.history.svg") + pd.DataFrame(hist.history).to_csv(path_or_buf=f"{model_file_prefix}/history.csv") + pl.plot_history(history=hist, file=f"{model_file_prefix}/history.svg") # Evaluate model callback_model = define_single_label_model(input_size=x_train.shape[1], opts=opts) callback_model.load_weights(filepath=checkpoint_model_weights_path) @@ -565,6 +579,7 @@ def get_x_y( df: pd.DataFrame, target: str, train_set: pd.DataFrame, + val_set: pd.DataFrame, test_set: pd.DataFrame, opts: options.Options, ): @@ -582,11 +597,15 @@ def get_x_y( y_train = df.iloc[train_indices][target].values x_test = new_df.iloc[test_indices, :].values y_test = df.iloc[test_indices][target].values + x_val = new_df.iloc[val_set.index, :].values + y_val = df.iloc[val_set.index][target].values + x_val = x_val.astype("float32") + y_val = y_val.astype("float32") x_train = x_train.astype("float32") y_train = y_train.astype("float32") x_test = x_test.astype("float32") y_test = y_test.astype("float32") - return x_train, y_train, x_test, y_test + return x_train, y_train, x_test, y_test, x_val, y_val def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: @@ -599,7 +618,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: :param opts: The command line arguments in the options class :param df: The dataframe containing x matrix and at least one column for a y target. """ - # find target columns targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": @@ -645,6 +663,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize, random_state=1 ) + x_val , x_test, y_val, y_test = train_test_split( + x_test,y_test, stratify=y_test, test_size=0.5, random_state=1) logging.info( f"Splitting train/test data with fixed random initializer" ) @@ -652,32 +672,21 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize ) + x_val , x_test, y_val, y_test = train_test_split( + x_test,y_test, stratify=y_test, test_size=0.5) performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, y_train=y_train, y_test=y_test, + y_val=y_val, fold=0, target=target, opts=opts, ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_saved_model") - ) elif 1 < opts.kFolds < 10: # int(x.shape[0] / 100): # do a kfold cross-validation @@ -686,10 +695,18 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) fold_no = 1 # split the data - for train, test in kfold_c_validator.split(x, y): + for train_idx, test_idx in kfold_c_validator.split(x, y): + # Split test into validation and actual test set + x_train, x_test = x[train_idx], x[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + + # Further split test set into val and test set + x_val, x_test, y_val, y_test = train_test_split( + x_test, y_test, test_size=0.5, stratify=y_test, + ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( - project=f"FNN_{opts.threshold}_{opts.split_type}", + project=f"FNN_{opts.split_type}", group=f"{target}", name=f"{target}-{fold_no}", reinit=True, @@ -704,41 +721,25 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # Initialize wandb for the feed forward model wandb.init( - project=f"FNN_{opts.threshold}_{opts.split_type}", + project=f"FNN_{opts.split_type}", group=f"{target}", name=f"FNN_{target}-{fold_no}", reinit=True, ) performance = fit_and_evaluate_model( - x_train=x[train], - x_test=x[test], - y_train=y[train], - y_test=y[test], + x_train=x_train, + x_test=x_test, + x_val=x_val, + y_train=y_train, + y_test=y_test, + y_val=y_val, fold=fold_no, target=target, opts=opts, ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - # create output directory and store complete model - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}-{fold_no}_saved_model" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -747,31 +748,22 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Best_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + if opts.compressFeatures: + opts.outputDir = opts.ecModelDir + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_saved_model" - ) - dst_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_best_saved_model" - ) + # Ensure the destination directory exists + os.makedirs(src, exist_ok=True) + os.makedirs(dst, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) + + # Optionally, clean up the temporary directory + shutil.rmtree("tmp/") - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) - # save complete model else: logging.info( "Your selected number of folds for Cross validation is out of range. " @@ -788,7 +780,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # For each individual target train a model elif opts.split_type == "scaffold_balanced": # df, irrelevant_columns = preprocess_dataframe(df, opts) - for idx, target in enumerate(targets): + for target in targets: df = prepare_nn_training_data(df, target, opts, return_dataframe=True) relevant_cols = ["smiles"] + ["fp"] + [target] # list(irrelevant_columns) if opts.compressFeatures: @@ -800,12 +792,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: if opts.kFolds == 1: train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize, 0.0, opts.testSize), + sizes=(1 - opts.testSize, opts.testSize/2, opts.testSize/2), balanced=False, seed=42, ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set,val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -820,41 +812,30 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: group=f"{target}", ) - performance = fit_and_evaluate_model( + performance= fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, y_train=y_train, y_test=y_test, + y_val=y_val, fold=0, target=target, opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_0" - ) - ) + elif opts.kFolds > 1: for fold_no in range(1, opts.kFolds + 1): print(f"Splitting data with seed {fold_no}") train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize, 0.0, opts.testSize), + sizes=(1 - opts.testSize,opts.testSize/2, opts.testSize/2), balanced=True, seed=fold_no, ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test,x_val, y_val = get_x_y( + df_task, target, train_set,val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -873,6 +854,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, + y_val=y_val, y_train=y_train, y_test=y_test, fold=fold_no, @@ -881,20 +864,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{fold_no}" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -902,30 +871,21 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_BEST_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + if opts.compressFeatures: + opts.outputDir = opts.ecModelDir + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{best_fold}" - ) - dst_dir = os.path.join( - opts.outputDir, - f"{target}_scaffold_saved_model_BEST_FOLD_{best_fold}", - ) + # Ensure the destination directory exists + os.makedirs(dst, exist_ok=True) + os.makedirs(src, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) + + # Optionally, clean up the temporary directory + shutil.rmtree("tmp/") - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) else: logging.info( @@ -942,7 +902,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) elif opts.split_type == "molecular_weight": logging.info("You can use molecular_weight split once.") - for idx, target in enumerate(targets): + for target in targets: df = prepare_nn_training_data(df, target, opts, return_dataframe=True) relevant_cols = ["smiles"] + ["fp"] + [target] if opts.compressFeatures: @@ -952,10 +912,10 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: df_task.reset_index(drop=True, inplace=True) if opts.kFolds == 1: train_set, val_set, test_set = weight_split( - df_task, bias="small", sizes=(1 - opts.testSize, 0.0, opts.testSize) + df_task, bias="small", sizes=(1 - opts.testSize, opts.testSize/2, opts.testSize/2) ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set,val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -972,6 +932,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val = x_val, + y_val = y_val, y_train=y_train, y_test=y_test, fold=0, @@ -979,18 +941,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_weight_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_weight_saved_model_0") - ) + elif opts.kFolds > 1: raise Exception( f"Unsupported number of folds: {opts.kFolds} for {opts.split_type} split.\ diff --git a/dfpl/utils.py b/dfpl/utils.py index 15fd018b..29540364 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -197,7 +197,7 @@ def inchi_to_mol(inchi: str) -> Chem.Mol: def weight_split( - data: pd.DataFrame, bias: str, sizes: Tuple[float, float, float] = (0.8, 0, 0.2) + data: pd.DataFrame, bias: str, sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1) ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: if not (len(sizes) == 3 and np.isclose(sum(sizes), 1)): raise ValueError(f"Invalid train/val/test splits! got: {sizes}") @@ -245,7 +245,7 @@ def weight_split( def ae_scaffold_split( data: pd.DataFrame, - sizes: Tuple[float, float, float] = (0.8, 0, 0.2), + sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1), balanced: bool = False, key_molecule_index: int = 0, seed: int = 0, diff --git a/dfpl/vae.py b/dfpl/vae.py index cc61b17d..d8381efb 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -2,8 +2,7 @@ import logging import math import os.path -from os.path import basename -from typing import Tuple +from typing import Tuple, Union import numpy as np import pandas as pd @@ -13,20 +12,27 @@ from sklearn.model_selection import train_test_split from tensorflow.keras import initializers, optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model,load_model +from tensorflow.keras.models import Model, load_model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks from dfpl import history as ht from dfpl import options, settings from dfpl.utils import ae_scaffold_split, weight_split - +from dfpl.autoencoder import setup_train_test_split, create_dense_layer disable_eager_execution() -def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def define_vae_model(opts: options.Options) -> Tuple[Model, Model]: + """ + Define and compile a Variational Autoencoder (VAE) model based on the given options. + + :param opts: Training options with model parameters. + :param output_bias: Bias for the output layer, used for initializing the last layer. + :return: Tuple containing the VAE and encoder models. + """ input_size = opts.fpSize - encoding_dim = opts.encFPSize # This should be the intended size of your latent space, e.g., 256 + encoding_dim = opts.encFPSize lr_schedule = optimizers.schedules.ExponentialDecay( opts.aeLearningRate, @@ -34,236 +40,96 @@ def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Mo decay_rate=opts.aeLearningRateDecay, staircase=True, ) - ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) - - if output_bias is not None: - output_bias = initializers.Constant(output_bias) - - hidden_layer_count = round(math.log2(input_size / encoding_dim)) - + vae_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) input_vec = Input(shape=(input_size,)) + initial_layer_size = int(input_size / 2) + encoded = create_dense_layer(input_vec, initial_layer_size, opts.aeActivationFunction) - # 1st hidden layer - if opts.aeActivationFunction != "selu": - encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction)(input_vec) - else: - encoded = Dense(units=int(input_size / 2), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(input_vec) + # Start `layer_sizes` with the initial layer size (1024) + layer_sizes = [initial_layer_size] - # encoding layers - for i in range(1, hidden_layer_count - 1): # Adjust the range to stop before the latent space layers - factor_units = 2 ** (i + 1) - if opts.aeActivationFunction != "selu": - encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(encoded) - else: - encoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) + # Building encoding layers and storing their sizes + hidden_layer_count = round(math.log2(input_size / encoding_dim)) + for i in range(1, hidden_layer_count - 1): + layer_size = int(input_size / (2 ** (i + 1))) + layer_sizes.append(layer_size) + encoded = create_dense_layer(encoded,layer_size, opts.aeActivationFunction) - # latent space layers - if opts.aeActivationFunction != "selu": - z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim - z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction)(encoded) # Adjusted size to encoding_dim - else: - z_mean = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim - z_log_var = Dense(units=encoding_dim, activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(encoded) # Adjusted size to encoding_dim + # Latent space layers + z_mean = create_dense_layer (encoded, encoding_dim, opts.aeActivationFunction) + z_log_var = create_dense_layer(encoded, encoding_dim, opts.aeActivationFunction) + # Sampling layer - # sampling layer def sampling(args): z_mean, z_log_var = args batch = K.shape(z_mean)[0] dim = K.int_shape(z_mean)[1] epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon + return z_mean + K.exp(opts.vaeBeta * z_log_var) * epsilon z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) - decoded = z - - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2 ** i - if opts.aeActivationFunction != "selu": - decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction)(decoded) - else: - decoded = Dense(units=int(input_size / factor_units), activation=opts.aeActivationFunction, kernel_initializer="lecun_normal")(decoded) - # output layer - decoded = Dense(units=input_size, activation="sigmoid", bias_initializer=output_bias)(decoded) - - autoencoder = Model(input_vec, decoded) - encoder = Model(input_vec, z) - autoencoder.summary(print_fn=logging.info) + # Build decoder layers starting directly from `z` + decoded = z + for layer_size in reversed(layer_sizes): + decoded = create_dense_layer(decoded, layer_size, opts.aeActivationFunction) + # Final output layer to reconstruct input + decoded = Dense(units=input_size, activation="sigmoid")(decoded) - # KL divergence loss + # Define VAE and encoder models + vae = Model(input_vec, decoded) + encoder = Model(input_vec, z_mean) + # Define custom loss def kl_loss(z_mean, z_log_var): - return -0.5 * K.sum( - 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 - ) + return -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) - # binary cross-entropy loss def bce_loss(y_true, y_pred): return metrics.binary_crossentropy(y_true, y_pred) - # combined loss def vae_loss(y_true, y_pred): - bce = bce_loss(y_true, y_pred) - kl = kl_loss(z_mean, z_log_var) - return bce + 0.5 * kl + return bce_loss(y_true, y_pred) + 0.5 * kl_loss(z_mean, z_log_var) - autoencoder.compile( - optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] - ) + vae.compile(optimizer=vae_optimizer, loss=vae_loss, metrics=[kl_loss, bce_loss]) + vae.summary(print_fn=logging.info) - return autoencoder, encoder + return vae, encoder -def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: - """ - Trains an autoencoder on the given feature matrix X. The response matrix is only used to - split the data into meaningful test and train sets. - :param opts: Command line arguments as defined in options.py - :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder - :return: The encoder model of the trained autoencoder +def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Tuple[Model, np.ndarray, np.ndarray]: """ + Trains a VAE on the provided data and returns the trained encoder and split indices. - # If wandb tracking is enabled for VAE weights but not for the main program, initialize a new wandb run - if opts.aeWabTracking and not opts.wabTracking: + :param df: DataFrame containing SMILES/InChI data for training. + :param opts: Training options. + :return: The encoder model of the trained VAE and split indices. + """ + # Initialize wandb tracking if needed + if opts.aeWabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_split_autoencoder") - # Collect the callbacks for training + # Define paths for saving weights + save_path = os.path.join(opts.ecModelDir, f"vae_weights.h5") - # Select all fingerprints that are valid and turn them into a numpy array - fp_matrix = np.array( - df[df["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info( - f"Training VAE on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" - ) - assert 0.0 <= opts.testSize <= 0.5 - if opts.aeSplitType == "random": - logging.info("Training autoencoder using random split") - initial_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - # Split data into test and training data - if opts.aeWabTracking: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "scaffold_balanced": - logging.info("Training autoencoder using scaffold split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - # if opts.aeWabTracking: - train_data, val_data, test_data = ae_scaffold_split( - df, - sizes=(1 - opts.testSize, 0.0, opts.testSize), - balanced=True, - seed=42, - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - train_indices = df[ - df.index.isin(train_data[train_data["fp"].notnull()].index) - ].index.to_numpy() - test_indices = df[ - df.index.isin(test_data[test_data["fp"].notnull()].index) - ].index.to_numpy() - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "molecular_weight": - logging.info("Training autoencoder using molecular weight split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - train_data, val_data, test_data = weight_split( - df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - df_sorted = df.sort_values(by="mol_weight", ascending=True) - # Get the sorted indices from the sorted DataFrame - sorted_indices = df_sorted.index.to_numpy() - - # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame - train_indices = sorted_indices[df.index.isin(train_data.index)] - # val_indices = sorted_indices[df.index.isin(val_data.index)] - test_indices = sorted_indices[df.index.isin(test_data.index)] - else: - x_train = fp_matrix - x_test = None - else: - raise ValueError(f"Invalid split type: {opts.split_type}") - - # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints - ids, counts = np.unique(x_train.flatten(), return_counts=True) - count_dict = dict(zip(ids, counts)) - if count_dict[0] == 0: - initial_bias = None - logging.info("No zeroes in training labels. Setting initial_bias to None.") - else: - initial_bias = np.log([count_dict[1] / count_dict[0]]) - logging.info(f"Initial bias for last sigmoid layer: {initial_bias[0]}") - if opts.testSize > 0.0: - logging.info(f"VAE training/testing mode with train- and test-samples") - logging.info(f"VAE train data shape {x_train.shape} with type {x_train.dtype}") - logging.info(f"VAE test data shape {x_test.shape} with type {x_test.dtype}") - else: - logging.info(f"VAE full train mode without test-samples") - logging.info(f"VAE train data shape {x_train.shape} with type {x_train.dtype}") + x_train, x_test, train_indices, test_indices = setup_train_test_split(df, opts) - (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) - # Train the VAE on the training data - callback_list = callbacks.autoencoder_callback(checkpoint_path=f"{save_path}.h5", opts=opts) + # Define VAE and encoder models + vae, encoder = define_vae_model(opts) + # Set up callbacks and train the VAE model + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) vae_hist = vae.fit( - x_train, - x_train, - epochs=opts.aeEpochs, - batch_size=opts.aeBatchSize, - verbose=opts.verbose, - callbacks=[callback_list], - validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, - ) - - # Save the VAE weights - ht.store_and_plot_history( - base_file_name=save_path, - hist=vae_hist, + x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, + verbose=opts.verbose, callbacks=[callback_list], + validation_data=(x_test, x_test) if x_test is not None else None, ) - # Re-define autoencoder and encoder using your function - callback_autoencoder, callback_encoder = define_vae_model(opts) - callback_autoencoder.load_weights(filepath=f"{save_path}.h5") - for i, layer in enumerate(callback_encoder.layers): - layer.set_weights(callback_autoencoder.layers[i].get_weights()) + # Store training history + ht.store_and_plot_history(base_file_name=save_path, hist=vae_hist) - # Save the encoder model - encoder_save_path = f"{save_path}_encoder.h5" - callback_encoder.save_weights(filepath=encoder_save_path) + # load the whole vae from the checkpoint + vae.load_weights(save_path) + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) return encoder, train_indices, test_indices diff --git a/example/predict.json b/example/predict.json index e3305c7c..b1f16e4b 100755 --- a/example/predict.json +++ b/example/predict.json @@ -3,10 +3,9 @@ "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_split_autoencoder", - "ecWeightsFile": "random_split_autoencoder_encoder.h5", - "fnnModelDir": "example/results_train/NR-AR-1_best_saved_model", + "fnnModelDir": "example/results_train/NR-AR", + "ecModelDir": "example/results_train/vae", + "ecWeightsFile": "encoder_weights.h5", "aeType": "variational", - "compressFeatures": true, - "trainFNN": false + "compressFeatures": false } diff --git a/example/train.json b/example/train.json index 53575adc..79f06d1a 100755 --- a/example/train.json +++ b/example/train.json @@ -1,8 +1,8 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/tox21.csv", + "inputFile": "tests/data/clintox.csv", "outputDir": "example/results_train/", - "ecModelDir": "example/results_train/random_split_autoencoder/", + "ecModelDir": "example/results_train/vae", "ecWeightsFile": "", "verbose": 2, @@ -12,16 +12,15 @@ "encFPSize": 256, "aeSplitType": "random", - "aeEpochs": 4, + "aeEpochs": 11, "aeBatchSize": 351, "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, "aeLearningRateDecay": 0.96, "aeType": "deterministic", + "vaeBeta": 0.5, - "type": "smiles", - "fpType": "topological", "fpSize": 2048, "split_type": "random", "enableMultiLabel": false, @@ -30,13 +29,13 @@ "gpu": "", "trainFNN": true, - "kFolds": 2, + "kFolds": 1, "threshold": 0.5, "testSize": 0.2, "fnnType": "FNN", "optimizer": "Adam", - "lossFunction": "bce", - "epochs": 11, + "lossFunction": "focal", + "epochs": 50, "batchSize": 128, "activationFunction": "selu", "dropout": 0.0107, From d2848f1a31da913c84fec638c267b743636c9582 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Fri, 1 Nov 2024 14:16:40 +0100 Subject: [PATCH 03/10] updated tests --- dfpl/options.py | 9 +-------- dfpl/settings.py | 2 +- example/predictgnn.json | 2 +- example/traingnn.json | 2 +- tests/data/smiles.csv | 7 ------- tests/run_predictgnn.py | 20 +++----------------- tests/run_traingnn.py | 1 - 7 files changed, 7 insertions(+), 36 deletions(-) delete mode 100644 tests/data/smiles.csv diff --git a/dfpl/options.py b/dfpl/options.py index 08494c3d..76ee3432 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -83,7 +83,7 @@ class GnnOptions(TrainArgs): Dataclass to hold all options used for training the graph models """ - total_epochs: int = 30 + epochs: int = 30 save: bool = True configFile: str = "./example/traingnn.json" data_path: str = "./example/data/tox21.csv" @@ -1000,13 +1000,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" ) - training_args.add_argument( - "--total_epochs", - type=int, - metavar="INT", - default=30, - help="Number of total epochs to run", - ) training_args.add_argument( "--batch_size", type=int, metavar="INT", default=50, help="Batch size" ) diff --git a/dfpl/settings.py b/dfpl/settings.py index 64eac190..20435290 100644 --- a/dfpl/settings.py +++ b/dfpl/settings.py @@ -43,7 +43,7 @@ # Training settings of the AC that were magic numbers in the code before. ac_train_min_delta = 0.0001 -ac_train_check_period = 5 +ac_train_check_period = 2 ac_train_patience = 5 # Training settings of the FNN that were magic numbers in the code before. diff --git a/example/predictgnn.json b/example/predictgnn.json index 813cf0c5..5bfbcec6 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", - "test_path": "tests/data/smiles.csv", + "test_path": "tests/data/tox21.csv", "preds_path": "example/results_gnn.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/traingnn.json b/example/traingnn.json index fa2b714f..5dc6a0a5 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -2,7 +2,7 @@ "py/object": "dfpl.options.GnnOptions", "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", - "epochs": 4, + "epochs": 2, "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", diff --git a/tests/data/smiles.csv b/tests/data/smiles.csv deleted file mode 100644 index 9383afdd..00000000 --- a/tests/data/smiles.csv +++ /dev/null @@ -1,7 +0,0 @@ -smiles -CN(C)c1ccc(cc1)C(=O)c2ccc(cc2)N(C)C -CC12CCC3C(CCC4=CC(=O)CCC34C)C1CCC2=O -CC12CCC3C(CCc4cc(O)ccc34)C1CCC2O -Oc1c(Br)cc(cc1Br)C#N -Oc1ccc(C=Cc2cc(O)cc(O)c2)cc1 -Oc1ccc(cc1)c2ccccc2 \ No newline at end of file diff --git a/tests/run_predictgnn.py b/tests/run_predictgnn.py index 979c2868..9c433d80 100644 --- a/tests/run_predictgnn.py +++ b/tests/run_predictgnn.py @@ -26,26 +26,12 @@ def test_predictdmpnn(opts: opt.GnnOptions) -> None: ) json_arg_path = utils.makePathAbsolute(f"{example_directory}/predictgnn.json") - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] - arguments, data = utils.createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True + arguments = utils.createArgsFromJson( + json_arg_path, ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - opts = args.PredictArgs().parse_args(arguments) - opts.preds_path = os.path.join(save_dir, name) - df = pd.read_csv(opts.test_path) - smiles = [[row.smiles] for _, row in df.iterrows()] - train.make_predictions(args=opts, smiles=smiles) + train.make_predictions(args=opts) print("predictdmpnn test complete.") diff --git a/tests/run_traingnn.py b/tests/run_traingnn.py index 582d4627..adae40d2 100644 --- a/tests/run_traingnn.py +++ b/tests/run_traingnn.py @@ -10,7 +10,6 @@ test_train_args = opt.GnnOptions( configFile=utils.makePathAbsolute(f"{example_directory}/traingnn.json"), save_dir=utils.makePathAbsolute(f"{project_directory}/output"), - total_epochs=1, ) From 9ccfe4d9ecc4d7644966cb058df0f7e4f629b20f Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Fri, 1 Nov 2024 14:25:35 +0100 Subject: [PATCH 04/10] new version --- dfpl/__main__.py | 9 +++-- dfpl/autoencoder.py | 25 +++++++++----- dfpl/callbacks.py | 31 +++++++++-------- dfpl/fingerprint.py | 5 ++- dfpl/options.py | 68 +++++++++++++++++++++----------------- dfpl/plot.py | 46 ++++++++++++++++++++------ dfpl/predictions.py | 12 ++++--- dfpl/single_label_model.py | 49 +++++++++++++-------------- dfpl/utils.py | 16 +++++---- dfpl/vae.py | 29 +++++++++++----- example/predictgnn.json | 2 +- 11 files changed, 175 insertions(+), 117 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 3f2ecd8a..4d13abc5 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -5,6 +5,7 @@ from os import path import chemprop as cp + from dfpl import autoencoder as ac from dfpl import feedforwardNN as fNN from dfpl import fingerprint as fp @@ -23,7 +24,7 @@ def traindmpnn(opts: options.GnnOptions) -> None: - None """ # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson(jsonFile = opts.configFile) + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") mean_score, std_score = cp.train.cross_validate( @@ -41,7 +42,7 @@ def predictdmpnn(opts: options.GnnOptions) -> None: - None """ # Load options and additional arguments from a JSON file - arguments = createArgsFromJson(jsonFile = opts.configFile) + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) cp.train.make_predictions(args=opts) @@ -86,7 +87,9 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - if opts.visualizeLatent: ## visualize latent space only if you train the autoencoder + if ( + opts.visualizeLatent + ): ## visualize latent space only if you train the autoencoder ac.visualize_fingerprints( df, comressed_col="fpcompressed", diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 083ba325..874b1aa0 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -25,7 +25,7 @@ def create_dense_layer(inputs, units, activation): return Dense( units=units, activation=activation, - kernel_initializer="lecun_normal" if activation == "selu" else 'glorot_uniform' + kernel_initializer="lecun_normal" if activation == "selu" else "glorot_uniform", )(inputs) @@ -49,7 +49,9 @@ def define_ac_model(opts: options.Options) -> Tuple[Model, Model]: input_vec = Input(shape=(input_size,)) initial_layer_size = int(input_size / 2) - encoded = create_dense_layer(input_vec, initial_layer_size, opts.aeActivationFunction) + encoded = create_dense_layer( + input_vec, initial_layer_size, opts.aeActivationFunction + ) # Start `layer_sizes` with the initial layer size (1024) layer_sizes = [initial_layer_size] @@ -81,9 +83,9 @@ def define_ac_model(opts: options.Options) -> Tuple[Model, Model]: return autoencoder, encoder - -def setup_train_test_split(df: pd.DataFrame, opts: options.Options) -> Tuple[ - np.ndarray, np.ndarray, np.ndarray, np.ndarray]: +def setup_train_test_split( + df: pd.DataFrame, opts: options.Options +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ Sets up the training and test split based on the provided options. @@ -99,7 +101,9 @@ def setup_train_test_split(df: pd.DataFrame, opts: options.Options) -> Tuple[ copy=settings.numpy_copy_values, ) - logging.info(f"Setting up train/test split on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}") + logging.info( + f"Setting up train/test split on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" + ) # Validate test size assert 0.0 <= opts.testSize <= 0.5 @@ -132,8 +136,12 @@ def setup_train_test_split(df: pd.DataFrame, opts: options.Options) -> Tuple[ dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) - train_indices = df[df.index.isin(train_data[train_data["fp"].notnull()].index)].index.to_numpy() - test_indices = df[df.index.isin(test_data[test_data["fp"].notnull()].index)].index.to_numpy() + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() elif opts.aeSplitType == "molecular_weight": logging.info("Using molecular weight split for training.") @@ -214,7 +222,6 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: # Save the encoder weights encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) - return encoder, train_indices, test_indices diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 4cabc230..c3ccb338 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -23,24 +23,24 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: target = "loss" # enable this checkpoint to restore the weights of the best performing model checkpoint = ModelCheckpoint( - checkpoint_path, - monitor=target, - mode="min", - verbose=1, - save_best_only=True, - save_weights_only=True, - period=settings.ac_train_check_period + checkpoint_path, + monitor=target, + mode="min", + verbose=1, + save_best_only=True, + save_weights_only=True, + period=settings.ac_train_check_period, ) callbacks.append(checkpoint) # enable early stopping if val_loss is not improving anymore early_stop = EarlyStopping( - monitor=target, - mode="min", - patience=settings.ac_train_patience, - min_delta=settings.ac_train_min_delta, - verbose=1, - restore_best_weights=True, + monitor=target, + mode="min", + patience=settings.ac_train_patience, + min_delta=settings.ac_train_min_delta, + verbose=1, + restore_best_weights=True, ) callbacks.append(early_stop) if opts.aeWabTracking: @@ -68,8 +68,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: monitor="val_loss", mode="min", save_weights_only=True, - period=settings.nn_train_check_period - + period=settings.nn_train_check_period, ) callbacks.append(checkpoint) @@ -80,7 +79,7 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: mode="min", min_delta=settings.nn_train_min_delta, verbose=1, - restore_best_weights=True + restore_best_weights=True, ) callbacks.append(early_stop) if opts.wabTracking: diff --git a/dfpl/fingerprint.py b/dfpl/fingerprint.py index e2016896..0149d62d 100644 --- a/dfpl/fingerprint.py +++ b/dfpl/fingerprint.py @@ -18,7 +18,7 @@ default_fp_size = 2048 -def addFPColumn(data_frame: pd.DataFrame,fp_size: int) -> pd.DataFrame: +def addFPColumn(data_frame: pd.DataFrame, fp_size: int) -> pd.DataFrame: """ Adds a fingerprint to each row in the dataframe. This function works on parallel chunks of the original dataframe. @@ -47,7 +47,6 @@ def smile2fp(smile: str) -> Any: except Exception: return None - def inchi2fp(inchi: str) -> Any: """ Calculates one fingerprint from InChI @@ -122,7 +121,7 @@ def importDstoxTSV(tsvfilename: str) -> pd.DataFrame: conversion_rules = { "S_dataset.csv": importSmilesCSV, "smiles.csv": importSmilesCSV, - "inchi.tsv": importDstoxTSV + "inchi.tsv": importDstoxTSV, } diff --git a/dfpl/options.py b/dfpl/options.py index 76ee3432..05068156 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -4,6 +4,7 @@ from dataclasses import dataclass from pathlib import Path from typing import Optional + import jsonpickle import torch from chemprop.args import TrainArgs @@ -58,7 +59,7 @@ class Options: l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - visualizeLatent: bool = False #only if autoencoder is trained or loaded + visualizeLatent: bool = False # only if autoencoder is trained or loaded gpu: int = None aeWabTracking: bool = False # Wand & Biases autoencoder tracking wabTracking: bool = False # Wand & Biases FNN tracking @@ -183,7 +184,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default="tests/data/smiles.csv" + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -192,7 +193,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default="example/results_train/" + default="example/results_train/", ) # TODO CHECK WHAT IS TYPE DOING? @@ -203,7 +204,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default="fp" + default="fp", ) general_args.add_argument( "-thr", @@ -211,7 +212,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=0.5 + default=0.5, ) general_args.add_argument( "-gpu", @@ -219,7 +220,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Select which gpu to use by index. If not available, leave empty", - default=None + default=None, ) general_args.add_argument( "--fpType", @@ -227,25 +228,25 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["topological", "MACCS"], help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", - default="topological" + default="topological", ) general_args.add_argument( "--fpSize", type=int, help="Length of the fingerprint that should be generated.", - default=2048 + default=2048, ) general_args.add_argument( "--compressFeatures", action="store_true", help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", - default=False + default=False, ) general_args.add_argument( "--enableMultiLabel", action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=False + default=False, ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -254,14 +255,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default="" + default="", ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default="example/results_train/AE_encoder/" + default="example/results_train/AE_encoder/", ) autoencoder_args.add_argument( "--aeType", @@ -269,21 +270,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default="deterministic" + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=100 + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=512 + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -291,21 +292,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default="relu" + default="relu", ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=0.001 + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=0.96 + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", @@ -313,7 +314,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default="random" + default="random", ) autoencoder_args.add_argument( "-d", @@ -321,20 +322,20 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=256 + default=256, ) autoencoder_args.add_argument( "--visualizeLatent", action="store_true", help="UMAP the latent space for exploration", - default=False + default=False, ) autoencoder_args.add_argument( "--vaeBeta", metavar="FLOAT", type=float, help="Beta parameter for the Beta VAE", - default=1.0 + default=1.0, ) # Training Configuration training_args.add_argument( @@ -343,14 +344,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default="random" + default="random", ) training_args.add_argument( "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=0.2 + default=0.2, ) training_args.add_argument( "-K", @@ -358,7 +359,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=1 + default=1, ) training_args.add_argument( "-v", @@ -505,7 +506,7 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: "--configFile", metavar="FILE", type=str, - help="Input JSON file that contains all information for training/predicting." + help="Input JSON file that contains all information for training/predicting.", ) files_args.add_argument( "-i", @@ -584,11 +585,16 @@ def parseInputPredict(parser: argparse.ArgumentParser) -> None: general_args.add_argument( "-c", "--compressFeatures", action="store_true", default=False ) - (general_args.add_argument( - "--aeType", metavar="STRING", type=str, - choices=["variational", "deterministic"], - help="Autoencoder type, variational or deterministic.", - default="deterministic")) + ( + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) + ) def parseTrainGnn(parser: argparse.ArgumentParser) -> None: diff --git a/dfpl/plot.py b/dfpl/plot.py index 75c7fce4..84802023 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -4,7 +4,8 @@ import numpy as np import wandb from matplotlib.axes import Axes -from sklearn.metrics import precision_recall_curve, auc +from sklearn.metrics import auc, precision_recall_curve + # for NN model functions from tensorflow.python.keras.callbacks import History @@ -44,6 +45,7 @@ def get_max_training_accuracy(history: History) -> str: y_max: float = max(training) return "Max training accuracy ≈ " + str(round(y_max, 3) * 100) + "%" + def smooth_curve(points: np.ndarray, factor: float = 0.8) -> List[float]: smoothed_points: List[float] = [] for point in points: @@ -54,6 +56,7 @@ def smooth_curve(points: np.ndarray, factor: float = 0.8) -> List[float]: smoothed_points.append(point) return smoothed_points + # Plot the accuracy and loss data with enhanced visuals def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None: if which_graph == "balanced_acc": @@ -72,12 +75,20 @@ def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None: # Plot training and validation data with styles ax.plot(epochs, train, color="dodgerblue", linewidth=2, label=f"Training {label}") - ax.plot(epochs, valid, color="green", linestyle="--", linewidth=2, label=f"Validation {label}") + ax.plot( + epochs, + valid, + color="green", + linestyle="--", + linewidth=2, + label=f"Validation {label}", + ) ax.set_ylabel(label) ax.legend(loc="best") ax.spines["top"].set_visible(False) ax.spines["right"].set_visible(False) + def plot_history(history: History, file: str) -> None: fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex="all") @@ -90,8 +101,16 @@ def plot_history(history: History, file: str) -> None: plt.savefig(fname=file, format="svg") plt.close() + # Enhanced AUC plot -def plot_auc(fpr: np.ndarray, tpr: np.ndarray, auc_value: float, target: str, filename: str, wandb_logging: bool = False) -> None: +def plot_auc( + fpr: np.ndarray, + tpr: np.ndarray, + auc_value: float, + target: str, + filename: str, + wandb_logging: bool = False, +) -> None: plt.figure(figsize=(8, 6)) plt.plot([0, 1], [0, 1], "k--", linewidth=1) plt.plot(fpr, tpr, color="darkorange", linewidth=2, label=f"AUC = {auc_value:.3f}") @@ -105,12 +124,13 @@ def plot_auc(fpr: np.ndarray, tpr: np.ndarray, auc_value: float, target: str, fi wandb.log({"roc_plot": plt}) plt.close() + def plot_prc( - y_true: np.ndarray, - y_scores: np.ndarray, - target: str, - filename: str, - wandb_logging: bool = False + y_true: np.ndarray, + y_scores: np.ndarray, + target: str, + filename: str, + wandb_logging: bool = False, ) -> None: """ Plot the Precision-Recall Curve (PRC) with AUC. @@ -128,7 +148,13 @@ def plot_prc( # Plot PRC curve plt.figure(figsize=(8, 6)) - plt.plot(recall, precision, color="purple", linewidth=2, label=f"PRC-AUC = {prc_auc_value:.3f}") + plt.plot( + recall, + precision, + color="purple", + linewidth=2, + label=f"PRC-AUC = {prc_auc_value:.3f}", + ) plt.xlabel("Recall") plt.ylabel("Precision") plt.title(f"Precision-Recall Curve - {target}") @@ -139,4 +165,4 @@ def plot_prc( plt.savefig(fname=filename, format="png") if wandb_logging: wandb.log({"prc_plot": plt}) - plt.close() \ No newline at end of file + plt.close() diff --git a/dfpl/predictions.py b/dfpl/predictions.py index ef1000d8..90a0c3bd 100644 --- a/dfpl/predictions.py +++ b/dfpl/predictions.py @@ -1,5 +1,6 @@ import logging import os + import numpy as np import pandas as pd @@ -27,18 +28,21 @@ def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: # Prepare the feature matrix for prediction x = np.array( sub_df[feature_column].to_list(), - dtype=settings.nn_fp_compressed_numpy_type if opts.compressFeatures else settings.nn_fp_numpy_type, - copy=settings.numpy_copy_values + dtype=settings.nn_fp_compressed_numpy_type + if opts.compressFeatures + else settings.nn_fp_numpy_type, + copy=settings.numpy_copy_values, ) logging.info( - f"{'Compressed' if opts.compressFeatures else 'Uncompressed'} FP matrix with shape {x.shape} and type {x.dtype}") + f"{'Compressed' if opts.compressFeatures else 'Uncompressed'} FP matrix with shape {x.shape} and type {x.dtype}" + ) # Define the model architecture based on the feature size feature_input_size = x.shape[1] model = sl.define_single_label_model(input_size=feature_input_size, opts=opts) # Load the model weights - weights_path = os.path.join(opts.fnnModelDir, 'model_weights.hdf5') + weights_path = os.path.join(opts.fnnModelDir, "model_weights.hdf5") model.load_weights(weights_path) logging.info(f"Model weights loaded from {weights_path}") diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 0f64ace2..d1e707a7 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -420,7 +420,7 @@ def evaluate_model( y_scores=y_predict, target=target, filename=f"{file_prefix}/prc.png", - wandb_logging=False + wandb_logging=False, ) # Add balanced accuracy to the computed metrics prf.to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.prec_rec_f1.csv") @@ -510,13 +510,9 @@ def fit_and_evaluate_model( # Define file name prefix for saving models if fold > 1: - model_file_prefix = path.join( - "tmp", f"{target_suffix}/fold-{fold}" - ) + model_file_prefix = path.join("tmp", f"{target_suffix}/fold-{fold}") else: - model_file_prefix = path.join( - opts.outputDir, target_suffix - ) + model_file_prefix = path.join(opts.outputDir, target_suffix) # Compute class imbalance ids, counts = np.unique(y_train, return_counts=True) @@ -663,8 +659,9 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize, random_state=1 ) - x_val , x_test, y_val, y_test = train_test_split( - x_test,y_test, stratify=y_test, test_size=0.5, random_state=1) + x_val, x_test, y_val, y_test = train_test_split( + x_test, y_test, stratify=y_test, test_size=0.5, random_state=1 + ) logging.info( f"Splitting train/test data with fixed random initializer" ) @@ -672,8 +669,9 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize ) - x_val , x_test, y_val, y_test = train_test_split( - x_test,y_test, stratify=y_test, test_size=0.5) + x_val, x_test, y_val, y_test = train_test_split( + x_test, y_test, stratify=y_test, test_size=0.5 + ) performance = fit_and_evaluate_model( x_train=x_train, @@ -702,7 +700,10 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # Further split test set into val and test set x_val, x_test, y_val, y_test = train_test_split( - x_test, y_test, test_size=0.5, stratify=y_test, + x_test, + y_test, + test_size=0.5, + stratify=y_test, ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -763,7 +764,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # Optionally, clean up the temporary directory shutil.rmtree("tmp/") - else: logging.info( "Your selected number of folds for Cross validation is out of range. " @@ -792,12 +792,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: if opts.kFolds == 1: train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize, opts.testSize/2, opts.testSize/2), + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), balanced=False, seed=42, ) x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( - df_task, target, train_set,val_set, test_set, opts + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -812,7 +812,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: group=f"{target}", ) - performance= fit_and_evaluate_model( + performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, x_val=x_val, @@ -830,12 +830,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: print(f"Splitting data with seed {fold_no}") train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize,opts.testSize/2, opts.testSize/2), + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), balanced=True, seed=fold_no, ) - x_train, y_train, x_test, y_test,x_val, y_val = get_x_y( - df_task, target, train_set,val_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -886,7 +886,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # Optionally, clean up the temporary directory shutil.rmtree("tmp/") - else: logging.info( "Your selected number of folds for Cross validation is out of range. " @@ -912,10 +911,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: df_task.reset_index(drop=True, inplace=True) if opts.kFolds == 1: train_set, val_set, test_set = weight_split( - df_task, bias="small", sizes=(1 - opts.testSize, opts.testSize/2, opts.testSize/2) + df_task, + bias="small", + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), ) x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( - df_task, target, train_set,val_set, test_set, opts + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -932,8 +933,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, - x_val = x_val, - y_val = y_val, + x_val=x_val, + y_val=y_val, y_train=y_train, y_test=y_test, fold=0, diff --git a/dfpl/utils.py b/dfpl/utils.py index 29540364..aa8d35ed 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,18 +1,16 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random -from typing import Dict, List, Set, Tuple, Union, Type, TypeVar, Any +from typing import Any, Dict, List, Set, Tuple, Type, TypeVar, Union -# Define a type variable - -from pathlib import Path -import argparse import jsonpickle -import sys import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -20,6 +18,9 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") T = TypeVar("T") @@ -36,7 +37,7 @@ def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: An instance of cls populated with values from the command-line arguments. """ # Extract argument flags from sys.argv - arg_flags = {arg.lstrip('-') for arg in sys.argv if arg.startswith('-')} + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} # Create the result instance, which will be modified and returned result = cls() @@ -60,6 +61,7 @@ def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: return result + def makePathAbsolute(p: str) -> str: path = pathlib.Path(p) if path.is_absolute(): diff --git a/dfpl/vae.py b/dfpl/vae.py index d8381efb..fdc187b7 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -18,8 +18,9 @@ from dfpl import callbacks from dfpl import history as ht from dfpl import options, settings +from dfpl.autoencoder import create_dense_layer, setup_train_test_split from dfpl.utils import ae_scaffold_split, weight_split -from dfpl.autoencoder import setup_train_test_split, create_dense_layer + disable_eager_execution() @@ -43,7 +44,9 @@ def define_vae_model(opts: options.Options) -> Tuple[Model, Model]: vae_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) input_vec = Input(shape=(input_size,)) initial_layer_size = int(input_size / 2) - encoded = create_dense_layer(input_vec, initial_layer_size, opts.aeActivationFunction) + encoded = create_dense_layer( + input_vec, initial_layer_size, opts.aeActivationFunction + ) # Start `layer_sizes` with the initial layer size (1024) layer_sizes = [initial_layer_size] @@ -53,10 +56,10 @@ def define_vae_model(opts: options.Options) -> Tuple[Model, Model]: for i in range(1, hidden_layer_count - 1): layer_size = int(input_size / (2 ** (i + 1))) layer_sizes.append(layer_size) - encoded = create_dense_layer(encoded,layer_size, opts.aeActivationFunction) + encoded = create_dense_layer(encoded, layer_size, opts.aeActivationFunction) # Latent space layers - z_mean = create_dense_layer (encoded, encoding_dim, opts.aeActivationFunction) + z_mean = create_dense_layer(encoded, encoding_dim, opts.aeActivationFunction) z_log_var = create_dense_layer(encoded, encoding_dim, opts.aeActivationFunction) # Sampling layer @@ -80,9 +83,12 @@ def sampling(args): # Define VAE and encoder models vae = Model(input_vec, decoded) encoder = Model(input_vec, z_mean) + # Define custom loss def kl_loss(z_mean, z_log_var): - return -0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1) + return -0.5 * K.sum( + 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 + ) def bce_loss(y_true, y_pred): return metrics.binary_crossentropy(y_true, y_pred) @@ -96,8 +102,9 @@ def vae_loss(y_true, y_pred): return vae, encoder - -def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Tuple[Model, np.ndarray, np.ndarray]: +def train_full_vae( + df: pd.DataFrame, opts: options.Options +) -> Tuple[Model, np.ndarray, np.ndarray]: """ Trains a VAE on the provided data and returns the trained encoder and split indices. @@ -120,8 +127,12 @@ def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Tuple[Model, np.n # Set up callbacks and train the VAE model callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) vae_hist = vae.fit( - x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, - verbose=opts.verbose, callbacks=[callback_list], + x_train, + x_train, + epochs=opts.aeEpochs, + batch_size=opts.aeBatchSize, + verbose=opts.verbose, + callbacks=[callback_list], validation_data=(x_test, x_test) if x_test is not None else None, ) diff --git a/example/predictgnn.json b/example/predictgnn.json index 5bfbcec6..b3c8f6d8 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", "test_path": "tests/data/tox21.csv", - "preds_path": "example/results_gnn.csv", + "preds_path": "preds_dmpnn/DMPNN_preds.csv", "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file From c202c628b486a14462c1960a2214a3334eb37110 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Tue, 5 Nov 2024 12:20:31 +0100 Subject: [PATCH 05/10] black, flaked, isorted --- dfpl/__main__.py | 5 ++--- dfpl/callbacks.py | 1 - dfpl/dfplmodule.py | 4 ++-- dfpl/feedforwardNN.py | 1 - dfpl/plot.py | 1 - dfpl/single_label_model.py | 16 ++++------------ dfpl/utils.py | 2 +- dfpl/vae.py | 11 ++++------- tests/run_predictgnn.py | 2 -- 9 files changed, 13 insertions(+), 30 deletions(-) diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 4d13abc5..a38215a8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -87,9 +87,8 @@ def train(opts: options.Options): ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - if ( - opts.visualizeLatent - ): ## visualize latent space only if you train the autoencoder + if opts.visualizeLatent: + # visualize latent space only if you train the autoencoder ac.visualize_fingerprints( df, comressed_col="fpcompressed", diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index c3ccb338..93d583e6 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -1,6 +1,5 @@ # for NN model functions from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint - # for testing in Weights & Biases from wandb.keras import WandbCallback diff --git a/dfpl/dfplmodule.py b/dfpl/dfplmodule.py index 67cd3a02..07671aec 100755 --- a/dfpl/dfplmodule.py +++ b/dfpl/dfplmodule.py @@ -15,9 +15,9 @@ from rdkit.Chem.AtomPairs import Pairs, Torsions from sklearn.model_selection import train_test_split from tensorflow.keras import optimizers, regularizers -from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau +from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint, + ReduceLROnPlateau) from tensorflow.keras.layers import Dense, Dropout, Input - # for NN model functions from tensorflow.keras.models import Model, Sequential from tensorflow.keras.optimizers import SGD diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index bf4241aa..3e6a744c 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -12,7 +12,6 @@ from sklearn.model_selection import KFold, train_test_split from tensorflow.keras import metrics, optimizers, regularizers from tensorflow.keras.layers import Dense, Dropout - # for NN model functions from tensorflow.keras.models import Model, Sequential diff --git a/dfpl/plot.py b/dfpl/plot.py index 84802023..c5675da8 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -5,7 +5,6 @@ import wandb from matplotlib.axes import Axes from sklearn.metrics import auc, precision_recall_curve - # for NN model functions from tensorflow.python.keras.callbacks import History diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index d1e707a7..91502680 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -12,21 +12,13 @@ import tensorflow as tf import tensorflow.python.keras.backend as K import wandb -from sklearn.metrics import ( - auc, - classification_report, - confusion_matrix, - matthews_corrcoef, - roc_curve, -) +from sklearn.metrics import (auc, classification_report, confusion_matrix, + matthews_corrcoef, roc_curve) from sklearn.model_selection import StratifiedKFold, train_test_split from tensorflow.keras import metrics, optimizers, regularizers from tensorflow.keras.layers import AlphaDropout, Dense, Dropout -from tensorflow.keras.losses import ( - BinaryCrossentropy, - BinaryFocalCrossentropy, - MeanSquaredError, -) +from tensorflow.keras.losses import (BinaryCrossentropy, + BinaryFocalCrossentropy, MeanSquaredError) from tensorflow.keras.models import Model, Sequential from dfpl import callbacks as cb diff --git a/dfpl/utils.py b/dfpl/utils.py index aa8d35ed..ac6d68de 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -8,7 +8,7 @@ from collections import defaultdict from pathlib import Path from random import Random -from typing import Any, Dict, List, Set, Tuple, Type, TypeVar, Union +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union import jsonpickle import numpy as np diff --git a/dfpl/vae.py b/dfpl/vae.py index fdc187b7..911221cc 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,25 +1,22 @@ -import csv import logging import math import os.path -from typing import Tuple, Union +from typing import Tuple import numpy as np import pandas as pd import tensorflow.keras.metrics as metrics import wandb from keras import backend as K -from sklearn.model_selection import train_test_split -from tensorflow.keras import initializers, optimizers +from tensorflow.keras import optimizers from tensorflow.keras.layers import Dense, Input, Lambda -from tensorflow.keras.models import Model, load_model +from tensorflow.keras.models import Model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks from dfpl import history as ht -from dfpl import options, settings +from dfpl import options from dfpl.autoencoder import create_dense_layer, setup_train_test_split -from dfpl.utils import ae_scaffold_split, weight_split disable_eager_execution() diff --git a/tests/run_predictgnn.py b/tests/run_predictgnn.py index 9c433d80..01d20ec7 100644 --- a/tests/run_predictgnn.py +++ b/tests/run_predictgnn.py @@ -1,8 +1,6 @@ import logging -import os import pathlib -import pandas as pd from chemprop import args, train import dfpl.options as opt From bc61c9df90aa1ce82bcf136b2d0db6bf1d46939f Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Tue, 5 Nov 2024 12:33:04 +0100 Subject: [PATCH 06/10] blacked after isort, fixing dataset path in train.json --- dfpl/callbacks.py | 1 + dfpl/dfplmodule.py | 4 ++-- dfpl/feedforwardNN.py | 1 + dfpl/plot.py | 1 + dfpl/single_label_model.py | 16 ++++++++++++---- example/train.json | 2 +- 6 files changed, 18 insertions(+), 7 deletions(-) diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 93d583e6..c3ccb338 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -1,5 +1,6 @@ # for NN model functions from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint + # for testing in Weights & Biases from wandb.keras import WandbCallback diff --git a/dfpl/dfplmodule.py b/dfpl/dfplmodule.py index 07671aec..67cd3a02 100755 --- a/dfpl/dfplmodule.py +++ b/dfpl/dfplmodule.py @@ -15,9 +15,9 @@ from rdkit.Chem.AtomPairs import Pairs, Torsions from sklearn.model_selection import train_test_split from tensorflow.keras import optimizers, regularizers -from tensorflow.keras.callbacks import (EarlyStopping, ModelCheckpoint, - ReduceLROnPlateau) +from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau from tensorflow.keras.layers import Dense, Dropout, Input + # for NN model functions from tensorflow.keras.models import Model, Sequential from tensorflow.keras.optimizers import SGD diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index 3e6a744c..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -12,6 +12,7 @@ from sklearn.model_selection import KFold, train_test_split from tensorflow.keras import metrics, optimizers, regularizers from tensorflow.keras.layers import Dense, Dropout + # for NN model functions from tensorflow.keras.models import Model, Sequential diff --git a/dfpl/plot.py b/dfpl/plot.py index c5675da8..84802023 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -5,6 +5,7 @@ import wandb from matplotlib.axes import Axes from sklearn.metrics import auc, precision_recall_curve + # for NN model functions from tensorflow.python.keras.callbacks import History diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 91502680..d1e707a7 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -12,13 +12,21 @@ import tensorflow as tf import tensorflow.python.keras.backend as K import wandb -from sklearn.metrics import (auc, classification_report, confusion_matrix, - matthews_corrcoef, roc_curve) +from sklearn.metrics import ( + auc, + classification_report, + confusion_matrix, + matthews_corrcoef, + roc_curve, +) from sklearn.model_selection import StratifiedKFold, train_test_split from tensorflow.keras import metrics, optimizers, regularizers from tensorflow.keras.layers import AlphaDropout, Dense, Dropout -from tensorflow.keras.losses import (BinaryCrossentropy, - BinaryFocalCrossentropy, MeanSquaredError) +from tensorflow.keras.losses import ( + BinaryCrossentropy, + BinaryFocalCrossentropy, + MeanSquaredError, +) from tensorflow.keras.models import Model, Sequential from dfpl import callbacks as cb diff --git a/example/train.json b/example/train.json index 79f06d1a..fcd51747 100755 --- a/example/train.json +++ b/example/train.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/clintox.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/vae", "ecWeightsFile": "", From 0944b180885945092fe5cf9e0fdd1156d31174de Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Tue, 5 Nov 2024 12:41:34 +0100 Subject: [PATCH 07/10] fixing dataset path in train.json --- example/train.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example/train.json b/example/train.json index fcd51747..119e2bf6 100755 --- a/example/train.json +++ b/example/train.json @@ -1,6 +1,6 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/tox21.csv", + "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", "ecModelDir": "example/results_train/vae", "ecWeightsFile": "", From 1b3522ebb6e1ca131484330a91ac24f38aa0f1cf Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Nov 2024 14:14:57 +0100 Subject: [PATCH 08/10] synced env and removed compressed suffix from saving --- dfpl/single_label_model.py | 5 ++--- environment.yml | 17 ++++++----------- example/train.json | 4 ++-- 3 files changed, 10 insertions(+), 16 deletions(-) diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index d1e707a7..c8d4a9eb 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -506,13 +506,12 @@ def fit_and_evaluate_model( ) -> pd.DataFrame: # Print info about training logging.info(f"Training of fold number: {fold}") - target_suffix = f"{target}_compressed" if opts.compressFeatures else target # Define file name prefix for saving models if fold > 1: - model_file_prefix = path.join("tmp", f"{target_suffix}/fold-{fold}") + model_file_prefix = path.join("tmp", f"{target}/fold-{fold}") else: - model_file_prefix = path.join(opts.outputDir, target_suffix) + model_file_prefix = path.join(opts.outputDir, target) # Compute class imbalance ids, counts = np.unique(y_train, return_counts=True) diff --git a/environment.yml b/environment.yml index 3c2e7a6c..164db6bc 100644 --- a/environment.yml +++ b/environment.yml @@ -3,21 +3,16 @@ channels: - conda-forge - defaults dependencies: - # dev requirements - - conda-build=3.21.8 - - conda=4.12.0 - - pip=22.0.4 - - pytest=7.1.1 # application requirements - jsonpickle=2.1 - matplotlib=3.5.1 - - numpy=1.19.5 + - numpy=1.22.0 - pandas=1.4.2 - rdkit=2022.03.1 - scikit-learn=1.0.2 + - keras=2.9.0 + - tensorflow-gpu=2.9.3 + - wandb=0.12.0 + - umap-learn=0.1.1 - seaborn=0.12.2 - - tensorflow-gpu=2.6.0 - - wandb=0.12 - - umap=0.1.1 - - pip: - - git+https://github.com/soulios/chemprop.git@1d73523e49aa28a90b74edc04aaf45d7e124e338 \ No newline at end of file + - chemprop=1.7.1 \ No newline at end of file diff --git a/example/train.json b/example/train.json index 119e2bf6..8f811df3 100755 --- a/example/train.json +++ b/example/train.json @@ -34,8 +34,8 @@ "testSize": 0.2, "fnnType": "FNN", "optimizer": "Adam", - "lossFunction": "focal", - "epochs": 50, + "lossFunction": "bce", + "epochs": 11, "batchSize": 128, "activationFunction": "selu", "dropout": 0.0107, From 82997affa3c0302fdbc025a5d080a496b64098e7 Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Nov 2024 14:18:53 +0100 Subject: [PATCH 09/10] make sure to create path before saving --- dfpl/single_label_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index c8d4a9eb..28a5cd48 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -531,7 +531,9 @@ def fit_and_evaluate_model( ) # Define checkpoint to save model weights during training - checkpoint_model_weights_path = f"{model_file_prefix}/model_weights.hdf5" + checkpoint_model_weights_path = os.path.join(model_file_prefix, "model_weights.hdf5") + + # Define callbacks callback_list = cb.nn_callback( checkpoint_path=checkpoint_model_weights_path, opts=opts ) From 4fdb4411039066e4b3f52ecb54de1f778ae6197c Mon Sep 17 00:00:00 2001 From: Kyriakos Soulios Date: Wed, 6 Nov 2024 14:31:31 +0100 Subject: [PATCH 10/10] applied black --- dfpl/single_label_model.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 28a5cd48..41c083a9 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -531,7 +531,9 @@ def fit_and_evaluate_model( ) # Define checkpoint to save model weights during training - checkpoint_model_weights_path = os.path.join(model_file_prefix, "model_weights.hdf5") + checkpoint_model_weights_path = os.path.join( + model_file_prefix, "model_weights.hdf5" + ) # Define callbacks callback_list = cb.nn_callback(