diff --git a/dfpl/__main__.py b/dfpl/__main__.py index 7896d451..a38215a8 100755 --- a/dfpl/__main__.py +++ b/dfpl/__main__.py @@ -1,13 +1,10 @@ import dataclasses import logging import os.path -import pathlib from argparse import Namespace from os import path import chemprop as cp -import pandas as pd -from keras.models import load_model from dfpl import autoencoder as ac from dfpl import feedforwardNN as fNN @@ -17,43 +14,8 @@ from dfpl import vae as vae from dfpl.utils import createArgsFromJson, createDirectory, makePathAbsolute -project_directory = pathlib.Path(".").parent.parent.absolute() -test_train_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - ecWeightsFile=f"{project_directory}/output_data/case_00/AE_S/ae_S.encoder.hdf5", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - type="smiles", - fpType="topological", - epochs=100, - batchSize=1024, - fpSize=2048, - encFPSize=256, - enableMultiLabel=False, - testSize=0.2, - kFolds=2, - verbose=2, - trainAC=False, - trainFNN=True, - compressFeatures=True, - activationFunction="selu", - lossFunction="bce", - optimizer="Adam", - fnnType="FNN", -) -test_pred_opts = options.Options( - inputFile=f"{project_directory}/input_datasets/S_dataset.pkl", - outputDir=f"{project_directory}/output_data/console_test", - outputFile=f"{project_directory}/output_data/console_test/S_dataset.predictions_ER.csv", - ecModelDir=f"{project_directory}/output_data/case_00/AE_S/saved_model", - fnnModelDir=f"{project_directory}/output_data/console_test/ER_saved_model", - type="smiles", - fpType="topological", -) - - -def traindmpnn(opts: options.GnnOptions): +def traindmpnn(opts: options.GnnOptions) -> None: """ Train a D-MPNN model using the given options. Args: @@ -61,54 +23,29 @@ def traindmpnn(opts: options.GnnOptions): Returns: - None """ - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - ignore_elements = ["py/object"] # Load options from a JSON file and replace the relevant attributes in `opts` - arguments = createArgsFromJson( - opts.configFile, ignore_elements, return_json_object=False - ) + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.TrainArgs().parse_args(arguments) logging.info("Training DMPNN...") - # Train the model and get the mean and standard deviation of AUC score from cross-validation mean_score, std_score = cp.train.cross_validate( args=opts, train_func=cp.train.run_training ) logging.info(f"Results: {mean_score:.5f} +/- {std_score:.5f}") -def predictdmpnn(opts: options.GnnOptions, json_arg_path: str) -> None: +def predictdmpnn(opts: options.GnnOptions) -> None: """ Predict the values using a trained D-MPNN model with the given options. Args: - opts: options.GnnOptions instance containing the details of the prediction - - JSON_ARG_PATH: path to a JSON file containing additional arguments for prediction Returns: - None """ - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] # Load options and additional arguments from a JSON file - arguments, data = createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True - ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - # Replace relevant attributes in `opts` with loaded options + arguments = createArgsFromJson(jsonFile=opts.configFile) opts = cp.args.PredictArgs().parse_args(arguments) - opts.preds_path = save_dir + "/" + name - df = pd.read_csv(opts.test_path) - smiles = [] - for index, rows in df.iterrows(): - my_list = [rows.smiles] - smiles.append(my_list) - # Make predictions and return the result - cp.train.make_predictions(args=opts, smiles=smiles) + + cp.train.make_predictions(args=opts) def train(opts: options.Options): @@ -116,9 +53,6 @@ def train(opts: options.Options): Run the main training procedure :param opts: Options defining the details of the training """ - - os.environ["CUDA_VISIBLE_DEVICES"] = f"{opts.gpu}" - # import data from file and create DataFrame if "tsv" in opts.inputFile: df = fp.importDataFile( @@ -128,9 +62,9 @@ def train(opts: options.Options): df = fp.importDataFile( opts.inputFile, import_function=fp.importSmilesCSV, fp_size=opts.fpSize ) - # initialize encoders to None + + # initialize (auto)encoders to None encoder = None - autoencoder = None if opts.trainAC: if opts.aeType == "deterministic": encoder, train_indices, test_indices = ac.train_full_ac(df, opts) @@ -142,26 +76,26 @@ def train(opts: options.Options): # if feature compression is enabled if opts.compressFeatures: if not opts.trainAC: - if opts.aeType == "deterministic": - (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - elif opts.aeType == "variational": + if opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - elif opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) else: + (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) + + if opts.ecWeightsFile != "": autoencoder.load_weights( - os.path.join(opts.ecModelDir, opts.ecWeightsFile) + os.path.join(opts.outputDir, opts.ecWeightsFile) ) # compress the fingerprints using the autoencoder df = ac.compress_fingerprints(df, encoder) - # ac.visualize_fingerprints( - # df, - # before_col="fp", - # after_col="fpcompressed", - # train_indices=train_indices, - # test_indices=test_indices, - # save_as=f"UMAP_{opts.aeSplitType}.png", - # ) + if opts.visualizeLatent: + # visualize latent space only if you train the autoencoder + ac.visualize_fingerprints( + df, + comressed_col="fpcompressed", + train_indices=train_indices, + test_indices=test_indices, + save_as=f"UMAP_{opts.aeType}.png", + ) # train single label models if requested if opts.trainFNN and not opts.enableMultiLabel: sl.train_single_label_models(df=df, opts=opts) @@ -190,12 +124,13 @@ def predict(opts: options.Options) -> None: # load trained model for autoencoder if opts.aeType == "deterministic": (autoencoder, encoder) = ac.define_ac_model(opts=options.Options()) - if opts.aeType == "variational": + elif opts.aeType == "variational": (autoencoder, encoder) = vae.define_vae_model(opts=options.Options()) - # Load trained model for autoencoder - if opts.ecWeightsFile == "": - encoder = load_model(opts.ecModelDir) else: + raise ValueError(f"Unknown autoencoder type: {opts.aeType}") + + # Load trained model for autoencoder + if opts.ecWeightsFile != "": encoder.load_weights(os.path.join(opts.ecModelDir, opts.ecWeightsFile)) df = ac.compress_fingerprints(df, encoder) @@ -257,7 +192,6 @@ def main(): raise ValueError("Input directory is not a directory") elif prog_args.method == "traingnn": traingnn_opts = options.GnnOptions.fromCmdArgs(prog_args) - traindmpnn(traingnn_opts) elif prog_args.method == "predictgnn": @@ -267,12 +201,7 @@ def main(): test_path=makePathAbsolute(predictgnn_opts.test_path), preds_path=makePathAbsolute(predictgnn_opts.preds_path), ) - - logging.info( - f"The following arguments are received or filled with default values:\n{prog_args}" - ) - - predictdmpnn(fixed_opts, prog_args.configFile) + predictdmpnn(fixed_opts) elif prog_args.method == "train": train_opts = options.Options.fromCmdArgs(prog_args) @@ -298,8 +227,6 @@ def main(): ), ecModelDir=makePathAbsolute(predict_opts.ecModelDir), fnnModelDir=makePathAbsolute(predict_opts.fnnModelDir), - trainAC=False, - trainFNN=False, ) createDirectory(fixed_opts.outputDir) createLogger(path.join(fixed_opts.outputDir, "predict.log")) diff --git a/dfpl/autoencoder.py b/dfpl/autoencoder.py index 99bf4578..874b1aa0 100644 --- a/dfpl/autoencoder.py +++ b/dfpl/autoencoder.py @@ -1,17 +1,16 @@ import logging import math import os.path -from os.path import basename from typing import Tuple import matplotlib.pyplot as plt import numpy as np import pandas as pd import seaborn as sns -import umap +import umap.umap_ as umap import wandb from sklearn.model_selection import train_test_split -from tensorflow.keras import initializers, losses, optimizers +from tensorflow.keras import losses, optimizers from tensorflow.keras.layers import Dense, Input from tensorflow.keras.models import Model @@ -21,261 +20,175 @@ from dfpl.utils import ae_scaffold_split, weight_split -def define_ac_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def create_dense_layer(inputs, units, activation): + """Create a Dense layer with optional SELU initialization.""" + return Dense( + units=units, + activation=activation, + kernel_initializer="lecun_normal" if activation == "selu" else "glorot_uniform", + )(inputs) + + +def define_ac_model(opts: options.Options) -> Tuple[Model, Model]: """ - This function provides an autoencoder model to reduce a certain input to a compressed version. + Define and compile an autoencoder model with the specified encoding dimension, with a mirrored decoder. - :param opts: Training options that provide values for adjusting the neural net - :param output_bias: Bias used to initialize the last layer. It gives the net a head start in training on - imbalanced data (which the fingerprints are, because they have many more 0's than 1's in them). - :return: a tuple of autoencoder and encoder models + :param opts: Training options containing model parameters. + :param output_bias: Bias for the output layer, used for initializing the last layer. + :return: Tuple containing the autoencoder and encoder models. """ input_size = opts.fpSize - encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay + encoding_dim = opts.encFPSize # Desired encoding dimension + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=math.ceil(7000 / opts.batchSize) * 3, + decay_rate=opts.aeLearningRateDecay, + staircase=True, ) + ac_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) - if output_bias is not None: - output_bias = initializers.Constant(output_bias) + input_vec = Input(shape=(input_size,)) + initial_layer_size = int(input_size / 2) + encoded = create_dense_layer( + input_vec, initial_layer_size, opts.aeActivationFunction + ) - # get the number of meaningful hidden layers (latent space included) - hidden_layer_count = round(math.log2(input_size / encoding_dim)) + # Start `layer_sizes` with the initial layer size (1024) + layer_sizes = [initial_layer_size] - # the input placeholder - input_vec = Input(shape=(input_size,)) + # Build intermediate encoding layers and store their sizes + hidden_layer_count = round(math.log2(input_size / encoding_dim)) + for i in range(1, hidden_layer_count): + factor_units = 2 ** (i + 1) + layer_size = int(input_size / factor_units) + layer_sizes.append(layer_size) + encoded = create_dense_layer(encoded, layer_size, opts.aeActivationFunction) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / 2), activation=opts.aeActivationFunction - )(input_vec) - else: - encoded = Dense( - units=int(input_size / 2), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(input_vec) - - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # 1st decoding layer - factor_units = 2 ** (hidden_layer_count - 1) - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - # to either 0 or 1 and hence we use sigmoid activation function. - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) + # Build decoder layers in exact reverse order, excluding the first layer size + decoded = encoded + for layer_size in reversed(layer_sizes[:-1]): + decoded = Dense(units=layer_size, activation=opts.aeActivationFunction)(decoded) - else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + # Final output layer to reconstruct input + decoded = Dense(units=input_size, activation="sigmoid")(decoded) + # Define autoencoder and encoder models autoencoder = Model(input_vec, decoded) encoder = Model(input_vec, encoded) + + # Compile autoencoder + autoencoder.compile(optimizer=ac_optimizer, loss=losses.BinaryCrossentropy()) autoencoder.summary(print_fn=logging.info) - autoencoder.compile( - optimizer=ac_optimizer, - loss=losses.BinaryCrossentropy(), - # metrics=[ - # metrics.AUC(), - # metrics.Precision(), - # metrics.Recall() - # ] - ) return autoencoder, encoder -def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: +def setup_train_test_split( + df: pd.DataFrame, opts: options.Options +) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: """ - Trains an autoencoder on the given feature matrix X. The response matrix is only used to - split the data into meaningful test and train sets. + Sets up the training and test split based on the provided options. :param opts: Command line arguments as defined in options.py :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder - :return: The encoder model of the trained autoencoder + :return: Tuple containing training data, test data, training indices, test indices, and initial bias. """ - # If wandb tracking is enabled for autoencoder weights but not for the main program, initialize a new wandb run - if opts.aeWabTracking and not opts.wabTracking: - wandb.init(project=f"AE_{opts.aeSplitType}") - - # Define output files for autoencoder and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No AE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] + opts.aeSplitType - ) - logging.info( - f"(auto)encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"AE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - ac_weights_file = os.path.join( - opts.outputDir, base_file_name + ".autoencoder.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) - - # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=ac_weights_file, opts=opts - ) - # Select all fingerprints that are valid and turn them into a numpy array fp_matrix = np.array( df[df["fp"].notnull()]["fp"].to_list(), dtype=settings.ac_fp_numpy_type, copy=settings.numpy_copy_values, ) + logging.info( - f"Training AC on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" + f"Setting up train/test split on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" ) - # When training the final AE, we don't want any test data. We want to train it on all available fingerprints. + # Validate test size assert 0.0 <= opts.testSize <= 0.5 + initial_indices = np.arange(fp_matrix.shape[0]) + if opts.aeSplitType == "random": - logging.info("Training autoencoder using random split") - initial_indices = np.arange(fp_matrix.shape[0]) + logging.info("Using random split for training.") if opts.testSize > 0.0: - # Split data into test and training data - if opts.aeWabTracking: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train, x_test, train_indices, test_indices = train_test_split( - fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "scaffold_balanced": - logging.info("Training autoencoder using scaffold split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - train_data, val_data, test_data = ae_scaffold_split( - df, - sizes=(1 - opts.testSize, 0.0, opts.testSize), - balanced=True, - seed=42, - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, + x_train, x_test, train_indices, test_indices = train_test_split( + fp_matrix, initial_indices, test_size=opts.testSize, random_state=42 ) - train_indices = df[ - df.index.isin(train_data[train_data["fp"].notnull()].index) - ].index.to_numpy() - test_indices = df[ - df.index.isin(test_data[test_data["fp"].notnull()].index) - ].index.to_numpy() else: x_train = fp_matrix x_test = None + train_indices = initial_indices + test_indices = None + + elif opts.aeSplitType == "scaffold_balanced": + logging.info("Using scaffold split for training.") + train_data, val_data, test_data = ae_scaffold_split( + df, sizes=(1 - opts.testSize, 0.0, opts.testSize), balanced=True, seed=42 + ) + x_train = np.array( + train_data[train_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + x_test = np.array( + test_data[test_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + train_indices = df[ + df.index.isin(train_data[train_data["fp"].notnull()].index) + ].index.to_numpy() + test_indices = df[ + df.index.isin(test_data[test_data["fp"].notnull()].index) + ].index.to_numpy() + elif opts.aeSplitType == "molecular_weight": - logging.info("Training autoencoder using molecular weight split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - train_data, val_data, test_data = weight_split( - df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - df_sorted = df.sort_values(by="mol_weight", ascending=True) - # Get the sorted indices from the sorted DataFrame - sorted_indices = df_sorted.index.to_numpy() - - # Find the corresponding indices for train_data, val_data, and test_data in the sorted DataFrame - train_indices = sorted_indices[df.index.isin(train_data.index)] - # val_indices = sorted_indices[df.index.isin(val_data.index)] - test_indices = sorted_indices[df.index.isin(test_data.index)] - else: - x_train = fp_matrix - x_test = None - else: - raise ValueError(f"Invalid split type: {opts.split_type}") - - # Calculate the initial bias aka the log ratio between 1's and 0'1 in all fingerprints - ids, counts = np.unique(x_train.flatten(), return_counts=True) - count_dict = dict(zip(ids, counts)) - if count_dict[0] == 0: - initial_bias = None - logging.info("No zeroes in training labels. Setting initial_bias to None.") + logging.info("Using molecular weight split for training.") + train_data, val_data, test_data = weight_split( + df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" + ) + x_train = np.array( + train_data[train_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + x_test = np.array( + test_data[test_data["fp"].notnull()]["fp"].to_list(), + dtype=settings.ac_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + df_sorted = df.sort_values(by="mol_weight", ascending=True) + sorted_indices = df_sorted.index.to_numpy() + train_indices = sorted_indices[df.index.isin(train_data.index)] + test_indices = sorted_indices[df.index.isin(test_data.index)] + else: - initial_bias = np.log([count_dict[1] / count_dict[0]]) - logging.info(f"Initial bias for last sigmoid layer: {initial_bias[0]}") + raise ValueError(f"Invalid split type: {opts.aeSplitType}") + + return x_train, x_test, train_indices, test_indices - # Check if we're doing training/testing mode or full training mode + +def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: + """ + Trains an autoencoder on the given feature matrix X. The response matrix is only used to + split the data into meaningful test and train sets. + + :param opts: Command line arguments as defined in options.py + :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder + :return: The encoder model of the trained autoencoder + """ + + # If wandb tracking is enabled for autoencoder weights but not for the main program, initialize a new wandb run + if opts.aeWabTracking: + wandb.init(project=f"AE_{opts.aeSplitType}") + + save_path = os.path.join(opts.ecModelDir, f"autoencoder_weights.h5") + + # Set up train/test split + x_train, x_test, train_indices, test_indices = setup_train_test_split(df, opts) + + # Log training mode if opts.testSize > 0.0: logging.info(f"AE training/testing mode with train- and test-samples") logging.info(f"AC train data shape {x_train.shape} with type {x_train.dtype}") @@ -284,35 +197,31 @@ def train_full_ac(df: pd.DataFrame, opts: options.Options) -> Model: logging.info(f"AE full train mode without test-samples") logging.info(f"AC train data shape {x_train.shape} with type {x_train.dtype}") - # Set up the model of the AC w.r.t. the input size and the dimension of the bottle neck (z!) - (autoencoder, encoder) = define_ac_model(opts, output_bias=initial_bias) + # Set up the model of the AC + (autoencoder, encoder) = define_ac_model(opts) + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) # Train the autoencoder on the training data auto_hist = autoencoder.fit( x_train, x_train, - callbacks=callback_list, + callbacks=[callback_list], epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, ) - logging.info(f"Autoencoder weights stored in file: {ac_weights_file}") # Store the autoencoder training history and plot the metrics ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".AC"), + base_file_name=save_path, hist=auto_hist, ) + # load the model with the best weights + autoencoder.load_weights(save_path) + # Save the encoder weights + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) - # Save the autoencoder callback model to disk - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_autoencoder") - if opts.testSize > 0.0: - (callback_autoencoder, callback_encoder) = define_ac_model(opts) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - # Return the encoder model of the trained autoencoder return encoder, train_indices, test_indices @@ -343,14 +252,13 @@ def compress_fingerprints(dataframe: pd.DataFrame, encoder: Model) -> pd.DataFra def visualize_fingerprints( df: pd.DataFrame, - before_col: str, - after_col: str, + comressed_col: str, train_indices: np.ndarray, test_indices: np.ndarray, save_as: str, ): # Calculate the number of samples to be taken from each set - num_samples = 1000 + num_samples = 10 train_samples = int(num_samples * len(train_indices) / len(df)) test_samples = num_samples - train_samples @@ -366,7 +274,7 @@ def visualize_fingerprints( df_sampled = pd.concat([train_data_sampled, test_data_sampled]) # Convert the boolean values in the after_col column to floats - df_sampled[after_col] = df_sampled[after_col].apply( + df_sampled[comressed_col] = df_sampled[comressed_col].apply( lambda x: np.array(x, dtype=float) ) @@ -377,7 +285,7 @@ def visualize_fingerprints( n_neighbors=15, min_dist=0.1, metric="euclidean", random_state=42 ) # Filter out the rows with invalid arrays - umap_results = umap_model.fit_transform(df_sampled[after_col].tolist()) + umap_results = umap_model.fit_transform(df_sampled[comressed_col].tolist()) # Add UMAP results to the DataFrame df_sampled["umap_x"] = umap_results[:, 0] df_sampled["umap_y"] = umap_results[:, 1] @@ -385,8 +293,6 @@ def visualize_fingerprints( # Define custom color palette palette = {"train": "blue", "test": "red"} - # Create the scatter plot - sns.set(style="white") fig, ax = plt.subplots(figsize=(10, 8)) split = save_as.split("_", 1) part_after_underscore = split[1] diff --git a/dfpl/callbacks.py b/dfpl/callbacks.py index 6eae7965..c3ccb338 100644 --- a/dfpl/callbacks.py +++ b/dfpl/callbacks.py @@ -27,9 +27,9 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: monitor=target, mode="min", verbose=1, - period=settings.ac_train_check_period, save_best_only=True, save_weights_only=True, + period=settings.ac_train_check_period, ) callbacks.append(checkpoint) @@ -43,8 +43,7 @@ def autoencoder_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) - - if opts.aeWabTracking and not opts.wabTracking: + if opts.aeWabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks @@ -65,11 +64,11 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: checkpoint = ModelCheckpoint( checkpoint_path, verbose=1, - period=settings.nn_train_check_period, save_best_only=True, monitor="val_loss", mode="min", save_weights_only=True, + period=settings.nn_train_check_period, ) callbacks.append(checkpoint) @@ -83,7 +82,6 @@ def nn_callback(checkpoint_path: str, opts: options.Options) -> list: restore_best_weights=True, ) callbacks.append(early_stop) - if opts.wabTracking: callbacks.append(WandbCallback(save_model=False)) return callbacks diff --git a/dfpl/dfplmodule.py b/dfpl/dfplmodule.py index d0838ebf..67cd3a02 100755 --- a/dfpl/dfplmodule.py +++ b/dfpl/dfplmodule.py @@ -118,7 +118,6 @@ def smi2fp(smile, fptype, size=2048): # lengths. After all paths have been identified, the fingerprint is typically # folded down until a particular density of set bits is obtained. try: - # fp = Chem.RDKFingerprint(mol, fpSize=size) return Chem.RDKFingerprint(mol, fpSize=size) except Exception: print("SMILES not convertable to topological fingerprint:") @@ -134,7 +133,6 @@ def smi2fp(smile, fptype, size=2048): # things looked pretty good. try: - # fp = MACCSkeys.GenMACCSKeys(mol) return MACCSkeys.GenMACCSKeys(mol) except Exception: print("SMILES not convertable to MACSS fingerprint:") @@ -166,7 +164,6 @@ def smi2fp(smile, fptype, size=2048): # GetTopologicalTorsionFingerprintAsBitVect function. try: - # fp = Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) return Torsions.GetTopologicalTorsionFingerprintAsIntVect(mol) except Exception: print("SMILES not convertable to torsions fingerprint:") diff --git a/dfpl/feedforwardNN.py b/dfpl/feedforwardNN.py index e9c88776..bf4241aa 100644 --- a/dfpl/feedforwardNN.py +++ b/dfpl/feedforwardNN.py @@ -69,10 +69,16 @@ def define_out_file_names(path_prefix: str, target: str, fold: int = -1) -> tupl def define_nn_multi_label_model( input_size: int, output_size: int, opts: options.Options ) -> Model: + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported:{opts.optimizer}.") sys.exit("Unsupported optimizer.") @@ -132,9 +138,9 @@ def define_nn_model_multi( decay: float = 0.01, ) -> Model: if optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=lr, decay=decay) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr, decay=decay) elif optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=lr, momentum=0.9, decay=decay) + my_optimizer = optimizers.legacy.SGD(lr=lr, momentum=0.9, decay=decay) else: my_optimizer = optimizer @@ -294,6 +300,8 @@ def train_nn_models_multi(df: pd.DataFrame, opts: options.Options) -> None: model_file_path_weights, model_file_path_json, model_hist_path, + model_hist_csv_path, + model_predict_valset_csv_path, model_validation, model_auc_file, model_auc_file_data, diff --git a/dfpl/fingerprint.py b/dfpl/fingerprint.py index 16c0c87b..0149d62d 100644 --- a/dfpl/fingerprint.py +++ b/dfpl/fingerprint.py @@ -35,15 +35,6 @@ def smile2fp(smile: str) -> Any: None otherwise """ - # generate morgan fp (circular, ecfp) - # smile = df['smiles'][1] - # mol = Chem.MolFromSmiles(smile) - # from rdkit.Chem import AllChem - # morgan = AllChem.GetMorganFingerprintAsBitVect(mol, 2) - # npa = np.zeros((0,), dtype=np.bool) - # from rdkit import DataStructs - # DataStructs.ConvertToNumpyArray(morgan, npa) - npa = np.zeros((0,), dtype=np.bool_) try: DataStructs.ConvertToNumpyArray( @@ -56,14 +47,6 @@ def smile2fp(smile: str) -> Any: except Exception: return None - # try: - # return np.array( - # Chem.RDKFingerprint(Chem.MolFromSmiles(smile), fpSize=fp_size), - # dtype=settings.df_fp_numpy_type, copy=settings.numpy_copy_values) - # except: - # # Note: We don't need to log here since rdkit already logs - # return None - def inchi2fp(inchi: str) -> Any: """ Calculates one fingerprint from InChI @@ -138,12 +121,7 @@ def importDstoxTSV(tsvfilename: str) -> pd.DataFrame: conversion_rules = { "S_dataset.csv": importSmilesCSV, "smiles.csv": importSmilesCSV, - "inchi.tsv": importDstoxTSV - # "S_dataset_extended.csv": importSmilesCSV, - # "D_dataset.tsv": importDstoxTSV, - # "train_data.csv": importSmilesCSV, - # "predict_data.csv": importDstoxTSV, - # "B_data_ER.csv": importDstoxTSV + "inchi.tsv": importDstoxTSV, } diff --git a/dfpl/history.py b/dfpl/history.py index 1f16c2a9..6dcf5374 100644 --- a/dfpl/history.py +++ b/dfpl/history.py @@ -5,7 +5,7 @@ import matplotlib as mpl import matplotlib.pyplot as plt import pandas as pd -from tensorflow.keras.callbacks import History +from tensorflow.python.keras.callbacks import History mpl.use("Agg") diff --git a/dfpl/options.py b/dfpl/options.py index 6d84dbc4..05068156 100644 --- a/dfpl/options.py +++ b/dfpl/options.py @@ -3,12 +3,13 @@ import argparse from dataclasses import dataclass from pathlib import Path +from typing import Optional import jsonpickle import torch from chemprop.args import TrainArgs -from dfpl.utils import makePathAbsolute +from dfpl.utils import parseCmdArgs @dataclass @@ -17,51 +18,52 @@ class Options: Dataclass for all options necessary for training the neural nets """ - configFile: str = "./example/train.json" - inputFile: str = "/deepFPlearn/CMPNN/data/tox21.csv" - outputDir: str = "." - outputFile: str = "" - ecWeightsFile: str = "AE.encoder.weights.hdf5" - ecModelDir: str = "AE_encoder" - fnnModelDir: str = "modeltraining" + configFile: str = None + inputFile: str = "tests/data/smiles.csv" + outputDir: str = "example/results_train/" # changes according to mode + outputFile: str = "results.csv" + ecWeightsFile: str = "" + ecModelDir: str = "example/results_train/AE_encoder/" + fnnModelDir: str = "example/results_train/AR_saved_model/" type: str = "smiles" fpType: str = "topological" # also "MACCS", "atompairs" - epochs: int = 512 + epochs: int = 100 fpSize: int = 2048 encFPSize: int = 256 - kFolds: int = 0 + kFolds: int = 1 testSize: float = 0.2 enableMultiLabel: bool = False - verbose: int = 0 - trainAC: bool = True # if set to False, an AC weight file must be provided! + verbose: int = 2 + trainAC: bool = False trainFNN: bool = True - compressFeatures: bool = True - sampleFractionOnes: float = 0.5 # Only used when value is in [0,1] + compressFeatures: bool = False + sampleFractionOnes: float = 0.5 sampleDown: bool = False split_type: str = "random" aeSplitType: str = "random" aeType: str = "deterministic" - aeEpochs: int = 3000 + aeEpochs: int = 100 aeBatchSize: int = 512 aeLearningRate: float = 0.001 - aeLearningRateDecay: float = 0.01 - aeActivationFunction: str = "relu" + aeLearningRateDecay: float = 0.96 + aeActivationFunction: str = "selu" aeOptimizer: str = "Adam" + vaeBeta: float = 0.5 fnnType: str = "FNN" batchSize: int = 128 optimizer: str = "Adam" learningRate: float = 0.001 + learningRateDecay: float = 0.96 lossFunction: str = "bce" activationFunction: str = "relu" l2reg: float = 0.001 dropout: float = 0.2 threshold: float = 0.5 - gpu: str = "" - snnDepth = 8 - snnWidth = 50 - aeWabTracking: str = "" # Wand & Biases autoencoder tracking - wabTracking: str = "" # Wand & Biases FNN tracking - wabTarget: str = "ER" # Wand & Biases target used for showing training progress + visualizeLatent: bool = False # only if autoencoder is trained or loaded + gpu: int = None + aeWabTracking: bool = False # Wand & Biases autoencoder tracking + wabTracking: bool = False # Wand & Biases FNN tracking + wabTarget: str = "AR" # Wand & Biases target used for showing training progress def saveToFile(self, file: str) -> None: """ @@ -72,42 +74,8 @@ def saveToFile(self, file: str) -> None: f.write(jsonpickle.encode(self)) @classmethod - def fromJson(cls, file: str) -> Options: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") - - @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> Options: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = Options() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") - - for key, value in vars(args).items(): - # The args dict will contain a "method" key from the subparser. - # We don't use this. - if key != "method": - result.__setattr__(key, value) - return result + def fromCmdArgs(cls, args: argparse.Namespace) -> "Options": + return parseCmdArgs(cls, args) @dataclass @@ -116,7 +84,7 @@ class GnnOptions(TrainArgs): Dataclass to hold all options used for training the graph models """ - total_epochs: int = 30 + epochs: int = 30 save: bool = True configFile: str = "./example/traingnn.json" data_path: str = "./example/data/tox21.csv" @@ -134,37 +102,19 @@ class GnnOptions(TrainArgs): save_preds: bool = True @classmethod - def fromCmdArgs(cls, args: argparse.Namespace) -> GnnOptions: - """ - Creates Options instance from cmdline arguments. - - If a training file (JSON) is provided, the values from that file are used. - However, additional commandline arguments will be preferred. If, e.g., "fpSize" is specified both in the - JSON file and on the commandline, then the value of the commandline argument will be used. - """ - result = GnnOptions() - if "configFile" in vars(args).keys(): - jsonFile = Path(makePathAbsolute(args.configFile)) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - result = jsonpickle.decode(content) - else: - raise ValueError("Could not find JSON input file") + def fromCmdArgs(cls, args: argparse.Namespace, json_config: Optional[dict] = None): + # Initialize with JSON config if provided + if json_config: + opts = cls(**json_config) + else: + opts = cls() - return result + # Update with command-line arguments + for key, value in vars(args).items(): + if value is not None: + setattr(opts, key, value) - @classmethod - def fromJson(cls, file: str) -> GnnOptions: - """ - Create an instance from a JSON file - """ - jsonFile = Path(file) - if jsonFile.exists() and jsonFile.is_file(): - with jsonFile.open() as f: - content = f.read() - return jsonpickle.decode(content) - raise ValueError("JSON file does not exist or is not readable") + return opts def createCommandlineParser() -> argparse.ArgumentParser: @@ -225,7 +175,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="FILE", type=str, help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, + default="example/train.json", ) general_args.add_argument( "-i", @@ -234,7 +184,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="The file containing the data for training in " "comma separated CSV format.The first column should be smiles.", - default=argparse.SUPPRESS, + default="tests/data/smiles.csv", ) general_args.add_argument( "-o", @@ -243,8 +193,10 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, help="Prefix of output file name. Trained model and " "respective stats will be returned in this directory.", - default=argparse.SUPPRESS, + default="example/results_train/", ) + + # TODO CHECK WHAT IS TYPE DOING? general_args.add_argument( "-t", "--type", @@ -252,7 +204,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["fp", "smiles"], help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, + default="fp", ) general_args.add_argument( "-thr", @@ -260,47 +212,41 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, metavar="FLOAT", help="Threshold for binary classification.", - default=argparse.SUPPRESS, + default=0.5, ) general_args.add_argument( "-gpu", "--gpu", metavar="INT", type=int, - help="Select which gpu to use. If not available, leave empty.", - default=argparse.SUPPRESS, + help="Select which gpu to use by index. If not available, leave empty", + default=None, ) general_args.add_argument( - "-k", "--fpType", metavar="STR", type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. MACCS or topological are available.", + default="topological", ) general_args.add_argument( - "-s", "--fpSize", type=int, - help="Size of fingerprint that should be generated.", - default=argparse.SUPPRESS, + help="Length of the fingerprint that should be generated.", + default=2048, ) general_args.add_argument( - "-c", "--compressFeatures", - metavar="BOOL", - type=bool, - help="Should the fingerprints be compressed or not. Activates the autoencoder. ", - default=argparse.SUPPRESS, + action="store_true", + help="Should the fingerprints be compressed or not. Needs a path of a trained autoencoder or needs the trainAC also set to True.", + default=False, ) general_args.add_argument( - "-m", "--enableMultiLabel", - metavar="BOOL", - type=bool, + action="store_true", help="Train multi-label classification model in addition to the individual models.", - default=argparse.SUPPRESS, + default=False, ) # Autoencoder Configuration autoencoder_args.add_argument( @@ -309,14 +255,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, metavar="FILE", help="The .hdf5 file of a trained encoder", - default=argparse.SUPPRESS, + default="", ) autoencoder_args.add_argument( "--ecModelDir", type=str, metavar="DIR", help="The directory where the full model of the encoder will be saved", - default=argparse.SUPPRESS, + default="example/results_train/AE_encoder/", ) autoencoder_args.add_argument( "--aeType", @@ -324,21 +270,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["variational", "deterministic"], help="Autoencoder type, variational or deterministic.", - default=argparse.SUPPRESS, + default="deterministic", ) autoencoder_args.add_argument( "--aeEpochs", metavar="INT", type=int, help="Number of epochs for autoencoder training.", - default=argparse.SUPPRESS, + default=100, ) autoencoder_args.add_argument( "--aeBatchSize", metavar="INT", type=int, help="Batch size in autoencoder training.", - default=argparse.SUPPRESS, + default=512, ) autoencoder_args.add_argument( "--aeActivationFunction", @@ -346,21 +292,21 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for the hidden layers in the autoencoder.", - default=argparse.SUPPRESS, + default="relu", ) autoencoder_args.add_argument( "--aeLearningRate", metavar="FLOAT", type=float, help="Learning rate for autoencoder training.", - default=argparse.SUPPRESS, + default=0.001, ) autoencoder_args.add_argument( "--aeLearningRateDecay", metavar="FLOAT", type=float, help="Learning rate decay for autoencoder training.", - default=argparse.SUPPRESS, + default=0.96, ) autoencoder_args.add_argument( "--aeSplitType", @@ -368,7 +314,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the autoencoder", - default=argparse.SUPPRESS, + default="random", ) autoencoder_args.add_argument( "-d", @@ -376,7 +322,20 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Size of encoded fingerprint (z-layer of autoencoder).", - default=argparse.SUPPRESS, + default=256, + ) + autoencoder_args.add_argument( + "--visualizeLatent", + action="store_true", + help="UMAP the latent space for exploration", + default=False, + ) + autoencoder_args.add_argument( + "--vaeBeta", + metavar="FLOAT", + type=float, + help="Beta parameter for the Beta VAE", + default=1.0, ) # Training Configuration training_args.add_argument( @@ -385,15 +344,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["scaffold_balanced", "random", "molecular_weight"], help="Set how the data is going to be split for the feedforward neural network", - default=argparse.SUPPRESS, + default="random", ) training_args.add_argument( - "-l", "--testSize", metavar="FLOAT", type=float, help="Fraction of the dataset that should be used for testing. Value in [0,1].", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "-K", @@ -401,7 +359,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="K that is used for K-fold cross-validation in the training procedure.", - default=argparse.SUPPRESS, + default=1, ) training_args.add_argument( "-v", @@ -411,21 +369,19 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: choices=[0, 1, 2], help="Verbosity level. O: No additional output, " + "1: Some additional output, 2: full additional output", - default=argparse.SUPPRESS, + default=2, ) training_args.add_argument( "--trainAC", - metavar="BOOL", - type=bool, + action="store_true", help="Choose to train or not, the autoencoder based on the input file", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "--trainFNN", - metavar="BOOL", - type=bool, - help="Train the feedforward network either with provided weights.", - default=argparse.SUPPRESS, + action="store_false", + help="When called it deactivates the training.", + default=True, ) training_args.add_argument( "--sampleFractionOnes", @@ -433,14 +389,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=float, help="This is the fraction of positive target associations (1s) in comparison to the majority class(0s)." "only works if --sampleDown is enabled", - default=argparse.SUPPRESS, + default=0.5, ) training_args.add_argument( "--sampleDown", metavar="BOOL", type=bool, help="Enable automatic down sampling of the 0 valued samples.", - default=argparse.SUPPRESS, + default=False, ) training_args.add_argument( "-e", @@ -448,52 +404,60 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="INT", type=int, help="Number of epochs that should be used for the FNN training", - default=argparse.SUPPRESS, + default=100, ) - + # TODO CHECK IF ALL LOSSES MAKE SENSE HERE training_args.add_argument( "--lossFunction", metavar="STRING", type=str, choices=["mse", "bce", "focal"], help="Loss function to use during training. mse - mean squared error, bce - binary cross entropy.", - default=argparse.SUPPRESS, + default="bce", ) + # TODO DO I NEED ALL ARGUMENTS TO BE USER SPECIFIED? WHAT DOES THE USER KNOW ABOUT OPTIMIZERS? training_args.add_argument( "--optimizer", metavar="STRING", type=str, choices=["Adam", "SGD"], help='Optimizer to use for backpropagation in the FNN. Possible values: "Adam", "SGD"', - default=argparse.SUPPRESS, + default="Adam", ) training_args.add_argument( "--batchSize", metavar="INT", type=int, help="Batch size in FNN training.", - default=argparse.SUPPRESS, + default=128, ) training_args.add_argument( "--l2reg", metavar="FLOAT", type=float, help="Value for l2 kernel regularizer.", - default=argparse.SUPPRESS, + default=0.001, ) training_args.add_argument( "--dropout", metavar="FLOAT", type=float, help="The fraction of data that is dropped out in each dropout layer.", - default=argparse.SUPPRESS, + default=0.2, ) training_args.add_argument( "--learningRate", metavar="FLOAT", type=float, help="Learning rate size in FNN training.", - default=argparse.SUPPRESS, + default=0.000022, + ) + training_args.add_argument( + "--learningRateDecay", + metavar="FLOAT", + type=float, + help="Learning rate size in FNN training.", + default=0.96, ) training_args.add_argument( "--activationFunction", @@ -501,7 +465,7 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["relu", "selu"], help="The activation function for hidden layers in the FNN.", - default=argparse.SUPPRESS, + default="relu", ) # Tracking Configuration tracking_args.add_argument( @@ -509,14 +473,14 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: metavar="BOOL", type=bool, help="Track autoencoder performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTracking", metavar="BOOL", type=bool, help="Track FNN performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default=False, ) tracking_args.add_argument( "--wabTarget", @@ -524,7 +488,112 @@ def parseInputTrain(parser: argparse.ArgumentParser) -> None: type=str, choices=["AR", "ER", "ED", "GR", "TR", "PPARg", "Aromatase"], help="Which target to use for tracking performance via Weights & Biases, see https://wandb.ai.", - default=argparse.SUPPRESS, + default="AR", + ) + + +def parseInputPredict(parser: argparse.ArgumentParser) -> None: + """ + Parse the input arguments. + + :return: A namespace object built up from attributes parsed out of the cmd line. + """ + + general_args = parser.add_argument_group("General Configuration") + files_args = parser.add_argument_group("Files") + files_args.add_argument( + "-f", + "--configFile", + metavar="FILE", + type=str, + help="Input JSON file that contains all information for training/predicting.", + ) + files_args.add_argument( + "-i", + "--inputFile", + metavar="FILE", + type=str, + help="The file containing the data for the prediction in (unquoted) " + "comma separated CSV format. The column named 'smiles' or 'fp'" + "contains the field to be predicted. Please adjust the type " + "that should be predicted (fp or smile) with -t option appropriately." + "An optional column 'id' is used to assign the outcomes to the" + "original identifiers. If this column is missing, the results are" + "numbered in the order of their appearance in the input file." + "A header is expected and respective column names are used.", + default="tests/data/smiles.csv", + ) + files_args.add_argument( + "-o", + "--outputDir", + metavar="DIR", + type=str, + help="Prefix of output directory. It will contain a log file and the file specified" + "with --outputFile.", + default="example/results_predict/", + ) + files_args.add_argument( + "--outputFile", + metavar="FILE", + type=str, + help="Output .CSV file name which will contain one prediction per input line. " + "Default: prefix of input file name.", + default="results.csv", + ) + # TODO AGAIN THIS TRASH HERE? CAN WE EVEN PROCESS SMILES? + general_args.add_argument( + "-t", + "--type", + metavar="STR", + type=str, + choices=["fp", "smiles"], + help="Type of the chemical representation. Choices: 'fp', 'smiles'.", + default="fp", + ) + general_args.add_argument( + "-k", + "--fpType", + metavar="STR", + type=str, + choices=["topological", "MACCS"], + help="The type of fingerprint to be generated/used in input file. Should be the same as the type of the fps that the model was trained upon.", + default="topological", + ) + files_args.add_argument( + "--ecModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the encoder will be saved (if trainAE=True) or " + "loaded from (if trainAE=False). Provide a full path here.", + default="", + ) + files_args.add_argument( + "--ecWeightsFile", + type=str, + metavar="STR", + help="The file where the full model of the encoder will be loaded from, to compress the fingerprints. Provide a full path here.", + default="", + ) + files_args.add_argument( + "--fnnModelDir", + type=str, + metavar="DIR", + help="The directory where the full model of the fnn is loaded from. " + "Provide a full path here.", + default="example/results_train/AR_saved_model", + ) + general_args.add_argument( + "-c", "--compressFeatures", action="store_true", default=False + ) + ( + general_args.add_argument( + "--aeType", + metavar="STRING", + type=str, + choices=["variational", "deterministic"], + help="Autoencoder type, variational or deterministic.", + default="deterministic", + ) ) @@ -575,9 +644,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: default=10, help="The number of batches between each logging of the training loss", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=True, help="Turn off cuda" - ) general_args.add_argument( "--no_cache", action="store_true", @@ -940,13 +1006,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: training_args.add_argument( "--epochs", type=int, metavar="INT", default=30, help="Number of epochs to run" ) - training_args.add_argument( - "--total_epochs", - type=int, - metavar="INT", - default=30, - help="Number of total epochs to run", - ) training_args.add_argument( "--batch_size", type=int, metavar="INT", default=50, help="Batch size" ) @@ -1034,91 +1093,6 @@ def parseTrainGnn(parser: argparse.ArgumentParser) -> None: ) -def parseInputPredict(parser: argparse.ArgumentParser) -> None: - """ - Parse the input arguments. - - :return: A namespace object built up from attributes parsed out of the cmd line. - """ - - general_args = parser.add_argument_group("General Configuration") - files_args = parser.add_argument_group("Files") - files_args.add_argument( - "-f", - "--configFile", - metavar="FILE", - type=str, - help="Input JSON file that contains all information for training/predicting.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-i", - "--inputFile", - metavar="FILE", - type=str, - help="The file containing the data for the prediction in (unquoted) " - "comma separated CSV format. The column named 'smiles' or 'fp'" - "contains the field to be predicted. Please adjust the type " - "that should be predicted (fp or smile) with -t option appropriately." - "An optional column 'id' is used to assign the outcomes to the" - "original identifiers. If this column is missing, the results are" - "numbered in the order of their appearance in the input file." - "A header is expected and respective column names are used.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "-o", - "--outputDir", - metavar="DIR", - type=str, - help="Prefix of output directory. It will contain a log file and the file specified" - "with --outputFile.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--outputFile", - metavar="FILE", - type=str, - help="Output .CSV file name which will contain one prediction per input line. " - "Default: prefix of input file name.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-t", - "--type", - metavar="STR", - type=str, - choices=["fp", "smiles"], - help="Type of the chemical representation. Choices: 'fp', 'smiles'.", - default=argparse.SUPPRESS, - ) - general_args.add_argument( - "-k", - "--fpType", - metavar="STR", - type=str, - choices=["topological", "MACCS"], # , 'atompairs', 'torsions'], - help="The type of fingerprint to be generated/used in input file.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--ecModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the encoder will be saved (if trainAE=True) or " - "loaded from (if trainAE=False). Provide a full path here.", - default=argparse.SUPPRESS, - ) - files_args.add_argument( - "--fnnModelDir", - type=str, - metavar="DIR", - help="The directory where the full model of the fnn is loaded from. " - "Provide a full path here.", - default=argparse.SUPPRESS, - ) - - def parsePredictGnn(parser: argparse.ArgumentParser) -> None: general_args = parser.add_argument_group("General Configuration") data_args = parser.add_argument_group("Data Configuration") @@ -1139,9 +1113,6 @@ def parsePredictGnn(parser: argparse.ArgumentParser) -> None: choices=list(range(torch.cuda.device_count())), help="Which GPU to use", ) - general_args.add_argument( - "--no_cuda", action="store_true", default=False, help="Turn off cuda" - ) general_args.add_argument( "--num_workers", type=int, diff --git a/dfpl/plot.py b/dfpl/plot.py index 30d9503f..84802023 100644 --- a/dfpl/plot.py +++ b/dfpl/plot.py @@ -2,12 +2,12 @@ import matplotlib.pyplot as plt import numpy as np -import pandas as pd import wandb from matplotlib.axes import Axes +from sklearn.metrics import auc, precision_recall_curve # for NN model functions -from tensorflow.keras.callbacks import History +from tensorflow.python.keras.callbacks import History def get_max_validation_accuracy(history: History) -> str: @@ -46,7 +46,7 @@ def get_max_training_accuracy(history: History) -> str: return "Max training accuracy ≈ " + str(round(y_max, 3) * 100) + "%" -def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: +def smooth_curve(points: np.ndarray, factor: float = 0.8) -> List[float]: smoothed_points: List[float] = [] for point in points: if smoothed_points: @@ -57,144 +57,52 @@ def smooth_curve(points: np.ndarray, factor: float = 0.75) -> np.ndarray: return smoothed_points +# Plot the accuracy and loss data with enhanced visuals def set_plot_history_data(ax: Axes, history: History, which_graph: str) -> None: - (train, valid) = (None, None) - - if which_graph == "acc": - train = smooth_curve(history.history["accuracy"]) - valid = smooth_curve(history.history["val_accuracy"]) - - if which_graph == "loss": - train = smooth_curve(history.history["loss"]) - valid = smooth_curve(history.history["val_loss"]) - - # plt.xkcd() # make plots look like xkcd + if which_graph == "balanced_acc": + # Plot balanced accuracy when "acc" is specified + train = smooth_curve(np.array(history.history["balanced_accuracy"])) + valid = smooth_curve(np.array(history.history["val_balanced_accuracy"])) + label = "Balanced Accuracy" + elif which_graph == "loss": + train = smooth_curve(np.array(history.history["loss"])) + valid = smooth_curve(np.array(history.history["val_loss"])) + label = "Loss" + else: + return epochs = range(1, len(train) + 1) - trim = 0 # remove first 5 epochs - # when graphing loss the first few epochs may skew the (loss) graph - - ax.plot(epochs[trim:], train[trim:], "dodgerblue", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], train[trim:], "dodgerblue", label="Training") - - ax.plot(epochs[trim:], valid[trim:], "g", linewidth=15, alpha=0.1) - ax.plot(epochs[trim:], valid[trim:], "g", label="Validation") + # Plot training and validation data with styles + ax.plot(epochs, train, color="dodgerblue", linewidth=2, label=f"Training {label}") + ax.plot( + epochs, + valid, + color="green", + linestyle="--", + linewidth=2, + label=f"Validation {label}", + ) + ax.set_ylabel(label) + ax.legend(loc="best") + ax.spines["top"].set_visible(False) + ax.spines["right"].set_visible(False) def plot_history(history: History, file: str) -> None: - fig, (ax1, ax2) = plt.subplots( - nrows=2, - ncols=1, - figsize=(10, 6), - sharex="all", - gridspec_kw={"height_ratios": [5, 2]}, - ) - - set_plot_history_data(ax1, history, "acc") + fig, (ax1, ax2) = plt.subplots(nrows=2, ncols=1, figsize=(10, 8), sharex="all") + set_plot_history_data(ax1, history, "balanced_acc") set_plot_history_data(ax2, history, "loss") - # Accuracy graph - ax1.set_ylabel("Accuracy") - ax1.set_ylim(bottom=0.5, top=1) - ax1.legend(loc="lower right") - ax1.spines["top"].set_visible(False) - ax1.spines["right"].set_visible(False) - ax1.xaxis.set_ticks_position("none") - ax1.spines["bottom"].set_visible(False) - - # max accuracy text - plt.text( - 0.5, - 0.6, - get_max_validation_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - plt.text( - 0.5, - 0.8, - get_max_training_balanced_accuracy(history), - horizontalalignment="right", - verticalalignment="top", - transform=ax1.transAxes, - fontsize=12, - ) - - # Loss graph - ax2.set_ylabel("Loss") - ax2.set_yticks([]) - ax2.plot(legend=False) + # Set shared x-axis label and save the plot ax2.set_xlabel("Epochs") - ax2.spines["top"].set_visible(False) - ax2.spines["right"].set_visible(False) - plt.tight_layout() plt.savefig(fname=file, format="svg") plt.close() -def plot_train_history(hist, target, file_accuracy, file_loss): - """ - Plot the training performance in terms of accuracy and loss values for each epoch. - :param hist: The history returned by model.fit function - :param target: The name of the target of the model - :param file_accuracy: The filename for plotting accuracy values - :param file_loss: The filename for plotting loss values - :return: none - """ - - # plot accuracy - plt.figure() - plt.plot(hist.history["accuracy"]) - if "val_accuracy" in hist.history.keys(): - plt.plot(hist.history["val_accuracy"]) - plt.title("Model accuracy - " + target) - plt.ylabel("Accuracy") - plt.xlabel("Epoch") - if "val_accuracy" in hist.history.keys(): - plt.legend(["Train", "Test"], loc="upper left") - else: - plt.legend(["Train"], loc="upper_left") - plt.savefig(fname=file_accuracy, format="svg") - - # Plot training & validation loss values - plt.figure() - plt.plot(hist.history["loss"]) - plt.plot(hist.history["val_loss"]) - plt.title("Model loss - " + target) - plt.ylabel("Loss") - plt.xlabel("Epoch") - plt.legend(["Train", "Test"], loc="upper left") - # plt.show() - plt.savefig(fname=file_loss, format="svg") - plt.close() - - -def plot_history_vis( - hist: History, - model_hist_plot_path: str, - model_hist_csv_path: str, - model_hist_plot_path_a: str, - model_hist_plot_path_l: str, - target: str, -) -> None: - plot_history(history=hist, file=model_hist_plot_path) - histDF = pd.DataFrame(hist.history) - histDF.to_csv(model_hist_csv_path) - - # plot accuracy and loss for the training and validation during training - plot_train_history( - hist=hist, - target=target, - file_accuracy=model_hist_plot_path_a, - file_loss=model_hist_plot_path_l, - ) - - +# Enhanced AUC plot def plot_auc( fpr: np.ndarray, tpr: np.ndarray, @@ -202,27 +110,59 @@ def plot_auc( target: str, filename: str, wandb_logging: bool = False, +) -> None: + plt.figure(figsize=(8, 6)) + plt.plot([0, 1], [0, 1], "k--", linewidth=1) + plt.plot(fpr, tpr, color="darkorange", linewidth=2, label=f"AUC = {auc_value:.3f}") + plt.xlabel("False Positive Rate") + plt.ylabel("True Positive Rate") + plt.title(f"ROC Curve - {target}") + plt.legend(loc="lower right") + plt.grid(True, linestyle="--", alpha=0.5) + plt.savefig(fname=filename, format="png") + if wandb_logging: + wandb.log({"roc_plot": plt}) + plt.close() + + +def plot_prc( + y_true: np.ndarray, + y_scores: np.ndarray, + target: str, + filename: str, + wandb_logging: bool = False, ) -> None: """ - Plot the area under the curve to the provided file - - :param fpr: An array containing the false positives - :param tpr: An array containing the true positives - :param auc_value: The value of the area under the curve - :param target: The name of the training target - :param filename: The filename to which the plot should be stored - :param wandb_logging: Whether to log the plot to wandb + Plot the Precision-Recall Curve (PRC) with AUC. + + :param y_true: True binary labels + :param y_scores: Target scores, typically predicted probabilities + :param target: The name of the model or target being evaluated + :param filename: The filename to save the plot + :param wandb_logging: Whether to log the plot to Weights & Biases :rtype: None """ - # Create a boolean mask to filter out zero values - plt.figure() - plt.plot([0, 1], [0, 1], "k--") - plt.plot(fpr, tpr, label=f"Keras (area = {auc_value:.3f})") - plt.xlabel("False positive rate") - plt.ylabel("True positive rate") - plt.title("ROC curve " + target) - plt.legend(loc="best") + # Calculate precision-recall curve and AUC + precision, recall, _ = precision_recall_curve(y_true, y_scores) + prc_auc_value = auc(recall, precision) + + # Plot PRC curve + plt.figure(figsize=(8, 6)) + plt.plot( + recall, + precision, + color="purple", + linewidth=2, + label=f"PRC-AUC = {prc_auc_value:.3f}", + ) + plt.xlabel("Recall") + plt.ylabel("Precision") + plt.title(f"Precision-Recall Curve - {target}") + plt.legend(loc="lower left") + plt.grid(True, linestyle="--", alpha=0.5) + + # Save plot plt.savefig(fname=filename, format="png") if wandb_logging: - wandb.log({"roc_plot": plt}) + wandb.log({"prc_plot": plt}) plt.close() diff --git a/dfpl/predictions.py b/dfpl/predictions.py index 29e73142..90a0c3bd 100644 --- a/dfpl/predictions.py +++ b/dfpl/predictions.py @@ -1,39 +1,55 @@ import logging +import os import numpy as np import pandas as pd -import tensorflow.keras.models from dfpl import options, settings +from dfpl import single_label_model as sl def predict_values(df: pd.DataFrame, opts: options.Options) -> pd.DataFrame: """ Predict a set of chemicals using a selected model. - :param df: - :param opts: - :return: + :param df: Input DataFrame containing the features (either compressed or uncompressed). + :param opts: Model options including paths, feature types, and prediction preferences. + :return: DataFrame with predictions. """ - model = tensorflow.keras.models.load_model(opts.fnnModelDir, compile=False) - model.compile(loss=opts.lossFunction, optimizer=opts.optimizer) - if opts.compressFeatures: - sub_df = df[df["fpcompressed"].notnull()] - x = np.array( - sub_df["fpcompressed"].to_list(), - dtype=settings.nn_fp_compressed_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Compressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df - else: - sub_df = df[df["fp"].notnull()] - x = np.array( - sub_df["fp"].to_list(), - dtype=settings.nn_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info(f"Uncompressed FP matrix with shape {x.shape} and type {x.dtype}") - sub_df["predicted"] = pd.DataFrame(model.predict(x), columns=["predicted"]) - return sub_df + + # Determine the correct feature column and input size + feature_column = "fpcompressed" if opts.compressFeatures else "fp" + sub_df = df[df[feature_column].notnull()] + + if sub_df.empty: + logging.warning(f"No valid features found in column '{feature_column}'") + return pd.DataFrame() + + # Prepare the feature matrix for prediction + x = np.array( + sub_df[feature_column].to_list(), + dtype=settings.nn_fp_compressed_numpy_type + if opts.compressFeatures + else settings.nn_fp_numpy_type, + copy=settings.numpy_copy_values, + ) + logging.info( + f"{'Compressed' if opts.compressFeatures else 'Uncompressed'} FP matrix with shape {x.shape} and type {x.dtype}" + ) + + # Define the model architecture based on the feature size + feature_input_size = x.shape[1] + model = sl.define_single_label_model(input_size=feature_input_size, opts=opts) + + # Load the model weights + weights_path = os.path.join(opts.fnnModelDir, "model_weights.hdf5") + model.load_weights(weights_path) + logging.info(f"Model weights loaded from {weights_path}") + + # Make predictions + predictions = model.predict(x) + + # Add predictions to the DataFrame + sub_df["predicted"] = predictions + + return sub_df diff --git a/dfpl/settings.py b/dfpl/settings.py index 64eac190..20435290 100644 --- a/dfpl/settings.py +++ b/dfpl/settings.py @@ -43,7 +43,7 @@ # Training settings of the AC that were magic numbers in the code before. ac_train_min_delta = 0.0001 -ac_train_check_period = 5 +ac_train_check_period = 2 ac_train_patience = 5 # Training settings of the FNN that were magic numbers in the code before. diff --git a/dfpl/single_label_model.py b/dfpl/single_label_model.py index 18402f09..41c083a9 100644 --- a/dfpl/single_label_model.py +++ b/dfpl/single_label_model.py @@ -10,7 +10,7 @@ import numpy as np import pandas as pd import tensorflow as tf -import tensorflow.keras.backend as K +import tensorflow.python.keras.backend as K import wandb from sklearn.metrics import ( auc, @@ -187,7 +187,7 @@ def build_fnn_network( output_bias = tf.keras.initializers.Constant(output_bias) # Define the number of hidden layers based on the input size - my_hidden_layers = {"2048": 6, "1024": 5, "999": 5, "512": 4, "256": 3} + my_hidden_layers = {"2048": 6, "1024": 5, "999": 5, "512": 4, "256": 3, "167": 3} if not str(input_size) in my_hidden_layers.keys(): raise ValueError("Wrong input-size. Must be in {2048, 1024, 999, 512, 256}.") nhl = int(math.log2(input_size) / 2 - 1) @@ -333,12 +333,17 @@ def define_single_label_model( else: logging.error(f"Your selected loss is not supported: {opts.lossFunction}.") sys.exit("Unsupported loss function") - + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.learningRate, + decay_steps=1000, + decay_rate=opts.learningRateDecay, + staircase=True, + ) # Set the optimizer according to the option selected if opts.optimizer == "Adam": - my_optimizer = optimizers.Adam(learning_rate=opts.learningRate) + my_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) elif opts.optimizer == "SGD": - my_optimizer = optimizers.SGD(lr=opts.learningRate, momentum=0.9) + my_optimizer = optimizers.legacy.SGD(lr=lr_schedule, momentum=0.9) else: logging.error(f"Your selected optimizer is not supported: {opts.optimizer}.") sys.exit("Unsupported optimizer") @@ -398,7 +403,7 @@ def evaluate_model( "target": target, "fold": fold, } - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.csv") + ).to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.csv") ) # Compute the confusion matrix @@ -409,9 +414,16 @@ def evaluate_model( y_test_int, y_predict_int, output_dict=True ) prf = pd.DataFrame.from_dict(precision_recall)[["0", "1"]] - + # plot the precision-recall curve + pl.plot_prc( + y_true=y_test, + y_scores=y_predict, + target=target, + filename=f"{file_prefix}/prc.png", + wandb_logging=False, + ) # Add balanced accuracy to the computed metrics - prf.to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.prec_rec_f1.csv") + prf.to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.prec_rec_f1.csv") # Evaluate the model on the validation set and log the results loss, acc, auc_value, precision, recall, balanced_acc = tuple( @@ -450,17 +462,16 @@ def evaluate_model( ) ), columns=["fpr", "tpr", "auc_value", "target", "fold"], - ).to_csv(path_or_buf=f"{file_prefix}.predicted.testdata.aucdata.csv") + ).to_csv(path_or_buf=f"{file_prefix}/predicted.testdata.aucdata.csv") # Generate and save AUC-ROC curve plot pl.plot_auc( fpr=FPR, tpr=TPR, target=target, auc_value=AUC, - filename=f"{file_prefix}_auc_data.png", + filename=f"{file_prefix}/auc_data.png", wandb_logging=False, ) - # Return a DataFrame containing the computed metrics return pd.DataFrame.from_dict( { @@ -485,8 +496,10 @@ def evaluate_model( def fit_and_evaluate_model( x_train: np.ndarray, x_test: np.ndarray, + x_val: np.ndarray, y_train: np.ndarray, y_test: np.ndarray, + y_val: np.ndarray, fold: int, target: str, opts: options.Options, @@ -495,9 +508,10 @@ def fit_and_evaluate_model( logging.info(f"Training of fold number: {fold}") # Define file name prefix for saving models - model_file_prefix = path.join( - opts.outputDir, f"{target}_{opts.split_type}_single-labeled_Fold-{fold}" - ) + if fold > 1: + model_file_prefix = path.join("tmp", f"{target}/fold-{fold}") + else: + model_file_prefix = path.join(opts.outputDir, target) # Compute class imbalance ids, counts = np.unique(y_train, return_counts=True) @@ -517,7 +531,11 @@ def fit_and_evaluate_model( ) # Define checkpoint to save model weights during training - checkpoint_model_weights_path = f"{model_file_prefix}.model.weights.hdf5" + checkpoint_model_weights_path = os.path.join( + model_file_prefix, "model_weights.hdf5" + ) + + # Define callbacks callback_list = cb.nn_callback( checkpoint_path=checkpoint_model_weights_path, opts=opts ) @@ -531,7 +549,7 @@ def fit_and_evaluate_model( epochs=opts.epochs, batch_size=opts.batchSize, verbose=opts.verbose, - validation_data=(x_test, y_test), + validation_data=(x_val, y_val), ) trainTime = str(round((time() - start) / 60, ndigits=2)) logging.info( @@ -539,8 +557,8 @@ def fit_and_evaluate_model( ) # Save and plot model history - pd.DataFrame(hist.history).to_csv(path_or_buf=f"{model_file_prefix}.history.csv") - pl.plot_history(history=hist, file=f"{model_file_prefix}.history.svg") + pd.DataFrame(hist.history).to_csv(path_or_buf=f"{model_file_prefix}/history.csv") + pl.plot_history(history=hist, file=f"{model_file_prefix}/history.svg") # Evaluate model callback_model = define_single_label_model(input_size=x_train.shape[1], opts=opts) callback_model.load_weights(filepath=checkpoint_model_weights_path) @@ -560,6 +578,7 @@ def get_x_y( df: pd.DataFrame, target: str, train_set: pd.DataFrame, + val_set: pd.DataFrame, test_set: pd.DataFrame, opts: options.Options, ): @@ -577,11 +596,15 @@ def get_x_y( y_train = df.iloc[train_indices][target].values x_test = new_df.iloc[test_indices, :].values y_test = df.iloc[test_indices][target].values + x_val = new_df.iloc[val_set.index, :].values + y_val = df.iloc[val_set.index][target].values + x_val = x_val.astype("float32") + y_val = y_val.astype("float32") x_train = x_train.astype("float32") y_train = y_train.astype("float32") x_test = x_test.astype("float32") y_test = y_test.astype("float32") - return x_train, y_train, x_test, y_test + return x_train, y_train, x_test, y_test, x_val, y_val def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: @@ -594,13 +617,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: :param opts: The command line arguments in the options class :param df: The dataframe containing x matrix and at least one column for a y target. """ - # find target columns - targets = [ - c - for c in df.columns - if c in ["AR", "ER", "ED", "TR", "GR", "PPARg", "Aromatase"] - ] + targets = [c for c in df.columns if c not in ["smiles", "fp", "fpcompressed"]] if opts.wabTracking and opts.wabTarget != "": # For W&B tracking, we only train one target that's specified as wabTarget "ER". # In case it's not there, we use the first one available @@ -644,6 +662,9 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize, random_state=1 ) + x_val, x_test, y_val, y_test = train_test_split( + x_test, y_test, stratify=y_test, test_size=0.5, random_state=1 + ) logging.info( f"Splitting train/test data with fixed random initializer" ) @@ -651,32 +672,22 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: x_train, x_test, y_train, y_test = train_test_split( x, y, stratify=y, test_size=opts.testSize ) + x_val, x_test, y_val, y_test = train_test_split( + x_test, y_test, stratify=y_test, test_size=0.5 + ) performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, y_train=y_train, y_test=y_test, + y_val=y_val, fold=0, target=target, opts=opts, ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_saved_model") - ) elif 1 < opts.kFolds < 10: # int(x.shape[0] / 100): # do a kfold cross-validation @@ -685,10 +696,21 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) fold_no = 1 # split the data - for train, test in kfold_c_validator.split(x, y): + for train_idx, test_idx in kfold_c_validator.split(x, y): + # Split test into validation and actual test set + x_train, x_test = x[train_idx], x[test_idx] + y_train, y_test = y[train_idx], y[test_idx] + + # Further split test set into val and test set + x_val, x_test, y_val, y_test = train_test_split( + x_test, + y_test, + test_size=0.5, + stratify=y_test, + ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( - project=f"FNN_{opts.threshold}_{opts.split_type}", + project=f"FNN_{opts.split_type}", group=f"{target}", name=f"{target}-{fold_no}", reinit=True, @@ -703,41 +725,25 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # Initialize wandb for the feed forward model wandb.init( - project=f"FNN_{opts.threshold}_{opts.split_type}", + project=f"FNN_{opts.split_type}", group=f"{target}", name=f"FNN_{target}-{fold_no}", reinit=True, ) performance = fit_and_evaluate_model( - x_train=x[train], - x_test=x[test], - y_train=y[train], - y_test=y[test], + x_train=x_train, + x_test=x_test, + x_val=x_val, + y_train=y_train, + y_test=y_test, + y_val=y_val, fold=fold_no, target=target, opts=opts, ) performance_list.append(performance) - # save complete model - trained_model = define_single_label_model( - input_size=len(x[0]), opts=opts - ) - # trained_model.load_weights - # (path.join(opts.outputDir, f"{target}_single-labeled_Fold-0.model.weights.hdf5")) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - # create output directory and store complete model - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}-{fold_no}_saved_model" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -746,31 +752,21 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_single-labeled_Best_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + if opts.compressFeatures: + opts.outputDir = opts.ecModelDir + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_saved_model" - ) - dst_dir = os.path.join( - opts.outputDir, f"{target}-{best_fold}_best_saved_model" - ) + # Ensure the destination directory exists + os.makedirs(src, exist_ok=True) + os.makedirs(dst, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) + # Optionally, clean up the temporary directory + shutil.rmtree("tmp/") - # save complete model else: logging.info( "Your selected number of folds for Cross validation is out of range. " @@ -787,7 +783,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: # For each individual target train a model elif opts.split_type == "scaffold_balanced": # df, irrelevant_columns = preprocess_dataframe(df, opts) - for idx, target in enumerate(targets): + for target in targets: df = prepare_nn_training_data(df, target, opts, return_dataframe=True) relevant_cols = ["smiles"] + ["fp"] + [target] # list(irrelevant_columns) if opts.compressFeatures: @@ -799,12 +795,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: if opts.kFolds == 1: train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize, 0.0, opts.testSize), + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), balanced=False, seed=42, ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -822,38 +818,27 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, y_train=y_train, y_test=y_test, + y_val=y_val, fold=0, target=target, opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_0" - ) - ) + elif opts.kFolds > 1: for fold_no in range(1, opts.kFolds + 1): print(f"Splitting data with seed {fold_no}") train_set, val_set, test_set = ae_scaffold_split( df_task, - sizes=(1 - opts.testSize, 0.0, opts.testSize), + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), balanced=True, seed=fold_no, ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -872,6 +857,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, + y_val=y_val, y_train=y_train, y_test=y_test, fold=fold_no, @@ -880,20 +867,6 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{fold_no}.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{fold_no}" - ) - ) if opts.wabTracking: wandb.finish() fold_no += 1 @@ -901,30 +874,20 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: best_fold = pd.concat(performance_list, ignore_index=True).sort_values( by=["p_1", "r_1", "MCC"], ascending=False, ignore_index=True )["fold"][0] - # rename the fold to best fold - src = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_Fold-{best_fold}.model.weights.hdf5", - ) - dst = os.path.join( - opts.outputDir, - f"{target}_scaffold_single-labeled_BEST_Fold-{best_fold}.model.weights.hdf5", - ) - os.rename(src, dst) + src = os.path.join("tmp/", f"{target}/fold-{best_fold}/") + if opts.compressFeatures: + opts.outputDir = opts.ecModelDir + dst = os.path.join(opts.outputDir, f"{target}/") - src_dir = os.path.join( - opts.outputDir, f"{target}_scaffold_saved_model_{best_fold}" - ) - dst_dir = os.path.join( - opts.outputDir, - f"{target}_scaffold_saved_model_BEST_FOLD_{best_fold}", - ) + # Ensure the destination directory exists + os.makedirs(dst, exist_ok=True) + os.makedirs(src, exist_ok=True) - if path.isdir(dst_dir): - shutil.rmtree(dst_dir) + # Copy all contents from the source (best fold) to the destination + shutil.copytree(src, dst, dirs_exist_ok=True) - # Rename source directory to destination directory - os.rename(src_dir, dst_dir) + # Optionally, clean up the temporary directory + shutil.rmtree("tmp/") else: logging.info( @@ -941,7 +904,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: ) elif opts.split_type == "molecular_weight": logging.info("You can use molecular_weight split once.") - for idx, target in enumerate(targets): + for target in targets: df = prepare_nn_training_data(df, target, opts, return_dataframe=True) relevant_cols = ["smiles"] + ["fp"] + [target] if opts.compressFeatures: @@ -951,10 +914,12 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: df_task.reset_index(drop=True, inplace=True) if opts.kFolds == 1: train_set, val_set, test_set = weight_split( - df_task, bias="small", sizes=(1 - opts.testSize, 0.0, opts.testSize) + df_task, + bias="small", + sizes=(1 - opts.testSize, opts.testSize / 2, opts.testSize / 2), ) - x_train, y_train, x_test, y_test = get_x_y( - df_task, target, train_set, test_set, opts + x_train, y_train, x_test, y_test, x_val, y_val = get_x_y( + df_task, target, train_set, val_set, test_set, opts ) if opts.wabTracking and not opts.aeWabTracking: wandb.init( @@ -971,6 +936,8 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: performance = fit_and_evaluate_model( x_train=x_train, x_test=x_test, + x_val=x_val, + y_val=y_val, y_train=y_train, y_test=y_test, fold=0, @@ -978,18 +945,7 @@ def train_single_label_models(df: pd.DataFrame, opts: options.Options) -> None: opts=opts, ) performance_list.append(performance) - trained_model = define_single_label_model( - input_size=len(x_train[0]), opts=opts - ) - trained_model.save_weights( - path.join( - opts.outputDir, - f"{target}_weight_single-labeled_Fold-0.model.weights.hdf5", - ) - ) - trained_model.save( - filepath=path.join(opts.outputDir, f"{target}_weight_saved_model_0") - ) + elif opts.kFolds > 1: raise Exception( f"Unsupported number of folds: {opts.kFolds} for {opts.split_type} split.\ diff --git a/dfpl/utils.py b/dfpl/utils.py index db3d6ec1..ac6d68de 100644 --- a/dfpl/utils.py +++ b/dfpl/utils.py @@ -1,12 +1,16 @@ +import argparse import json import logging import os import pathlib +import sys import warnings from collections import defaultdict +from pathlib import Path from random import Random -from typing import Dict, List, Set, Tuple, Union +from typing import Dict, List, Set, Tuple, Type, TypeVar, Union +import jsonpickle import numpy as np import pandas as pd from rdkit import Chem, RDLogger @@ -14,7 +18,48 @@ from rdkit.Chem.Scaffolds import MurckoScaffold from tqdm import tqdm +# Define a type variable + + RDLogger.DisableLog("rdApp.*") +T = TypeVar("T") + + +def parseCmdArgs(cls: Type[T], args: argparse.Namespace) -> T: + """ + Parses command-line arguments to create an instance of the given class. + + Args: + cls: The class to create an instance of. + args: argparse.Namespace containing the command-line arguments. + + Returns: + An instance of cls populated with values from the command-line arguments. + """ + # Extract argument flags from sys.argv + arg_flags = {arg.lstrip("-") for arg in sys.argv if arg.startswith("-")} + + # Create the result instance, which will be modified and returned + result = cls() + + # Load JSON file if specified + if hasattr(args, "configFile") and args.configFile: + jsonFile = Path(args.configFile) + if jsonFile.exists() and jsonFile.is_file(): + with jsonFile.open() as f: + content = jsonpickle.decode(f.read()) + for key, value in vars(content).items(): + setattr(result, key, value) + else: + raise ValueError("Could not find JSON input file") + + # Override with user-provided command-line arguments + for key in arg_flags: + if hasattr(args, key): + user_value = getattr(args, key, None) + setattr(result, key, user_value) + + return result def makePathAbsolute(p: str) -> str: @@ -31,20 +76,34 @@ def createDirectory(directory: str): os.makedirs(path) -def createArgsFromJson(in_json: str, ignore_elements: list, return_json_object: bool): +def createArgsFromJson(jsonFile: str): arguments = [] - with open(in_json, "r") as f: + ignore_elements = ["py/object"] + + with open(jsonFile, "r") as f: data = json.load(f) + + # Check each key in the JSON file against command-line arguments for key, value in data.items(): if key not in ignore_elements: + # Prepare the command-line argument format + cli_arg_key = f"--{key}" + + # Check if this argument is provided in the command line + if cli_arg_key in sys.argv: + # Find the index of the argument in sys.argv and get its value + arg_index = sys.argv.index(cli_arg_key) + 1 + if arg_index < len(sys.argv): + cli_value = sys.argv[arg_index] + value = cli_value # Override JSON value with command-line value + + # Append the argument and its value to the list if key == "extra_metrics" and isinstance(value, list): - arguments.append("--extra_metrics") + arguments.append(cli_arg_key) arguments.extend(value) else: - arguments.append("--" + str(key)) - arguments.append(str(value)) - if return_json_object: - return arguments, data + arguments.extend([cli_arg_key, str(value)]) + return arguments @@ -140,7 +199,7 @@ def inchi_to_mol(inchi: str) -> Chem.Mol: def weight_split( - data: pd.DataFrame, bias: str, sizes: Tuple[float, float, float] = (0.8, 0, 0.2) + data: pd.DataFrame, bias: str, sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1) ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: if not (len(sizes) == 3 and np.isclose(sum(sizes), 1)): raise ValueError(f"Invalid train/val/test splits! got: {sizes}") @@ -188,7 +247,7 @@ def weight_split( def ae_scaffold_split( data: pd.DataFrame, - sizes: Tuple[float, float, float] = (0.8, 0, 0.2), + sizes: Tuple[float, float, float] = (0.8, 0.1, 0.1), balanced: bool = False, key_molecule_index: int = 0, seed: int = 0, diff --git a/dfpl/vae.py b/dfpl/vae.py index d0a89dbe..911221cc 100644 --- a/dfpl/vae.py +++ b/dfpl/vae.py @@ -1,8 +1,6 @@ -import csv import logging import math import os.path -from os.path import basename from typing import Tuple import numpy as np @@ -10,328 +8,136 @@ import tensorflow.keras.metrics as metrics import wandb from keras import backend as K -from sklearn.model_selection import train_test_split -from tensorflow.keras import initializers, optimizers +from tensorflow.keras import optimizers from tensorflow.keras.layers import Dense, Input, Lambda from tensorflow.keras.models import Model from tensorflow.python.framework.ops import disable_eager_execution from dfpl import callbacks from dfpl import history as ht -from dfpl import options, settings -from dfpl.utils import ae_scaffold_split, weight_split +from dfpl import options +from dfpl.autoencoder import create_dense_layer, setup_train_test_split disable_eager_execution() -def define_vae_model(opts: options.Options, output_bias=None) -> Tuple[Model, Model]: +def define_vae_model(opts: options.Options) -> Tuple[Model, Model]: + """ + Define and compile a Variational Autoencoder (VAE) model based on the given options. + + :param opts: Training options with model parameters. + :param output_bias: Bias for the output layer, used for initializing the last layer. + :return: Tuple containing the VAE and encoder models. + """ input_size = opts.fpSize encoding_dim = opts.encFPSize - ac_optimizer = optimizers.Adam( - learning_rate=opts.aeLearningRate, decay=opts.aeLearningRateDecay - ) - - if output_bias is not None: - output_bias = initializers.Constant(output_bias) - # get the number of meaningful hidden layers (latent space included) - hidden_layer_count = round(math.log2(input_size / encoding_dim)) - - # the input placeholder + lr_schedule = optimizers.schedules.ExponentialDecay( + opts.aeLearningRate, + decay_steps=1000, + decay_rate=opts.aeLearningRateDecay, + staircase=True, + ) + vae_optimizer = optimizers.legacy.Adam(learning_rate=lr_schedule) input_vec = Input(shape=(input_size,)) + initial_layer_size = int(input_size / 2) + encoded = create_dense_layer( + input_vec, initial_layer_size, opts.aeActivationFunction + ) - # 1st hidden layer, that receives weights from input layer - # equals bottleneck layer, if hidden_layer_count==1! - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / 2), activation=opts.aeActivationFunction - )(input_vec) - else: - encoded = Dense( - units=int(input_size / 2), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(input_vec) - - if hidden_layer_count > 1: - # encoding layers, incl. bottle-neck - for i in range(1, hidden_layer_count): - factor_units = 2 ** (i + 1) - # print(f'{factor_units}: {int(input_size / factor_units)}') - if opts.aeActivationFunction != "selu": - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - encoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # latent space layers - factor_units = 2 ** (hidden_layer_count - 1) - if opts.aeActivationFunction != "selu": - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(encoded) - else: - z_mean = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - z_log_var = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(encoded) - - # sampling layer - def sampling(args): - z_mean, z_log_var = args - batch = K.shape(z_mean)[0] - dim = K.int_shape(z_mean)[1] - epsilon = K.random_normal(shape=(batch, dim)) - return z_mean + K.exp(0.5 * z_log_var) * epsilon - - # sample from latent space - z = Lambda(sampling, output_shape=(int(input_size / factor_units),))( - [z_mean, z_log_var] - ) - decoded = z - # decoding layers - for i in range(hidden_layer_count - 2, 0, -1): - factor_units = 2**i - # print(f'{factor_units}: {int(input_size/factor_units)}') - if opts.aeActivationFunction != "selu": - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - )(decoded) - else: - decoded = Dense( - units=int(input_size / factor_units), - activation=opts.aeActivationFunction, - kernel_initializer="lecun_normal", - )(decoded) - - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(decoded) - - else: - # output layer - decoded = Dense( - units=input_size, activation="sigmoid", bias_initializer=output_bias - )(encoded) + # Start `layer_sizes` with the initial layer size (1024) + layer_sizes = [initial_layer_size] - autoencoder = Model(input_vec, decoded) + # Building encoding layers and storing their sizes + hidden_layer_count = round(math.log2(input_size / encoding_dim)) + for i in range(1, hidden_layer_count - 1): + layer_size = int(input_size / (2 ** (i + 1))) + layer_sizes.append(layer_size) + encoded = create_dense_layer(encoded, layer_size, opts.aeActivationFunction) + + # Latent space layers + z_mean = create_dense_layer(encoded, encoding_dim, opts.aeActivationFunction) + z_log_var = create_dense_layer(encoded, encoding_dim, opts.aeActivationFunction) + # Sampling layer + + def sampling(args): + z_mean, z_log_var = args + batch = K.shape(z_mean)[0] + dim = K.int_shape(z_mean)[1] + epsilon = K.random_normal(shape=(batch, dim)) + return z_mean + K.exp(opts.vaeBeta * z_log_var) * epsilon + + z = Lambda(sampling, output_shape=(encoding_dim,))([z_mean, z_log_var]) + + # Build decoder layers starting directly from `z` + decoded = z + for layer_size in reversed(layer_sizes): + decoded = create_dense_layer(decoded, layer_size, opts.aeActivationFunction) + + # Final output layer to reconstruct input + decoded = Dense(units=input_size, activation="sigmoid")(decoded) + + # Define VAE and encoder models + vae = Model(input_vec, decoded) + encoder = Model(input_vec, z_mean) - # KL divergence loss + # Define custom loss def kl_loss(z_mean, z_log_var): return -0.5 * K.sum( 1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1 ) - # binary cross-entropy loss def bce_loss(y_true, y_pred): return metrics.binary_crossentropy(y_true, y_pred) - # combined loss def vae_loss(y_true, y_pred): - bce = bce_loss(y_true, y_pred) - kl = kl_loss(z_mean, z_log_var) - return bce + 0.5 * kl + return bce_loss(y_true, y_pred) + 0.5 * kl_loss(z_mean, z_log_var) - autoencoder.compile( - optimizer=ac_optimizer, loss=vae_loss, metrics=[bce_loss, kl_loss] - ) - - # build encoder model - encoder = Model(input_vec, z_mean) + vae.compile(optimizer=vae_optimizer, loss=vae_loss, metrics=[kl_loss, bce_loss]) + vae.summary(print_fn=logging.info) - return autoencoder, encoder + return vae, encoder -def train_full_vae(df: pd.DataFrame, opts: options.Options) -> Model: +def train_full_vae( + df: pd.DataFrame, opts: options.Options +) -> Tuple[Model, np.ndarray, np.ndarray]: """ - Trains an autoencoder on the given feature matrix X. The response matrix is only used to - split the data into meaningful test and train sets. + Trains a VAE on the provided data and returns the trained encoder and split indices. - :param opts: Command line arguments as defined in options.py - :param df: Pandas dataframe that contains the SMILES/InChI data for training the autoencoder - :return: The encoder model of the trained autoencoder + :param df: DataFrame containing SMILES/InChI data for training. + :param opts: Training options. + :return: The encoder model of the trained VAE and split indices. """ - - # If wandb tracking is enabled for VAE weights but not for the main program, initialize a new wandb run - if opts.aeWabTracking and not opts.wabTracking: + # Initialize wandb tracking if needed + if opts.aeWabTracking: wandb.init(project=f"VAE_{opts.aeSplitType}") - # Define output files for VAE and encoder weights - if opts.ecWeightsFile == "": - # If no encoder weights file is specified, use the input file name to generate a default file name - logging.info("No VAE encoder weights file specified") - base_file_name = ( - os.path.splitext(basename(opts.inputFile))[0] - + opts.aeType - + opts.aeSplitType - ) - logging.info( - f"(variational) encoder weights will be saved in {base_file_name}.autoencoder.hdf5" - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join( - # opts.outputDir, base_file_name + ".encoder.weights.hdf5" - # ) - else: - # If an encoder weights file is specified, use it as the encoder weights file name - logging.info(f"VAE encoder will be saved in {opts.ecWeightsFile}") - base_file_name = ( - os.path.splitext(basename(opts.ecWeightsFile))[0] + opts.aeSplitType - ) - vae_weights_file = os.path.join( - opts.outputDir, base_file_name + ".vae.weights.hdf5" - ) - # ec_weights_file = os.path.join(opts.outputDir, opts.ecWeightsFile) + # Define paths for saving weights + save_path = os.path.join(opts.ecModelDir, f"vae_weights.h5") - # Collect the callbacks for training - callback_list = callbacks.autoencoder_callback( - checkpoint_path=vae_weights_file, opts=opts - ) - # Select all fingerprints that are valid and turn them into a numpy array - fp_matrix = np.array( - df[df["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - logging.info( - f"Training VAE on a matrix of shape {fp_matrix.shape} with type {fp_matrix.dtype}" - ) - assert 0.0 <= opts.testSize <= 0.5 - if opts.aeSplitType == "random": - logging.info("Training VAE using random split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - # Split data into test and training data - if opts.aeWabTracking: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train, x_test, _, _ = train_test_split( - fp_matrix, train_indices, test_size=opts.testSize, random_state=42 - ) - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "scaffold_balanced": - logging.info("Training autoencoder using scaffold split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - # if opts.aeWabTracking: - train_data, val_data, test_data = ae_scaffold_split( - df, - sizes=(1 - opts.testSize, 0.0, opts.testSize), - balanced=True, - seed=42, - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - else: - x_train = fp_matrix - x_test = None - elif opts.aeSplitType == "molecular_weight": - logging.info("Training autoencoder using molecular weight split") - train_indices = np.arange(fp_matrix.shape[0]) - if opts.testSize > 0.0: - # if opts.aeWabTracking: - train_data, val_data, test_data = weight_split( - df, sizes=(1 - opts.testSize, 0.0, opts.testSize), bias="small" - ) - x_train = np.array( - train_data[train_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - x_test = np.array( - test_data[test_data["fp"].notnull()]["fp"].to_list(), - dtype=settings.ac_fp_numpy_type, - copy=settings.numpy_copy_values, - ) - else: - x_train = fp_matrix - x_test = None - else: - raise ValueError(f"Invalid split type: {opts.split_type}") - if opts.testSize > 0.0: - train_indices = train_indices[train_indices < x_train.shape[0]] - test_indices = np.arange(x_train.shape[0], x_train.shape[0] + x_test.shape[0]) - else: - test_indices = None - ids, counts = np.unique(x_train.flatten(), return_counts=True) - count_dict = dict(zip(ids, counts)) - if count_dict[0] == 0: - initial_bias = None - logging.info("No zeroes in training labels. Setting initial_bias to None.") - else: - initial_bias = np.log([count_dict[1] / count_dict[0]]) - logging.info(f"Initial bias for last sigmoid layer: {initial_bias[0]}") - if opts.testSize > 0.0: - logging.info(f"VAE training/testing mode with train- and test-samples") - logging.info(f"VAE train data shape {x_train.shape} with type {x_train.dtype}") - logging.info(f"VAE test data shape {x_test.shape} with type {x_test.dtype}") - else: - logging.info(f"VAE full train mode without test-samples") - logging.info(f"VAE train data shape {x_train.shape} with type {x_train.dtype}") + x_train, x_test, train_indices, test_indices = setup_train_test_split(df, opts) + + # Define VAE and encoder models + vae, encoder = define_vae_model(opts) - (vae, encoder) = define_vae_model(opts, output_bias=initial_bias) - # Train the VAE on the training data + # Set up callbacks and train the VAE model + callback_list = callbacks.autoencoder_callback(checkpoint_path=save_path, opts=opts) vae_hist = vae.fit( x_train, x_train, epochs=opts.aeEpochs, batch_size=opts.aeBatchSize, verbose=opts.verbose, - callbacks=callback_list, - validation_data=(x_test, x_test) if opts.testSize > 0.0 else None, + callbacks=[callback_list], + validation_data=(x_test, x_test) if x_test is not None else None, ) - # Save the VAE weights - logging.info(f"VAE weights stored in file: {vae_weights_file}") - ht.store_and_plot_history( - base_file_name=os.path.join(opts.outputDir, base_file_name + ".VAE"), - hist=vae_hist, - ) - save_path = os.path.join(opts.ecModelDir, f"{opts.aeSplitType}_VAE.h5") - if opts.testSize > 0.0: - (callback_vae, callback_encoder) = define_vae_model(opts) - callback_vae.load_weights(filepath=vae_weights_file) - callback_encoder.save(filepath=save_path) - else: - encoder.save(filepath=save_path) - latent_space = encoder.predict(fp_matrix) - latent_space_file = os.path.join( - opts.outputDir, base_file_name + ".latent_space.csv" - ) - with open(latent_space_file, "w", newline="") as file: - writer = csv.writer(file) - writer.writerows(latent_space) + # Store training history + ht.store_and_plot_history(base_file_name=save_path, hist=vae_hist) + + # load the whole vae from the checkpoint + vae.load_weights(save_path) + encoder.save_weights(os.path.join(opts.ecModelDir, "encoder_weights.h5")) + return encoder, train_indices, test_indices diff --git a/environment.yml b/environment.yml index 3c2e7a6c..164db6bc 100644 --- a/environment.yml +++ b/environment.yml @@ -3,21 +3,16 @@ channels: - conda-forge - defaults dependencies: - # dev requirements - - conda-build=3.21.8 - - conda=4.12.0 - - pip=22.0.4 - - pytest=7.1.1 # application requirements - jsonpickle=2.1 - matplotlib=3.5.1 - - numpy=1.19.5 + - numpy=1.22.0 - pandas=1.4.2 - rdkit=2022.03.1 - scikit-learn=1.0.2 + - keras=2.9.0 + - tensorflow-gpu=2.9.3 + - wandb=0.12.0 + - umap-learn=0.1.1 - seaborn=0.12.2 - - tensorflow-gpu=2.6.0 - - wandb=0.12 - - umap=0.1.1 - - pip: - - git+https://github.com/soulios/chemprop.git@1d73523e49aa28a90b74edc04aaf45d7e124e338 \ No newline at end of file + - chemprop=1.7.1 \ No newline at end of file diff --git a/example/predict.json b/example/predict.json index 252965e3..b1f16e4b 100755 --- a/example/predict.json +++ b/example/predict.json @@ -1,12 +1,11 @@ { "py/object": "dfpl.options.Options", - "inputFile": "tests/data/smiles.csv", + "inputFile": "tests/data/tox21.csv", "outputDir": "example/results_predict/", "outputFile": "smiles.csv", - "ecModelDir": "example/results_train/random_autoencoder/", - "ecWeightsFile": "", - "fnnModelDir": "example/results_train/AR_saved_model", - "compressFeatures": true, - "trainAC": false, - "trainFNN": false + "fnnModelDir": "example/results_train/NR-AR", + "ecModelDir": "example/results_train/vae", + "ecWeightsFile": "encoder_weights.h5", + "aeType": "variational", + "compressFeatures": false } diff --git a/example/predictgnn.json b/example/predictgnn.json index 157b5e05..b3c8f6d8 100644 --- a/example/predictgnn.json +++ b/example/predictgnn.json @@ -1,7 +1,6 @@ { "py/object": "dfpl.options.GnnOptions", - "test_path": "tests/data/smiles.csv", - "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt", - "save_dir": "preds_dmpnn", - "saving_name": "DMPNN_preds.csv" + "test_path": "tests/data/tox21.csv", + "preds_path": "preds_dmpnn/DMPNN_preds.csv", + "checkpoint_path": "dmpnn-random/fold_0/model_0/model.pt" } \ No newline at end of file diff --git a/example/train.json b/example/train.json index 62f2abb4..8f811df3 100755 --- a/example/train.json +++ b/example/train.json @@ -2,25 +2,25 @@ "py/object": "dfpl.options.Options", "inputFile": "tests/data/S_dataset.csv", "outputDir": "example/results_train/", - "ecModelDir": "example/results_train/", - "ecWeightsFile": "random_autoencoder.hdf5", + "ecModelDir": "example/results_train/vae", + "ecWeightsFile": "", "verbose": 2, - "trainAC": true, - "compressFeatures": true, + "trainAC": false, + "compressFeatures": false, + "visualizeLatent": false, "encFPSize": 256, "aeSplitType": "random", - "aeEpochs": 2, + "aeEpochs": 11, "aeBatchSize": 351, "aeOptimizer": "Adam", "aeActivationFunction": "relu", "aeLearningRate": 0.001, - "aeLearningRateDecay": 0.0001, + "aeLearningRateDecay": 0.96, "aeType": "deterministic", + "vaeBeta": 0.5, - "type": "smiles", - "fpType": "topological", "fpSize": 2048, "split_type": "random", "enableMultiLabel": false, @@ -40,6 +40,7 @@ "activationFunction": "selu", "dropout": 0.0107, "learningRate": 0.0000022, + "learningRateDecay": 0.96, "l2reg": 0.001, "aeWabTracking": false, diff --git a/example/traingnn.json b/example/traingnn.json index 714fa80a..5dc6a0a5 100644 --- a/example/traingnn.json +++ b/example/traingnn.json @@ -3,7 +3,7 @@ "data_path": "tests/data/S_dataset.csv", "save_dir": "dmpnn-random/", "epochs": 2, - "num_folds": 2, + "num_folds": 1, "metric": "accuracy", "loss_function": "binary_cross_entropy", "split_type": "random", diff --git a/tests/data/smiles.csv b/tests/data/smiles.csv deleted file mode 100644 index 9383afdd..00000000 --- a/tests/data/smiles.csv +++ /dev/null @@ -1,7 +0,0 @@ -smiles -CN(C)c1ccc(cc1)C(=O)c2ccc(cc2)N(C)C -CC12CCC3C(CCC4=CC(=O)CCC34C)C1CCC2=O -CC12CCC3C(CCc4cc(O)ccc34)C1CCC2O -Oc1c(Br)cc(cc1Br)C#N -Oc1ccc(C=Cc2cc(O)cc(O)c2)cc1 -Oc1ccc(cc1)c2ccccc2 \ No newline at end of file diff --git a/tests/run_predictgnn.py b/tests/run_predictgnn.py index 979c2868..01d20ec7 100644 --- a/tests/run_predictgnn.py +++ b/tests/run_predictgnn.py @@ -1,8 +1,6 @@ import logging -import os import pathlib -import pandas as pd from chemprop import args, train import dfpl.options as opt @@ -26,26 +24,12 @@ def test_predictdmpnn(opts: opt.GnnOptions) -> None: ) json_arg_path = utils.makePathAbsolute(f"{example_directory}/predictgnn.json") - ignore_elements = [ - "py/object", - "checkpoint_paths", - "save_dir", - "saving_name", - ] - arguments, data = utils.createArgsFromJson( - json_arg_path, ignore_elements, return_json_object=True + arguments = utils.createArgsFromJson( + json_arg_path, ) - arguments.append("--preds_path") - arguments.append("") - save_dir = data.get("save_dir") - name = data.get("saving_name") - opts = args.PredictArgs().parse_args(arguments) - opts.preds_path = os.path.join(save_dir, name) - df = pd.read_csv(opts.test_path) - smiles = [[row.smiles] for _, row in df.iterrows()] - train.make_predictions(args=opts, smiles=smiles) + train.make_predictions(args=opts) print("predictdmpnn test complete.") diff --git a/tests/run_traingnn.py b/tests/run_traingnn.py index 582d4627..adae40d2 100644 --- a/tests/run_traingnn.py +++ b/tests/run_traingnn.py @@ -10,7 +10,6 @@ test_train_args = opt.GnnOptions( configFile=utils.makePathAbsolute(f"{example_directory}/traingnn.json"), save_dir=utils.makePathAbsolute(f"{project_directory}/output"), - total_epochs=1, )