From ef746c1ec171393d49baeb11a5bb7d36035b4c9b Mon Sep 17 00:00:00 2001 From: ferbsx Date: Tue, 28 Apr 2026 14:29:49 +0200 Subject: [PATCH 01/11] add option to select protein IDs as the dimensionality reduction value --- .../data_analysis/dimension_reduction.py | 19 +++++++++++++------ backend/protzilla/methods/data_analysis.py | 14 ++++++++++++++ backend/protzilla/utilities/transform_dfs.py | 12 ++++++++++-- 3 files changed, 37 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py index 7c831c53b..a3292fb85 100644 --- a/backend/protzilla/data_analysis/dimension_reduction.py +++ b/backend/protzilla/data_analysis/dimension_reduction.py @@ -17,6 +17,7 @@ def t_sne( n_components: int = 2, perplexity: float = 30.0, metric: str = "euclidean", + value: str = "Sample", random_state: int = 42, max_iter: int = 1000, n_iter_without_progress: int = 300, @@ -38,6 +39,8 @@ def t_sne( :param metric: The metric to use when calculating distance between instances in a feature array. Possible metrics are: euclidean, manhattan, cosine and haversine :type metric: str + :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction. + :type value: str :param random_state: determines the random number generator. :type random_state: int :param max_iter: maximum number of iterations for the optimization @@ -59,9 +62,9 @@ def t_sne( """ input_df = protein_df - + columns = 'Protein ID' if value == 'Sample' else 'Sample' intensity_df_wide = ( - long_to_wide(input_df) if is_long_format(input_df) else input_df.copy() + long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df.copy() ) if intensity_df_wide.isnull().sum().any(): raise ValueError( @@ -70,8 +73,8 @@ def t_sne( ) if perplexity >= intensity_df_wide.shape[0]: raise ValueError( - "Perplexity must be less than the number of samples. 
In the selected dataframe there " - f"is {intensity_df_wide.shape[0]} samples" + f"Perplexity must be less than the number of {value}s. In the selected dataframe there " + f"are {intensity_df_wide.shape[0]} {value}s." ) if ( min(intensity_df_wide.shape[0], intensity_df_wide.shape[1]) <= n_components @@ -80,7 +83,7 @@ def t_sne( raise ValueError( "The number of dimensions of the embedded space must be between 1 and " f"{min(intensity_df_wide.shape[0], intensity_df_wide.shape[1])} (the smaller one of number of " - "samples/features). " + f"{value}s/features). " ) if n_components > 3 and method == TSNEMethod.barnes_hut.value: raise ValueError( @@ -112,6 +115,7 @@ def umap( n_components: int = 2, min_dist: float = 0.1, metric: str = "euclidean", + value: str = "Sample", random_state: int = 42, transform_seed: int = 42, ): @@ -138,6 +142,8 @@ def umap( :param metric: The metric to use when calculating distance between instances in a feature array. :type metric: str + :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction. + :type value: str :param random_state: determines the random number generator. :type random_state: int :param transform_seed: Random seed used for the stochastic aspects of the transform @@ -155,7 +161,8 @@ def umap( input_df = protein_df - intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df + columns = 'Protein ID' if value == 'Sample' else 'Sample' + intensity_df_wide = long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df if intensity_df_wide.isnull().sum().any(): raise ValueError( "UMAP does not accept missing values encoded as NaN. 
Consider preprocessing your data to remove NaN " diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d95851d70..739a9dc37 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -212,6 +212,10 @@ class DimensionReductionMetric(Enum): manhattan = "manhattan" cosine = "cosine" +class DimensionReductionValue(Enum): + protein = "Protein ID" + sample = "Sample" + class DataAnalysisStep(Step, ABC): section = Section.DATA_ANALYSIS @@ -1993,6 +1997,11 @@ def create_form(self): label="Distance metric", options=DimensionReductionMetric, ), + DropdownField( + name="value", + label="Values for dimension reduction", + options=DimensionReductionValue, + ), NumberField( name="random_state", label="Seed for random number generation", @@ -2065,6 +2074,11 @@ def create_form(self): label="Distance metric", options=DimensionReductionMetric, ), + DropdownField( + name="value", + label="Values for dimension reduction", + options=DimensionReductionValue, + ), NumberField( name="random_state", label="Seed for random number generation", diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py index 4be52b1d5..3f9cfc79d 100644 --- a/backend/protzilla/utilities/transform_dfs.py +++ b/backend/protzilla/utilities/transform_dfs.py @@ -3,7 +3,7 @@ from backend.protzilla.utilities.utilities import default_intensity_column -def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): +def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str = "Protein ID", value_name: str | None = None): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. 
@@ -12,6 +12,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame + :param index: the column that should be used as index in the wide format + dataframe. This should be either "Sample" or "Protein ID". + :type index: str + :param columns: the column that should be used as columns in the wide format + dataframe. This should be either "Sample" or "Protein ID". + :type columns: str + :param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used. + :type value_name: str | None :return: returns dataframe in wide format suitable for use by packages such as sklearn @@ -21,7 +29,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): default_intensity_column(intensity_df) if value_name is None else value_name ) return pd.pivot( - intensity_df, index="Sample", columns="Protein ID", values=values_name + intensity_df, index=index, columns=columns, values=values_name ) From de1f0120230c9dbf598c17c2db4c64c90e2c86f7 Mon Sep 17 00:00:00 2001 From: ferbsx Date: Tue, 28 Apr 2026 14:30:12 +0200 Subject: [PATCH 02/11] update test for dimensionality reduction to include new function format --- .../protzilla/data_analysis/test_dimension_reduction.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py index 8941771cf..235065063 100644 --- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py +++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py @@ -109,12 +109,12 @@ def metadata_df(): def check_dimensionality_reduction_output( - out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int + out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: 
int, value: str = "Sample" ): assert ( - out_df.shape == (orig_df["Sample"].nunique(), n_components + 1) - and out_df["Sample"].sort_values().tolist() - == sorted(orig_df["Sample"].unique()) + out_df.shape == (orig_df[value].nunique(), n_components + 1) + and out_df[value].sort_values().tolist() + == sorted(orig_df[value].unique()) and all( ( pd.api.types.is_numeric_dtype(out_df[f"Component{i + 1}"]) From 5470a5ac22055d489d8fcb1173724c71c4c0b42f Mon Sep 17 00:00:00 2001 From: yanjo Date: Tue, 28 Apr 2026 15:12:59 +0200 Subject: [PATCH 03/11] change default dimension reduction value in dropdown --- backend/protzilla/methods/data_analysis.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 739a9dc37..d1d7c9f8e 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -213,8 +213,8 @@ class DimensionReductionMetric(Enum): cosine = "cosine" class DimensionReductionValue(Enum): - protein = "Protein ID" sample = "Sample" + protein = "Protein ID" class DataAnalysisStep(Step, ABC): From 06e708b49131f5b6f1af904a71dcedf5db82e111 Mon Sep 17 00:00:00 2001 From: yanjo Date: Tue, 28 Apr 2026 15:14:13 +0200 Subject: [PATCH 04/11] adapt wording for test_tsne_perplexity --- .../tests/protzilla/data_analysis/test_dimension_reduction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py index 235065063..79a43a0dc 100644 --- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py +++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py @@ -169,8 +169,8 @@ def test_tsne_nan_handling(df_with_nan): def test_tsne_perplexity(dimension_reduction_df): with pytest.raises( ValueError, - match="Perplexity must be less than the number of samples. 
In the selected dataframe there " - f"is {dimension_reduction_df['Sample'].nunique()} samples", + match="Perplexity must be less than the number of Samples. In the selected dataframe there " + f"are {dimension_reduction_df['Sample'].nunique()} Samples", ): _ = t_sne( dimension_reduction_df, From c0dc7da0680c274a0fe5a43dace8e9b6a84924f8 Mon Sep 17 00:00:00 2001 From: yanjo Date: Tue, 28 Apr 2026 16:15:04 +0200 Subject: [PATCH 05/11] black formatting --- .../protzilla/data_analysis/dimension_reduction.py | 14 ++++++++++---- backend/protzilla/methods/data_analysis.py | 1 + backend/protzilla/utilities/transform_dfs.py | 11 +++++++---- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py index a3292fb85..9c862e32b 100644 --- a/backend/protzilla/data_analysis/dimension_reduction.py +++ b/backend/protzilla/data_analysis/dimension_reduction.py @@ -62,9 +62,11 @@ def t_sne( """ input_df = protein_df - columns = 'Protein ID' if value == 'Sample' else 'Sample' + columns = "Protein ID" if value == "Sample" else "Sample" intensity_df_wide = ( - long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df.copy() + long_to_wide(input_df, index=value, columns=columns) + if is_long_format(input_df) + else input_df.copy() ) if intensity_df_wide.isnull().sum().any(): raise ValueError( @@ -161,8 +163,12 @@ def umap( input_df = protein_df - columns = 'Protein ID' if value == 'Sample' else 'Sample' - intensity_df_wide = long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df + columns = "Protein ID" if value == "Sample" else "Sample" + intensity_df_wide = ( + long_to_wide(input_df, index=value, columns=columns) + if is_long_format(input_df) + else input_df + ) if intensity_df_wide.isnull().sum().any(): raise ValueError( "UMAP does not accept missing values encoded as NaN. 
Consider preprocessing your data to remove NaN " diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d1d7c9f8e..927c285b0 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -212,6 +212,7 @@ class DimensionReductionMetric(Enum): manhattan = "manhattan" cosine = "cosine" + class DimensionReductionValue(Enum): sample = "Sample" protein = "Protein ID" diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py index 3f9cfc79d..b8552313d 100644 --- a/backend/protzilla/utilities/transform_dfs.py +++ b/backend/protzilla/utilities/transform_dfs.py @@ -3,7 +3,12 @@ from backend.protzilla.utilities.utilities import default_intensity_column -def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str = "Protein ID", value_name: str | None = None): +def long_to_wide( + intensity_df: pd.DataFrame, + index: str = "Sample", + columns: str = "Protein ID", + value_name: str | None = None, +): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. 
@@ -28,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str values_name = ( default_intensity_column(intensity_df) if value_name is None else value_name ) - return pd.pivot( - intensity_df, index=index, columns=columns, values=values_name - ) + return pd.pivot(intensity_df, index=index, columns=columns, values=values_name) def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): From 215ec1081d181dca1aede08a2b70488984187796 Mon Sep 17 00:00:00 2001 From: yanjo Date: Tue, 28 Apr 2026 16:15:30 +0200 Subject: [PATCH 06/11] adding tests --- .../data_analysis/test_dimension_reduction.py | 107 +++++++++++++++++- backend/tests/protzilla/test_transform_dfs.py | 10 ++ 2 files changed, 114 insertions(+), 3 deletions(-) diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py index 79a43a0dc..f53106dcd 100644 --- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py +++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py @@ -109,12 +109,14 @@ def metadata_df(): def check_dimensionality_reduction_output( - out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int, value: str = "Sample" + out_df: pd.DataFrame, + orig_df: pd.DataFrame, + n_components: int, + value: str = "Sample", ): assert ( out_df.shape == (orig_df[value].nunique(), n_components + 1) - and out_df[value].sort_values().tolist() - == sorted(orig_df[value].unique()) + and out_df[value].sort_values().tolist() == sorted(orig_df[value].unique()) and all( ( pd.api.types.is_numeric_dtype(out_df[f"Component{i + 1}"]) @@ -250,6 +252,85 @@ def test_tsne_scatter_plot_integration( ) +@pytest.mark.parametrize( + "df_name,n_components,method", + [ + ("dimension_reduction_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_df", 2, TSNEMethod.barnes_hut.value), + 
("dimension_reduction_four_proteins_df", 2, TSNEMethod.barnes_hut.value), + ], +) +def test_tsne_protein_id_value(df_name, n_components, method, request): + df = request.getfixturevalue(df_name) + out = t_sne( + df, + method=method, + n_components=n_components, + perplexity=2, + value="Protein ID", + random_state=42, + ) + check_dimensionality_reduction_output( + out["embedded_data"], df, n_components, value="Protein ID" + ) + + +@pytest.mark.parametrize( + "df_name,n_components,method", + [ + ("dimension_reduction_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value), + ], +) +def test_tsne_metrics_protein_id(df_name, n_components, method, request): + for metric in DimensionReductionMetric: + df = request.getfixturevalue(df_name) + current_out = t_sne( + df, + method=method, + n_components=n_components, + perplexity=2, + metric=metric.value, + value="Protein ID", + random_state=42, + ) + check_dimensionality_reduction_output( + current_out["embedded_data"], df, n_components, value="Protein ID" + ) + + +def test_tsne_perplexity_protein_id(dimension_reduction_df): + with pytest.raises( + ValueError, + match="Perplexity must be less than the number of Protein IDs. 
In the selected dataframe there " + f"are {dimension_reduction_df['Protein ID'].nunique()} Protein IDs", + ): + _ = t_sne( + dimension_reduction_df, + method=TSNEMethod.barnes_hut.value, + n_components=2, + perplexity=5, + value="Protein ID", + ) + + +def test_tsne_n_components_protein_id(dimension_reduction_df): + with pytest.raises( + ValueError, + match="The number of dimensions of the embedded space must be between 1 and " + f"{min(dimension_reduction_df['Protein ID'].nunique(), dimension_reduction_df['Sample'].nunique())}", + ): + _ = t_sne( + dimension_reduction_df, + method="exact", + n_components=5, + perplexity=2, + value="Protein ID", + random_state=42, + ) + + @pytest.mark.parametrize( "n_components", [2, 3], @@ -271,6 +352,26 @@ def test_umap(dimension_reduction_df, n_components): ) +@pytest.mark.parametrize("n_components", [2]) +def test_umap_protein_id_value(dimension_reduction_four_proteins_df, n_components): + for metric in DimensionReductionMetric: + current_out = umap( + dimension_reduction_four_proteins_df, + n_components=n_components, + metric=metric.value, + n_neighbors=3, + value="Protein ID", + random_state=42, + transform_seed=42, + ) + check_dimensionality_reduction_output( + current_out["embedded_data"], + dimension_reduction_four_proteins_df, + n_components, + value="Protein ID", + ) + + def test_umap_nan_handling(df_with_nan): with pytest.raises( ValueError, diff --git a/backend/tests/protzilla/test_transform_dfs.py b/backend/tests/protzilla/test_transform_dfs.py index 970fbeda2..97ef31030 100644 --- a/backend/tests/protzilla/test_transform_dfs.py +++ b/backend/tests/protzilla/test_transform_dfs.py @@ -90,6 +90,16 @@ def test_transform_long_to_wide(transform_df_long, transform_df_wide): pd.testing.assert_frame_equal(long_to_wide(transform_df_long), transform_df_wide) +def test_transform_long_to_wide_protein_id_as_index( + transform_df_long, transform_df_wide +): + result = long_to_wide(transform_df_long, index="Protein ID", 
columns="Sample") + expected = transform_df_wide.T + expected.index.name = "Protein ID" + expected.columns.name = "Sample" + pd.testing.assert_frame_equal(result, expected) + + def test_transform_long_to_wide_to_long( transform_df_long, transform_df_wide, transform_df_long_gene_name_provider ): From 953b986bac9e1cdd26bb32a5ffe8bf1dbc3b4e8f Mon Sep 17 00:00:00 2001 From: ferbsx Date: Tue, 28 Apr 2026 16:44:07 +0200 Subject: [PATCH 07/11] updating test based on changes made to underlying function --- backend/protzilla/data_analysis/ptm_analysis.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/data_analysis/ptm_analysis.py b/backend/protzilla/data_analysis/ptm_analysis.py index 7174e8a09..860e7d6a7 100644 --- a/backend/protzilla/data_analysis/ptm_analysis.py +++ b/backend/protzilla/data_analysis/ptm_analysis.py @@ -48,7 +48,9 @@ def ptms_per_protein_and_sample(psm_df: pd.DataFrame) -> dict: modification_df = modification_df[["Sample", "Protein ID", "Modifications"]] modification_df = ( - long_to_wide(modification_df, "Modifications").fillna("").reset_index() + long_to_wide(modification_df, value_name="Modifications") + .fillna("") + .reset_index() ) return dict(ptm_df=modification_df) From 156274c2fed92ef392f38d80b4a64e2494b7e767 Mon Sep 17 00:00:00 2001 From: ferbsx Date: Tue, 28 Apr 2026 16:44:28 +0200 Subject: [PATCH 08/11] minor typo --- backend/protzilla/utilities/utilities.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/protzilla/utilities/utilities.py b/backend/protzilla/utilities/utilities.py index 58d7f4966..52951d67b 100644 --- a/backend/protzilla/utilities/utilities.py +++ b/backend/protzilla/utilities/utilities.py @@ -86,7 +86,7 @@ def default_intensity_column( return matched_columns[0] raise ValueError( - "No intensity column name provided and no default intensity column could be determined." + "No intensity column name provided and no default intensity column could be determined. 
" "Please provide the intensity column name manually to the function call." ) From 46f5cee07adef3f336dc1ff12f9b4543d7ff5efd Mon Sep 17 00:00:00 2001 From: yanjo Date: Wed, 29 Apr 2026 17:04:25 +0200 Subject: [PATCH 09/11] adding Dropdown for selecting the sample name for scatter plot --- backend/protzilla/methods/data_analysis.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index 927c285b0..e074da2f3 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -904,6 +904,12 @@ def create_form(self): name="metadata_column", label="Choose the column of the metadata dataframe that should be used for coloring", ), + DropdownField( + name="sample_name", + label="Choose the column that contains the sample information", + value=DimensionReductionValue.sample.value, + options=DimensionReductionValue, + ), ], ) @@ -919,6 +925,7 @@ def modify_form(self, run: Run) -> None: run, instance_identifier=metadata_source, include_sample=False, + required=False, output_key=source_handle, ) ) From 1c2800198e3cf11ee5988967b7e62eb7edec64f4 Mon Sep 17 00:00:00 2001 From: yanjo Date: Wed, 29 Apr 2026 17:05:14 +0200 Subject: [PATCH 10/11] adding parameter to make required optional for choices from metadata --- backend/protzilla/form_helper.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index 1367b3e8c..6b549964f 100644 --- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -32,6 +32,7 @@ def get_choices_for_metadata( instance_identifier: StepID, output_key: DataKey, include_sample: bool = True, + required: bool = False, ) -> list[Option]: metadata_df = run.steps.get_step_output( output_key=output_key, instance_identifier=instance_identifier @@ -43,7 +44,8 @@ def get_choices_for_metadata( if include_sample else [column for column 
in metadata_df.columns.unique() if column != "Sample"] ) - return to_choices(columns) + + return to_choices(columns, required=required) def get_choices_for_groups( From 5e23b4e226e207239c376621e73705054e493bce Mon Sep 17 00:00:00 2001 From: yanjo Date: Wed, 29 Apr 2026 17:07:03 +0200 Subject: [PATCH 11/11] remove hardcoded Sample from scatter_plot to allow other values like Protein ID --- backend/protzilla/data_analysis/plots.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 06f3469cd..e93ca2c28 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -38,6 +38,7 @@ def scatter_plot( input_df: pd.DataFrame, metadata_df: pd.DataFrame | None = None, metadata_column: str | None = None, + sample_name: str = "Sample", ) -> dict: """ Function to create a scatter plot from data. @@ -49,6 +50,7 @@ def scatter_plot( :param metadata_column: the name of the column in `metadata_df` that contains the group information for each sample. This parameter is required if `metadata_df` is provided. 
+ :param sample_name: the name of the sample column, should be the same name for the input_df and the metadata_df :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ @@ -58,19 +60,24 @@ def scatter_plot( "The column selected for annotation is not present in the corresponding metadata dataframe.", ) + if sample_name not in input_df.columns: + raise ValueError( + f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.", + ) + intensity_df = input_df.copy() if isinstance(metadata_df, pd.DataFrame): intensity_df = pd.merge( intensity_df, - metadata_df[["Sample", metadata_column]], - on="Sample", + metadata_df[[sample_name, metadata_column]], + on=sample_name, how="left", ) else: # Mock a metadata column here so that we can treat dfs with and without metadata the same way metadata_column = "mock_metadata_column" intensity_df[metadata_column] = None - intensity_df = intensity_df.drop(columns="Sample") + intensity_df = intensity_df.drop(columns=sample_name) color_col = ( metadata_column if intensity_df[metadata_column].notnull().any() else None