From ef746c1ec171393d49baeb11a5bb7d36035b4c9b Mon Sep 17 00:00:00 2001
From: ferbsx <itrytocode@gmail.com>
Date: Tue, 28 Apr 2026 14:29:49 +0200
Subject: [PATCH 01/11] add option to select protein IDs as the dimensionality
 reduction value

---
 .../data_analysis/dimension_reduction.py      | 19 +++++++++++++------
 backend/protzilla/methods/data_analysis.py    | 14 ++++++++++++++
 backend/protzilla/utilities/transform_dfs.py  | 12 ++++++++++--
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py
index 7c831c53b..a3292fb85 100644
--- a/backend/protzilla/data_analysis/dimension_reduction.py
+++ b/backend/protzilla/data_analysis/dimension_reduction.py
@@ -17,6 +17,7 @@ def t_sne(
     n_components: int = 2,
     perplexity: float = 30.0,
     metric: str = "euclidean",
+    value: str = "Sample",
     random_state: int = 42,
     max_iter: int = 1000,
     n_iter_without_progress: int = 300,
@@ -38,6 +39,8 @@ def t_sne(
     :param metric: The metric to use when calculating distance between instances in a
         feature array. Possible metrics are: euclidean, manhattan, cosine and haversine
     :type metric: str
+    :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction.
+    :type value: str
     :param random_state: determines the random number generator.
     :type random_state: int
     :param max_iter: maximum number of iterations for the optimization
@@ -59,9 +62,9 @@ def t_sne(
     """
 
     input_df = protein_df
-
+    columns = 'Protein ID' if value == 'Sample' else 'Sample'
     intensity_df_wide = (
-        long_to_wide(input_df) if is_long_format(input_df) else input_df.copy()
+        long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df.copy()
     )
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
@@ -70,8 +73,8 @@ def t_sne(
         )
     if perplexity >= intensity_df_wide.shape[0]:
         raise ValueError(
-            "Perplexity must be less than the number of samples. In the selected dataframe there "
-            f"is {intensity_df_wide.shape[0]} samples"
+            f"Perplexity must be less than the number of {value}s. In the selected dataframe there "
+            f"are {intensity_df_wide.shape[0]} {value}s."
         )
     if (
         min(intensity_df_wide.shape[0], intensity_df_wide.shape[1]) <= n_components
@@ -80,7 +83,7 @@ def t_sne(
         raise ValueError(
             "The number of dimensions of the embedded space must be between 1 and "
             f"{min(intensity_df_wide.shape[0], intensity_df_wide.shape[1])} (the smaller one of number of "
-            "samples/features). "
+            f"{value}s/features). "
         )
     if n_components > 3 and method == TSNEMethod.barnes_hut.value:
         raise ValueError(
@@ -112,6 +115,7 @@ def umap(
     n_components: int = 2,
     min_dist: float = 0.1,
     metric: str = "euclidean",
+    value: str = "Sample",
     random_state: int = 42,
     transform_seed: int = 42,
 ):
@@ -138,6 +142,8 @@ def umap(
     :param metric: The metric to use when calculating distance between instances in a
         feature array.
     :type metric: str
+    :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction.
+    :type value: str
     :param random_state: determines the random number generator.
     :type random_state: int
     :param transform_seed: Random seed used for the stochastic aspects of the transform
@@ -155,7 +161,8 @@ def umap(
 
     input_df = protein_df
 
-    intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df
+    columns = 'Protein ID' if value == 'Sample' else 'Sample'
+    intensity_df_wide = long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
             "UMAP does not accept missing values encoded as NaN. Consider preprocessing your data to remove NaN "
diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py
index d95851d70..739a9dc37 100644
--- a/backend/protzilla/methods/data_analysis.py
+++ b/backend/protzilla/methods/data_analysis.py
@@ -212,6 +212,10 @@ class DimensionReductionMetric(Enum):
     manhattan = "manhattan"
     cosine = "cosine"
 
+class DimensionReductionValue(Enum):
+    protein = "Protein ID"
+    sample = "Sample"
+
 
 class DataAnalysisStep(Step, ABC):
     section = Section.DATA_ANALYSIS
@@ -1993,6 +1997,11 @@ def create_form(self):
                     label="Distance metric",
                     options=DimensionReductionMetric,
                 ),
+                DropdownField(
+                    name="value",
+                    label="Values for dimension reduction",
+                    options=DimensionReductionValue,
+                ),
                 NumberField(
                     name="random_state",
                     label="Seed for random number generation",
@@ -2065,6 +2074,11 @@ def create_form(self):
                     label="Distance metric",
                     options=DimensionReductionMetric,
                 ),
+                DropdownField(
+                    name="value",
+                    label="Values for dimension reduction",
+                    options=DimensionReductionValue,
+                ),
                 NumberField(
                     name="random_state",
                     label="Seed for random number generation",
diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py
index 4be52b1d5..3f9cfc79d 100644
--- a/backend/protzilla/utilities/transform_dfs.py
+++ b/backend/protzilla/utilities/transform_dfs.py
@@ -3,7 +3,7 @@
 from backend.protzilla.utilities.utilities import default_intensity_column
 
 
-def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
+def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str = "Protein ID", value_name: str | None = None):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -12,6 +12,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     :param intensity_df: the dataframe that should be transformed into
         long format
         :type intensity_df: pd.DataFrame
+    :param index: the column that should be used as index in the wide format
+        dataframe. This should be either "Sample" or "Protein ID".
+    :type index: str
+    :param columns: the column that should be used as columns in the wide format
+        dataframe. This should be either "Sample" or "Protein ID".
+    :type columns: str
+    :param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used.
+    :type value_name: str | None
 
     :return: returns dataframe in wide format suitable for use by
         packages such as sklearn
@@ -21,7 +29,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
         default_intensity_column(intensity_df) if value_name is None else value_name
     )
     return pd.pivot(
-        intensity_df, index="Sample", columns="Protein ID", values=values_name
+        intensity_df, index=index, columns=columns, values=values_name
     )
 
 

From de1f0120230c9dbf598c17c2db4c64c90e2c86f7 Mon Sep 17 00:00:00 2001
From: ferbsx <itrytocode@gmail.com>
Date: Tue, 28 Apr 2026 14:30:12 +0200
Subject: [PATCH 02/11] update test for dimensionality reduction to include new
 function format

---
 .../protzilla/data_analysis/test_dimension_reduction.py   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
index 8941771cf..235065063 100644
--- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
+++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
@@ -109,12 +109,12 @@ def metadata_df():
 
 
 def check_dimensionality_reduction_output(
-    out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int
+    out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int, value: str = "Sample"
 ):
     assert (
-        out_df.shape == (orig_df["Sample"].nunique(), n_components + 1)
-        and out_df["Sample"].sort_values().tolist()
-        == sorted(orig_df["Sample"].unique())
+        out_df.shape == (orig_df[value].nunique(), n_components + 1)
+        and out_df[value].sort_values().tolist()
+        == sorted(orig_df[value].unique())
         and all(
             (
                 pd.api.types.is_numeric_dtype(out_df[f"Component{i + 1}"])

From 5470a5ac22055d489d8fcb1173724c71c4c0b42f Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Tue, 28 Apr 2026 15:12:59 +0200
Subject: [PATCH 03/11] change default dimension reduction value in dropdown

---
 backend/protzilla/methods/data_analysis.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py
index 739a9dc37..d1d7c9f8e 100644
--- a/backend/protzilla/methods/data_analysis.py
+++ b/backend/protzilla/methods/data_analysis.py
@@ -213,8 +213,8 @@ class DimensionReductionMetric(Enum):
     cosine = "cosine"
 
 class DimensionReductionValue(Enum):
-    protein = "Protein ID"
     sample = "Sample"
+    protein = "Protein ID"
 
 
 class DataAnalysisStep(Step, ABC):

From 06e708b49131f5b6f1af904a71dcedf5db82e111 Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Tue, 28 Apr 2026 15:14:13 +0200
Subject: [PATCH 04/11] adapt wording for test_tsne_perplexity

---
 .../tests/protzilla/data_analysis/test_dimension_reduction.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
index 235065063..79a43a0dc 100644
--- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
+++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
@@ -169,8 +169,8 @@ def test_tsne_nan_handling(df_with_nan):
 def test_tsne_perplexity(dimension_reduction_df):
     with pytest.raises(
         ValueError,
-        match="Perplexity must be less than the number of samples. In the selected dataframe there "
-        f"is {dimension_reduction_df['Sample'].nunique()} samples",
+        match="Perplexity must be less than the number of Samples. In the selected dataframe there "
+        f"are {dimension_reduction_df['Sample'].nunique()} Samples",
     ):
         _ = t_sne(
             dimension_reduction_df,

From c0dc7da0680c274a0fe5a43dace8e9b6a84924f8 Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Tue, 28 Apr 2026 16:15:04 +0200
Subject: [PATCH 05/11] black formatting

---
 .../protzilla/data_analysis/dimension_reduction.py | 14 ++++++++++----
 backend/protzilla/methods/data_analysis.py         |  1 +
 backend/protzilla/utilities/transform_dfs.py       | 11 +++++++----
 3 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py
index a3292fb85..9c862e32b 100644
--- a/backend/protzilla/data_analysis/dimension_reduction.py
+++ b/backend/protzilla/data_analysis/dimension_reduction.py
@@ -62,9 +62,11 @@ def t_sne(
     """
 
     input_df = protein_df
-    columns = 'Protein ID' if value == 'Sample' else 'Sample'
+    columns = "Protein ID" if value == "Sample" else "Sample"
     intensity_df_wide = (
-        long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df.copy()
+        long_to_wide(input_df, index=value, columns=columns)
+        if is_long_format(input_df)
+        else input_df.copy()
     )
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
@@ -161,8 +163,12 @@ def umap(
 
     input_df = protein_df
 
-    columns = 'Protein ID' if value == 'Sample' else 'Sample'
-    intensity_df_wide = long_to_wide(input_df, index=value, columns=columns) if is_long_format(input_df) else input_df
+    columns = "Protein ID" if value == "Sample" else "Sample"
+    intensity_df_wide = (
+        long_to_wide(input_df, index=value, columns=columns)
+        if is_long_format(input_df)
+        else input_df
+    )
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
             "UMAP does not accept missing values encoded as NaN. Consider preprocessing your data to remove NaN "
diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py
index d1d7c9f8e..927c285b0 100644
--- a/backend/protzilla/methods/data_analysis.py
+++ b/backend/protzilla/methods/data_analysis.py
@@ -212,6 +212,7 @@ class DimensionReductionMetric(Enum):
     manhattan = "manhattan"
     cosine = "cosine"
 
+
 class DimensionReductionValue(Enum):
     sample = "Sample"
     protein = "Protein ID"
diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py
index 3f9cfc79d..b8552313d 100644
--- a/backend/protzilla/utilities/transform_dfs.py
+++ b/backend/protzilla/utilities/transform_dfs.py
@@ -3,7 +3,12 @@
 from backend.protzilla.utilities.utilities import default_intensity_column
 
 
-def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str = "Protein ID", value_name: str | None = None):
+def long_to_wide(
+    intensity_df: pd.DataFrame,
+    index: str = "Sample",
+    columns: str = "Protein ID",
+    value_name: str | None = None,
+):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -28,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, index: str = "Sample", columns: str
     values_name = (
         default_intensity_column(intensity_df) if value_name is None else value_name
     )
-    return pd.pivot(
-        intensity_df, index=index, columns=columns, values=values_name
-    )
+    return pd.pivot(intensity_df, index=index, columns=columns, values=values_name)
 
 
 def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):

From 215ec1081d181dca1aede08a2b70488984187796 Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Tue, 28 Apr 2026 16:15:30 +0200
Subject: [PATCH 06/11] adding tests

---
 .../data_analysis/test_dimension_reduction.py | 107 +++++++++++++++++-
 backend/tests/protzilla/test_transform_dfs.py |  10 ++
 2 files changed, 114 insertions(+), 3 deletions(-)

diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
index 79a43a0dc..f53106dcd 100644
--- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
+++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py
@@ -109,12 +109,14 @@ def metadata_df():
 
 
 def check_dimensionality_reduction_output(
-    out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int, value: str = "Sample"
+    out_df: pd.DataFrame,
+    orig_df: pd.DataFrame,
+    n_components: int,
+    value: str = "Sample",
 ):
     assert (
         out_df.shape == (orig_df[value].nunique(), n_components + 1)
-        and out_df[value].sort_values().tolist()
-        == sorted(orig_df[value].unique())
+        and out_df[value].sort_values().tolist() == sorted(orig_df[value].unique())
         and all(
             (
                 pd.api.types.is_numeric_dtype(out_df[f"Component{i + 1}"])
@@ -250,6 +252,85 @@ def test_tsne_scatter_plot_integration(
     )
 
 
+@pytest.mark.parametrize(
+    "df_name,n_components,method",
+    [
+        ("dimension_reduction_df", 2, TSNEMethod.exact.value),
+        ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value),
+        ("dimension_reduction_df", 2, TSNEMethod.barnes_hut.value),
+        ("dimension_reduction_four_proteins_df", 2, TSNEMethod.barnes_hut.value),
+    ],
+)
+def test_tsne_protein_id_value(df_name, n_components, method, request):
+    df = request.getfixturevalue(df_name)
+    out = t_sne(
+        df,
+        method=method,
+        n_components=n_components,
+        perplexity=2,
+        value="Protein ID",
+        random_state=42,
+    )
+    check_dimensionality_reduction_output(
+        out["embedded_data"], df, n_components, value="Protein ID"
+    )
+
+
+@pytest.mark.parametrize(
+    "df_name,n_components,method",
+    [
+        ("dimension_reduction_df", 2, TSNEMethod.exact.value),
+        ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value),
+    ],
+)
+def test_tsne_metrics_protein_id(df_name, n_components, method, request):
+    for metric in DimensionReductionMetric:
+        df = request.getfixturevalue(df_name)
+        current_out = t_sne(
+            df,
+            method=method,
+            n_components=n_components,
+            perplexity=2,
+            metric=metric.value,
+            value="Protein ID",
+            random_state=42,
+        )
+        check_dimensionality_reduction_output(
+            current_out["embedded_data"], df, n_components, value="Protein ID"
+        )
+
+
+def test_tsne_perplexity_protein_id(dimension_reduction_df):
+    with pytest.raises(
+        ValueError,
+        match="Perplexity must be less than the number of Protein IDs. In the selected dataframe there "
+        f"are {dimension_reduction_df['Protein ID'].nunique()} Protein IDs",
+    ):
+        _ = t_sne(
+            dimension_reduction_df,
+            method=TSNEMethod.barnes_hut.value,
+            n_components=2,
+            perplexity=5,
+            value="Protein ID",
+        )
+
+
+def test_tsne_n_components_protein_id(dimension_reduction_df):
+    with pytest.raises(
+        ValueError,
+        match="The number of dimensions of the embedded space must be between 1 and "
+        f"{min(dimension_reduction_df['Protein ID'].nunique(), dimension_reduction_df['Sample'].nunique())}",
+    ):
+        _ = t_sne(
+            dimension_reduction_df,
+            method="exact",
+            n_components=5,
+            perplexity=2,
+            value="Protein ID",
+            random_state=42,
+        )
+
+
 @pytest.mark.parametrize(
     "n_components",
     [2, 3],
@@ -271,6 +352,26 @@ def test_umap(dimension_reduction_df, n_components):
         )
 
 
+@pytest.mark.parametrize("n_components", [2])
+def test_umap_protein_id_value(dimension_reduction_four_proteins_df, n_components):
+    for metric in DimensionReductionMetric:
+        current_out = umap(
+            dimension_reduction_four_proteins_df,
+            n_components=n_components,
+            metric=metric.value,
+            n_neighbors=3,
+            value="Protein ID",
+            random_state=42,
+            transform_seed=42,
+        )
+        check_dimensionality_reduction_output(
+            current_out["embedded_data"],
+            dimension_reduction_four_proteins_df,
+            n_components,
+            value="Protein ID",
+        )
+
+
 def test_umap_nan_handling(df_with_nan):
     with pytest.raises(
         ValueError,
diff --git a/backend/tests/protzilla/test_transform_dfs.py b/backend/tests/protzilla/test_transform_dfs.py
index 970fbeda2..97ef31030 100644
--- a/backend/tests/protzilla/test_transform_dfs.py
+++ b/backend/tests/protzilla/test_transform_dfs.py
@@ -90,6 +90,16 @@ def test_transform_long_to_wide(transform_df_long, transform_df_wide):
     pd.testing.assert_frame_equal(long_to_wide(transform_df_long), transform_df_wide)
 
 
+def test_transform_long_to_wide_protein_id_as_index(
+    transform_df_long, transform_df_wide
+):
+    result = long_to_wide(transform_df_long, index="Protein ID", columns="Sample")
+    expected = transform_df_wide.T
+    expected.index.name = "Protein ID"
+    expected.columns.name = "Sample"
+    pd.testing.assert_frame_equal(result, expected)
+
+
 def test_transform_long_to_wide_to_long(
     transform_df_long, transform_df_wide, transform_df_long_gene_name_provider
 ):

From 953b986bac9e1cdd26bb32a5ffe8bf1dbc3b4e8f Mon Sep 17 00:00:00 2001
From: ferbsx <itrytocode@gmail.com>
Date: Tue, 28 Apr 2026 16:44:07 +0200
Subject: [PATCH 07/11] pass value_name by keyword to long_to_wide in
 ptm_analysis after signature change

---
 backend/protzilla/data_analysis/ptm_analysis.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/protzilla/data_analysis/ptm_analysis.py b/backend/protzilla/data_analysis/ptm_analysis.py
index 7174e8a09..860e7d6a7 100644
--- a/backend/protzilla/data_analysis/ptm_analysis.py
+++ b/backend/protzilla/data_analysis/ptm_analysis.py
@@ -48,7 +48,9 @@ def ptms_per_protein_and_sample(psm_df: pd.DataFrame) -> dict:
     modification_df = modification_df[["Sample", "Protein ID", "Modifications"]]
 
     modification_df = (
-        long_to_wide(modification_df, "Modifications").fillna("").reset_index()
+        long_to_wide(modification_df, value_name="Modifications")
+        .fillna("")
+        .reset_index()
     )
 
     return dict(ptm_df=modification_df)

From 156274c2fed92ef392f38d80b4a64e2494b7e767 Mon Sep 17 00:00:00 2001
From: ferbsx <itrytocode@gmail.com>
Date: Tue, 28 Apr 2026 16:44:28 +0200
Subject: [PATCH 08/11] minor typo

---
 backend/protzilla/utilities/utilities.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backend/protzilla/utilities/utilities.py b/backend/protzilla/utilities/utilities.py
index 58d7f4966..52951d67b 100644
--- a/backend/protzilla/utilities/utilities.py
+++ b/backend/protzilla/utilities/utilities.py
@@ -86,7 +86,7 @@ def default_intensity_column(
         return matched_columns[0]
 
     raise ValueError(
-        "No intensity column name provided and no default intensity column could be determined."
+        "No intensity column name provided and no default intensity column could be determined. "
         "Please provide the intensity column name manually to the function call."
     )
 

From 46f5cee07adef3f336dc1ff12f9b4543d7ff5efd Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Wed, 29 Apr 2026 17:04:25 +0200
Subject: [PATCH 09/11] adding Dropdown for selecting the sample name for
 scatter plot

---
 backend/protzilla/methods/data_analysis.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py
index 927c285b0..e074da2f3 100644
--- a/backend/protzilla/methods/data_analysis.py
+++ b/backend/protzilla/methods/data_analysis.py
@@ -904,6 +904,12 @@ def create_form(self):
                     name="metadata_column",
                     label="Choose the column of the metadata dataframe that should be used for coloring",
                 ),
+                DropdownField(
+                    name="sample_name",
+                    label="Choose the column that contains the sample information",
+                    value=DimensionReductionValue.sample.value,
+                    options=DimensionReductionValue,
+                ),
             ],
         )
 
@@ -919,6 +925,7 @@ def modify_form(self, run: Run) -> None:
                     run,
                     instance_identifier=metadata_source,
                     include_sample=False,
+                    required=False,
                     output_key=source_handle,
                 )
             )

From 1c2800198e3cf11ee5988967b7e62eb7edec64f4 Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Wed, 29 Apr 2026 17:05:14 +0200
Subject: [PATCH 10/11] adding parameter to make required optional for choices
 from metadata

---
 backend/protzilla/form_helper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py
index 1367b3e8c..6b549964f 100644
--- a/backend/protzilla/form_helper.py
+++ b/backend/protzilla/form_helper.py
@@ -32,6 +32,7 @@ def get_choices_for_metadata(
     instance_identifier: StepID,
     output_key: DataKey,
     include_sample: bool = True,
+    required: bool = False,
 ) -> list[Option]:
     metadata_df = run.steps.get_step_output(
         output_key=output_key, instance_identifier=instance_identifier
@@ -43,7 +44,8 @@ def get_choices_for_metadata(
         if include_sample
         else [column for column in metadata_df.columns.unique() if column != "Sample"]
     )
-    return to_choices(columns)
+
+    return to_choices(columns, required=required)
 
 
 def get_choices_for_groups(

From 5e23b4e226e207239c376621e73705054e493bce Mon Sep 17 00:00:00 2001
From: yanjo <y.hartmaring@gmx.net>
Date: Wed, 29 Apr 2026 17:07:03 +0200
Subject: [PATCH 11/11] remove hardcoded Sample from scatter_plot to allow
 other values like Protein ID

---
 backend/protzilla/data_analysis/plots.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py
index 06f3469cd..e93ca2c28 100644
--- a/backend/protzilla/data_analysis/plots.py
+++ b/backend/protzilla/data_analysis/plots.py
@@ -38,6 +38,7 @@ def scatter_plot(
     input_df: pd.DataFrame,
     metadata_df: pd.DataFrame | None = None,
     metadata_column: str | None = None,
+    sample_name: str = "Sample",
 ) -> dict:
     """
     Function to create a scatter plot from data.
@@ -49,6 +50,7 @@ def scatter_plot(
     :param metadata_column: the name of the column in `metadata_df` that contains the
         group information for each sample. This parameter is required if `metadata_df`
         is provided.
+    :param sample_name: the name of the sample column, should be the same name for the input_df and the metadata_df
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
@@ -58,19 +60,24 @@ def scatter_plot(
                 "The column selected for annotation is not present in the corresponding metadata dataframe.",
             )
 
+    if sample_name not in input_df.columns:
+        raise ValueError(
+            f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.",
+        )
+
     intensity_df = input_df.copy()
     if isinstance(metadata_df, pd.DataFrame):
         intensity_df = pd.merge(
             intensity_df,
-            metadata_df[["Sample", metadata_column]],
-            on="Sample",
+            metadata_df[[sample_name, metadata_column]],
+            on=sample_name,
             how="left",
         )
     else:
         # Mock a metadata column here so that we can treat dfs with and without metadata the same way
         metadata_column = "mock_metadata_column"
         intensity_df[metadata_column] = None
-    intensity_df = intensity_df.drop(columns="Sample")
+    intensity_df = intensity_df.drop(columns=sample_name)
 
     color_col = (
         metadata_column if intensity_df[metadata_column].notnull().any() else None