cschlaffner · ferbsx · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 28, 2026
diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py
@@ -17,6 +17,7 @@ def t_sne(
     n_components: int = 2,
     perplexity: float = 30.0,
     metric: str = "euclidean",
+    value: str = "Sample",
     random_state: int = 42,
     max_iter: int = 1000,
     n_iter_without_progress: int = 300,
@@ -38,6 +39,8 @@ def t_sne(
     :param metric: The metric to use when calculating distance between instances in a
         feature array. Possible metrics are: euclidean, manhattan, cosine and haversine
     :type metric: str
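+    :param value: the column used as row index of the wide-format dataframe, i.e. the entries that are embedded: either "Sample" or "Protein ID". The other column supplies the features. Only applied when the input is in long format.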
+    :type value: str
     :param random_state: determines the random number generator.
     :type random_state: int
     :param max_iter: maximum number of iterations for the optimization
@@ -59,9 +62,11 @@ def t_sne(
     """
 
     input_df = protein_df
-
+    columns = "Protein ID" if value == "Sample" else "Sample"
     intensity_df_wide = (
-        long_to_wide(input_df) if is_long_format(input_df) else input_df.copy()
+        long_to_wide(input_df, index=value, columns=columns)
+        if is_long_format(input_df)
+        else input_df.copy()
     )
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
@@ -70,8 +75,8 @@ def t_sne(
         )
     if perplexity >= intensity_df_wide.shape[0]:
         raise ValueError(
-            "Perplexity must be less than the number of samples. In the selected dataframe there "
-            f"is {intensity_df_wide.shape[0]} samples"
+            f"Perplexity must be less than the number of {value}s. In the selected dataframe there "
+            f"are {intensity_df_wide.shape[0]} {value}s."
         )
     if (
         min(intensity_df_wide.shape[0], intensity_df_wide.shape[1]) <= n_components
@@ -80,7 +85,7 @@ def t_sne(
         raise ValueError(
             "The number of dimensions of the embedded space must be between 1 and "
             f"{min(intensity_df_wide.shape[0], intensity_df_wide.shape[1])} (the smaller one of number of "
-            "samples/features). "
+            f"{value}s/features). "
         )
     if n_components > 3 and method == TSNEMethod.barnes_hut.value:
         raise ValueError(
@@ -112,6 +117,7 @@ def umap(
     n_components: int = 2,
     min_dist: float = 0.1,
     metric: str = "euclidean",
+    value: str = "Sample",
     random_state: int = 42,
     transform_seed: int = 42,
 ):
@@ -138,6 +144,8 @@ def umap(
     :param metric: The metric to use when calculating distance between instances in a
         feature array.
     :type metric: str
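+    :param value: the column used as row index of the wide-format dataframe, i.e. the entries that are embedded: either "Sample" or "Protein ID". The other column supplies the features. Only applied when the input is in long format.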
+    :type value: str
     :param random_state: determines the random number generator.
     :type random_state: int
     :param transform_seed: Random seed used for the stochastic aspects of the transform
@@ -155,7 +163,12 @@ def umap(
 
     input_df = protein_df
 
-    intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df
+    columns = "Protein ID" if value == "Sample" else "Sample"
+    intensity_df_wide = (
+        long_to_wide(input_df, index=value, columns=columns)
+        if is_long_format(input_df)
+        else input_df
+    )
     if intensity_df_wide.isnull().sum().any():
         raise ValueError(
             "UMAP does not accept missing values encoded as NaN. Consider preprocessing your data to remove NaN "

diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py
@@ -38,6 +38,7 @@ def scatter_plot(
     input_df: pd.DataFrame,
     metadata_df: pd.DataFrame | None = None,
     metadata_column: str | None = None,
+    sample_name: str = "Sample",
 ) -> dict:
     """
     Function to create a scatter plot from data.
@@ -49,6 +50,7 @@ def scatter_plot(
     :param metadata_column: the name of the column in `metadata_df` that contains the
         group information for each sample. This parameter is required if `metadata_df`
         is provided.
+    :param sample_name: the name of the sample column. It must be present in `input_df` and, if `metadata_df` is provided, in `metadata_df` as well, because it is used as the merge key.
 
     :return: returns a dictionary containing a list with a plotly figure and/or a list of messages
     """
@@ -58,19 +60,24 @@ def scatter_plot(
                 "The column selected for annotation is not present in the corresponding metadata dataframe.",
             )
 
+    if sample_name not in input_df.columns:
+        raise ValueError(
+            f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.",
+        )
+
     intensity_df = input_df.copy()
     if isinstance(metadata_df, pd.DataFrame):
         intensity_df = pd.merge(
             intensity_df,
-            metadata_df[["Sample", metadata_column]],
-            on="Sample",
+            metadata_df[[sample_name, metadata_column]],
+            on=sample_name,
             how="left",
         )
     else:
         # Mock a metadata column here so that we can treat dfs with and without metadata the same way
         metadata_column = "mock_metadata_column"
         intensity_df[metadata_column] = None
-    intensity_df = intensity_df.drop(columns="Sample")
+    intensity_df = intensity_df.drop(columns=sample_name)
 
     color_col = (
         metadata_column if intensity_df[metadata_column].notnull().any() else None

diff --git a/backend/protzilla/data_analysis/ptm_analysis.py b/backend/protzilla/data_analysis/ptm_analysis.py
@@ -48,7 +48,9 @@ def ptms_per_protein_and_sample(psm_df: pd.DataFrame) -> dict:
     modification_df = modification_df[["Sample", "Protein ID", "Modifications"]]
 
     modification_df = (
-        long_to_wide(modification_df, "Modifications").fillna("").reset_index()
+        long_to_wide(modification_df, value_name="Modifications")
+        .fillna("")
+        .reset_index()
     )
 
     return dict(ptm_df=modification_df)

diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py
@@ -32,6 +32,7 @@ def get_choices_for_metadata(
     instance_identifier: StepID,
     output_key: DataKey,
     include_sample: bool = True,
+    required: bool = False,
 ) -> list[Option]:
     metadata_df = run.steps.get_step_output(
         output_key=output_key, instance_identifier=instance_identifier
@@ -43,7 +44,8 @@ def get_choices_for_metadata(
         if include_sample
         else [column for column in metadata_df.columns.unique() if column != "Sample"]
     )
-    return to_choices(columns)
+
+    return to_choices(columns, required=required)
 
 
 def get_choices_for_groups(

diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py
@@ -213,6 +213,11 @@ class DimensionReductionMetric(Enum):
     cosine = "cosine"
 
 
+class DimensionReductionValue(Enum):
+    sample = "Sample"
+    protein = "Protein ID"
+
+
 class DataAnalysisStep(Step, ABC):
     section = Section.DATA_ANALYSIS
 
@@ -899,6 +904,12 @@ def create_form(self):
                     name="metadata_column",
                     label="Choose the column of the metadata dataframe that should be used for coloring",
                 ),
+                DropdownField(
+                    name="sample_name",
+                    label="Choose the column that contains the sample information",
+                    value=DimensionReductionValue.sample.value,
+                    options=DimensionReductionValue,
+                ),
             ],
         )
 
@@ -914,6 +925,7 @@ def modify_form(self, run: Run) -> None:
                     run,
                     instance_identifier=metadata_source,
                     include_sample=False,
+                    required=False,
                     output_key=source_handle,
                 )
             )
@@ -1993,6 +2005,11 @@ def create_form(self):
                     label="Distance metric",
                     options=DimensionReductionMetric,
                 ),
+                DropdownField(
+                    name="value",
+                    label="Values for dimension reduction",
+                    options=DimensionReductionValue,
+                ),
                 NumberField(
                     name="random_state",
                     label="Seed for random number generation",
@@ -2065,6 +2082,11 @@ def create_form(self):
                     label="Distance metric",
                     options=DimensionReductionMetric,
                 ),
+                DropdownField(
+                    name="value",
+                    label="Values for dimension reduction",
+                    options=DimensionReductionValue,
+                ),
                 NumberField(
                     name="random_state",
                     label="Seed for random number generation",

diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py
@@ -3,7 +3,12 @@
 from backend.protzilla.utilities.utilities import default_intensity_column
 
 
-def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
+def long_to_wide(
+    intensity_df: pd.DataFrame,
+    index: str = "Sample",
+    columns: str = "Protein ID",
+    value_name: str | None = None,
+):
     """
     This function transforms the dataframe to a wide format that
     can be more easily handled by packages such as sklearn.
@@ -12,6 +17,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     :param intensity_df: the dataframe that should be transformed into
         long format
         :type intensity_df: pd.DataFrame
+    :param index: the column that should be used as index in the wide format
+        dataframe. This should be either "Sample" or "Protein ID".
+    :type index: str
+    :param columns: the column that should be used as columns in the wide format
+        dataframe. This should be either "Sample" or "Protein ID".
+    :type columns: str
+    :param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used.
+    :type value_name: str | None
 
     :return: returns dataframe in wide format suitable for use by
         packages such as sklearn
@@ -20,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
     values_name = (
         default_intensity_column(intensity_df) if value_name is None else value_name
     )
-    return pd.pivot(
-        intensity_df, index="Sample", columns="Protein ID", values=values_name
-    )
+    return pd.pivot(intensity_df, index=index, columns=columns, values=values_name)
 
 
 def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):

diff --git a/backend/protzilla/utilities/utilities.py b/backend/protzilla/utilities/utilities.py
@@ -86,7 +86,7 @@ def default_intensity_column(
         return matched_columns[0]
 
     raise ValueError(
-        "No intensity column name provided and no default intensity column could be determined."
+        "No intensity column name provided and no default intensity column could be determined. "
         "Please provide the intensity column name manually to the function call."
     )