diff --git a/backend/protzilla/data_analysis/dimension_reduction.py b/backend/protzilla/data_analysis/dimension_reduction.py index 7c831c53b..9c862e32b 100644 --- a/backend/protzilla/data_analysis/dimension_reduction.py +++ b/backend/protzilla/data_analysis/dimension_reduction.py @@ -17,6 +17,7 @@ def t_sne( n_components: int = 2, perplexity: float = 30.0, metric: str = "euclidean", + value: str = "Sample", random_state: int = 42, max_iter: int = 1000, n_iter_without_progress: int = 300, @@ -38,6 +39,8 @@ def t_sne( :param metric: The metric to use when calculating distance between instances in a feature array. Possible metrics are: euclidean, manhattan, cosine and haversine :type metric: str + :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction. + :type value: str :param random_state: determines the random number generator. :type random_state: int :param max_iter: maximum number of iterations for the optimization @@ -59,9 +62,11 @@ def t_sne( """ input_df = protein_df - + columns = "Protein ID" if value == "Sample" else "Sample" intensity_df_wide = ( - long_to_wide(input_df) if is_long_format(input_df) else input_df.copy() + long_to_wide(input_df, index=value, columns=columns) + if is_long_format(input_df) + else input_df.copy() ) if intensity_df_wide.isnull().sum().any(): raise ValueError( @@ -70,8 +75,8 @@ def t_sne( ) if perplexity >= intensity_df_wide.shape[0]: raise ValueError( - "Perplexity must be less than the number of samples. In the selected dataframe there " - f"is {intensity_df_wide.shape[0]} samples" + f"Perplexity must be less than the number of {value}s. In the selected dataframe there " + f"are {intensity_df_wide.shape[0]} {value}s." ) if ( min(intensity_df_wide.shape[0], intensity_df_wide.shape[1]) <= n_components @@ -80,7 +85,7 @@ def t_sne( raise ValueError( "The number of dimensions of the embedded space must be between 1 and " f"{min(intensity_df_wide.shape[0], intensity_df_wide.shape[1])} (the smaller one of number of " - "samples/features). " + f"{value}s/features). " ) if n_components > 3 and method == TSNEMethod.barnes_hut.value: raise ValueError( @@ -112,6 +117,7 @@ def umap( n_components: int = 2, min_dist: float = 0.1, metric: str = "euclidean", + value: str = "Sample", random_state: int = 42, transform_seed: int = 42, ): @@ -138,6 +144,8 @@ def umap( :param metric: The metric to use when calculating distance between instances in a feature array. :type metric: str + :param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction. + :type value: str :param random_state: determines the random number generator. :type random_state: int :param transform_seed: Random seed used for the stochastic aspects of the transform @@ -155,7 +163,12 @@ def umap( input_df = protein_df - intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df + columns = "Protein ID" if value == "Sample" else "Sample" + intensity_df_wide = ( + long_to_wide(input_df, index=value, columns=columns) + if is_long_format(input_df) + else input_df + ) if intensity_df_wide.isnull().sum().any(): raise ValueError( "UMAP does not accept missing values encoded as NaN. Consider preprocessing your data to remove NaN " diff --git a/backend/protzilla/data_analysis/plots.py b/backend/protzilla/data_analysis/plots.py index 06f3469cd..e93ca2c28 100644 --- a/backend/protzilla/data_analysis/plots.py +++ b/backend/protzilla/data_analysis/plots.py @@ -38,6 +38,7 @@ def scatter_plot( input_df: pd.DataFrame, metadata_df: pd.DataFrame | None = None, metadata_column: str | None = None, + sample_name: str = "Sample", ) -> dict: """ Function to create a scatter plot from data. @@ -49,6 +50,7 @@ def scatter_plot( :param metadata_column: the name of the column in `metadata_df` that contains the group information for each sample. This parameter is required if `metadata_df` is provided. + :param sample_name: the name of the sample column, should be the same name for the input_df and the metadata_df :return: returns a dictionary containing a list with a plotly figure and/or a list of messages """ @@ -58,19 +60,24 @@ def scatter_plot( "The column selected for annotation is not present in the corresponding metadata dataframe.", ) + if sample_name not in input_df.columns: + raise ValueError( + f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.", + ) + intensity_df = input_df.copy() if isinstance(metadata_df, pd.DataFrame): intensity_df = pd.merge( intensity_df, - metadata_df[["Sample", metadata_column]], - on="Sample", + metadata_df[[sample_name, metadata_column]], + on=sample_name, how="left", ) else: # Mock a metadata column here so that we can treat dfs with and without metadata the same way metadata_column = "mock_metadata_column" intensity_df[metadata_column] = None - intensity_df = intensity_df.drop(columns="Sample") + intensity_df = intensity_df.drop(columns=sample_name) color_col = ( metadata_column if intensity_df[metadata_column].notnull().any() else None diff --git a/backend/protzilla/data_analysis/ptm_analysis.py b/backend/protzilla/data_analysis/ptm_analysis.py index 7174e8a09..860e7d6a7 100644 --- a/backend/protzilla/data_analysis/ptm_analysis.py +++ b/backend/protzilla/data_analysis/ptm_analysis.py @@ -48,7 +48,9 @@ def ptms_per_protein_and_sample(psm_df: pd.DataFrame) -> dict: modification_df = modification_df[["Sample", "Protein ID", "Modifications"]] modification_df = ( - long_to_wide(modification_df, "Modifications").fillna("").reset_index() + long_to_wide(modification_df, value_name="Modifications") + .fillna("") + .reset_index() ) return dict(ptm_df=modification_df) diff --git a/backend/protzilla/form_helper.py b/backend/protzilla/form_helper.py index 1367b3e8c..6b549964f 100644 --- a/backend/protzilla/form_helper.py +++ b/backend/protzilla/form_helper.py @@ -32,6 +32,7 @@ def get_choices_for_metadata( instance_identifier: StepID, output_key: DataKey, include_sample: bool = True, + required: bool = False, ) -> list[Option]: metadata_df = run.steps.get_step_output( output_key=output_key, instance_identifier=instance_identifier @@ -43,7 +44,8 @@ def get_choices_for_metadata( if include_sample else [column for column in metadata_df.columns.unique() if column != "Sample"] ) - return to_choices(columns) + + return to_choices(columns, required=required) def get_choices_for_groups( diff --git a/backend/protzilla/methods/data_analysis.py b/backend/protzilla/methods/data_analysis.py index d95851d70..e074da2f3 100644 --- a/backend/protzilla/methods/data_analysis.py +++ b/backend/protzilla/methods/data_analysis.py @@ -213,6 +213,11 @@ class DimensionReductionMetric(Enum): cosine = "cosine" +class DimensionReductionValue(Enum): + sample = "Sample" + protein = "Protein ID" + + class DataAnalysisStep(Step, ABC): section = Section.DATA_ANALYSIS @@ -899,6 +904,12 @@ def create_form(self): name="metadata_column", label="Choose the column of the metadata dataframe that should be used for coloring", ), + DropdownField( + name="sample_name", + label="Choose the column that contains the sample information", + value=DimensionReductionValue.sample.value, + options=DimensionReductionValue, + ), ], ) @@ -914,6 +925,7 @@ def modify_form(self, run: Run) -> None: run, instance_identifier=metadata_source, include_sample=False, + required=False, output_key=source_handle, ) ) @@ -1993,6 +2005,11 @@ def create_form(self): label="Distance metric", options=DimensionReductionMetric, ), + DropdownField( + name="value", + label="Values for dimension reduction", + options=DimensionReductionValue, + ), NumberField( name="random_state", label="Seed for random number generation", @@ -2065,6 +2082,11 @@ def create_form(self): label="Distance metric", options=DimensionReductionMetric, ), + DropdownField( + name="value", + label="Values for dimension reduction", + options=DimensionReductionValue, + ), NumberField( name="random_state", label="Seed for random number generation", diff --git a/backend/protzilla/utilities/transform_dfs.py b/backend/protzilla/utilities/transform_dfs.py index 4be52b1d5..b8552313d 100644 --- a/backend/protzilla/utilities/transform_dfs.py +++ b/backend/protzilla/utilities/transform_dfs.py @@ -3,7 +3,12 @@ from backend.protzilla.utilities.utilities import default_intensity_column -def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): +def long_to_wide( + intensity_df: pd.DataFrame, + index: str = "Sample", + columns: str = "Protein ID", + value_name: str | None = None, +): """ This function transforms the dataframe to a wide format that can be more easily handled by packages such as sklearn. @@ -12,6 +17,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): :param intensity_df: the dataframe that should be transformed into long format :type intensity_df: pd.DataFrame + :param index: the column that should be used as index in the wide format + dataframe. This should be either "Sample" or "Protein ID". + :type index: str + :param columns: the column that should be used as columns in the wide format + dataframe. This should be either "Sample" or "Protein ID". + :type columns: str + :param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used. + :type value_name: str | None :return: returns dataframe in wide format suitable for use by packages such as sklearn @@ -20,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): values_name = ( default_intensity_column(intensity_df) if value_name is None else value_name ) - return pd.pivot( - intensity_df, index="Sample", columns="Protein ID", values=values_name - ) + return pd.pivot(intensity_df, index=index, columns=columns, values=values_name) def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): diff --git a/backend/protzilla/utilities/utilities.py b/backend/protzilla/utilities/utilities.py index 58d7f4966..52951d67b 100644 --- a/backend/protzilla/utilities/utilities.py +++ b/backend/protzilla/utilities/utilities.py @@ -86,7 +86,7 @@ def default_intensity_column( return matched_columns[0] raise ValueError( - "No intensity column name provided and no default intensity column could be determined." + "No intensity column name provided and no default intensity column could be determined. " "Please provide the intensity column name manually to the function call." ) diff --git a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py index 8941771cf..f53106dcd 100644 --- a/backend/tests/protzilla/data_analysis/test_dimension_reduction.py +++ b/backend/tests/protzilla/data_analysis/test_dimension_reduction.py @@ -109,12 +109,14 @@ def metadata_df(): def check_dimensionality_reduction_output( - out_df: pd.DataFrame, orig_df: pd.DataFrame, n_components: int + out_df: pd.DataFrame, + orig_df: pd.DataFrame, + n_components: int, + value: str = "Sample", ): assert ( - out_df.shape == (orig_df["Sample"].nunique(), n_components + 1) - and out_df["Sample"].sort_values().tolist() - == sorted(orig_df["Sample"].unique()) + out_df.shape == (orig_df[value].nunique(), n_components + 1) + and out_df[value].sort_values().tolist() == sorted(orig_df[value].unique()) and all( ( pd.api.types.is_numeric_dtype(out_df[f"Component{i + 1}"]) @@ -169,8 +171,8 @@ def test_tsne_nan_handling(df_with_nan): def test_tsne_perplexity(dimension_reduction_df): with pytest.raises( ValueError, - match="Perplexity must be less than the number of samples. In the selected dataframe there " - f"is {dimension_reduction_df['Sample'].nunique()} samples", + match="Perplexity must be less than the number of Samples. In the selected dataframe there " + f"are {dimension_reduction_df['Sample'].nunique()} Samples", ): _ = t_sne( dimension_reduction_df, @@ -250,6 +252,85 @@ def test_tsne_scatter_plot_integration( ) +@pytest.mark.parametrize( + "df_name,n_components,method", + [ + ("dimension_reduction_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_df", 2, TSNEMethod.barnes_hut.value), + ("dimension_reduction_four_proteins_df", 2, TSNEMethod.barnes_hut.value), + ], +) +def test_tsne_protein_id_value(df_name, n_components, method, request): + df = request.getfixturevalue(df_name) + out = t_sne( + df, + method=method, + n_components=n_components, + perplexity=2, + value="Protein ID", + random_state=42, + ) + check_dimensionality_reduction_output( + out["embedded_data"], df, n_components, value="Protein ID" + ) + + +@pytest.mark.parametrize( + "df_name,n_components,method", + [ + ("dimension_reduction_df", 2, TSNEMethod.exact.value), + ("dimension_reduction_four_proteins_df", 2, TSNEMethod.exact.value), + ], +) +def test_tsne_metrics_protein_id(df_name, n_components, method, request): + for metric in DimensionReductionMetric: + df = request.getfixturevalue(df_name) + current_out = t_sne( + df, + method=method, + n_components=n_components, + perplexity=2, + metric=metric.value, + value="Protein ID", + random_state=42, + ) + check_dimensionality_reduction_output( + current_out["embedded_data"], df, n_components, value="Protein ID" + ) + + +def test_tsne_perplexity_protein_id(dimension_reduction_df): + with pytest.raises( + ValueError, + match="Perplexity must be less than the number of Protein IDs. In the selected dataframe there " + f"are {dimension_reduction_df['Protein ID'].nunique()} Protein IDs", + ): + _ = t_sne( + dimension_reduction_df, + method=TSNEMethod.barnes_hut.value, + n_components=2, + perplexity=5, + value="Protein ID", + ) + + +def test_tsne_n_components_protein_id(dimension_reduction_df): + with pytest.raises( + ValueError, + match="The number of dimensions of the embedded space must be between 1 and " + f"{min(dimension_reduction_df['Protein ID'].nunique(), dimension_reduction_df['Sample'].nunique())}", + ): + _ = t_sne( + dimension_reduction_df, + method="exact", + n_components=5, + perplexity=2, + value="Protein ID", + random_state=42, + ) + + @pytest.mark.parametrize( "n_components", [2, 3], @@ -271,6 +352,26 @@ def test_umap(dimension_reduction_df, n_components): ) +@pytest.mark.parametrize("n_components", [2]) +def test_umap_protein_id_value(dimension_reduction_four_proteins_df, n_components): + for metric in DimensionReductionMetric: + current_out = umap( + dimension_reduction_four_proteins_df, + n_components=n_components, + metric=metric.value, + n_neighbors=3, + value="Protein ID", + random_state=42, + transform_seed=42, + ) + check_dimensionality_reduction_output( + current_out["embedded_data"], + dimension_reduction_four_proteins_df, + n_components, + value="Protein ID", + ) + + def test_umap_nan_handling(df_with_nan): with pytest.raises( ValueError, diff --git a/backend/tests/protzilla/test_transform_dfs.py b/backend/tests/protzilla/test_transform_dfs.py index 970fbeda2..97ef31030 100644 --- a/backend/tests/protzilla/test_transform_dfs.py +++ b/backend/tests/protzilla/test_transform_dfs.py @@ -90,6 +90,16 @@ def test_transform_long_to_wide(transform_df_long, transform_df_wide): pd.testing.assert_frame_equal(long_to_wide(transform_df_long), transform_df_wide) +def test_transform_long_to_wide_protein_id_as_index( + transform_df_long, transform_df_wide +): + result = long_to_wide(transform_df_long, index="Protein ID", columns="Sample") + expected = transform_df_wide.T + expected.index.name = "Protein ID" + expected.columns.name = "Sample" + pd.testing.assert_frame_equal(result, expected) + + def test_transform_long_to_wide_to_long( transform_df_long, transform_df_wide, transform_df_long_gene_name_provider ):