Skip to content
Open
25 changes: 19 additions & 6 deletions backend/protzilla/data_analysis/dimension_reduction.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def t_sne(
n_components: int = 2,
perplexity: float = 30.0,
metric: str = "euclidean",
value: str = "Sample",
random_state: int = 42,
max_iter: int = 1000,
n_iter_without_progress: int = 300,
Expand All @@ -38,6 +39,8 @@ def t_sne(
:param metric: The metric to use when calculating distance between instances in a
feature array. Possible metrics are: euclidean, manhattan, cosine and haversine
:type metric: str
:param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction.
:type value: str
:param random_state: determines the random number generator.
:type random_state: int
:param max_iter: maximum number of iterations for the optimization
Expand All @@ -59,9 +62,11 @@ def t_sne(
"""

input_df = protein_df

columns = "Protein ID" if value == "Sample" else "Sample"
intensity_df_wide = (
long_to_wide(input_df) if is_long_format(input_df) else input_df.copy()
long_to_wide(input_df, index=value, columns=columns)
if is_long_format(input_df)
else input_df.copy()
)
if intensity_df_wide.isnull().sum().any():
raise ValueError(
Expand All @@ -70,8 +75,8 @@ def t_sne(
)
if perplexity >= intensity_df_wide.shape[0]:
raise ValueError(
"Perplexity must be less than the number of samples. In the selected dataframe there "
f"is {intensity_df_wide.shape[0]} samples"
f"Perplexity must be less than the number of {value}s. In the selected dataframe there "
f"are {intensity_df_wide.shape[0]} {value}s."
)
if (
min(intensity_df_wide.shape[0], intensity_df_wide.shape[1]) <= n_components
Expand All @@ -80,7 +85,7 @@ def t_sne(
raise ValueError(
"The number of dimensions of the embedded space must be between 1 and "
f"{min(intensity_df_wide.shape[0], intensity_df_wide.shape[1])} (the smaller one of number of "
"samples/features). "
f"{value}s/features). "
)
if n_components > 3 and method == TSNEMethod.barnes_hut.value:
raise ValueError(
Expand Down Expand Up @@ -112,6 +117,7 @@ def umap(
n_components: int = 2,
min_dist: float = 0.1,
metric: str = "euclidean",
value: str = "Sample",
random_state: int = 42,
transform_seed: int = 42,
):
Expand All @@ -138,6 +144,8 @@ def umap(
:param metric: The metric to use when calculating distance between instances in a
feature array.
:type metric: str
:param value: whether to use the "Sample" or the "Protein ID" as values for dimension reduction.
:type value: str
:param random_state: determines the random number generator.
:type random_state: int
:param transform_seed: Random seed used for the stochastic aspects of the transform
Expand All @@ -155,7 +163,12 @@ def umap(

input_df = protein_df

intensity_df_wide = long_to_wide(input_df) if is_long_format(input_df) else input_df
columns = "Protein ID" if value == "Sample" else "Sample"
intensity_df_wide = (
long_to_wide(input_df, index=value, columns=columns)
if is_long_format(input_df)
else input_df
)
if intensity_df_wide.isnull().sum().any():
raise ValueError(
"UMAP does not accept missing values encoded as NaN. Consider preprocessing your data to remove NaN "
Expand Down
13 changes: 10 additions & 3 deletions backend/protzilla/data_analysis/plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def scatter_plot(
input_df: pd.DataFrame,
metadata_df: pd.DataFrame | None = None,
metadata_column: str | None = None,
sample_name: str = "Sample",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See my comment above: I feel like the name sample_name is misleading.
However, it seems to be used only for metadata processing. Since the metadata is required to include a "Sample" column (a separate problem), and using "Protein ID" together with metadata will lead to errors anyway, one could probably also remove the sample_name parameter entirely.

) -> dict:
"""
Function to create a scatter plot from data.
Expand All @@ -49,6 +50,7 @@ def scatter_plot(
:param metadata_column: the name of the column in `metadata_df` that contains the
group information for each sample. This parameter is required if `metadata_df`
is provided.
:param sample_name: the name of the sample column, should be the same name for the input_df and the metadata_df

:return: returns a dictionary containing a list with a plotly figure and/or a list of messages
"""
Expand All @@ -58,19 +60,24 @@ def scatter_plot(
"The column selected for annotation is not present in the corresponding metadata dataframe.",
)

if sample_name not in input_df.columns:
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would like to see a test that checks that this ValueError is raised.

raise ValueError(
f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.",
)

intensity_df = input_df.copy()
if isinstance(metadata_df, pd.DataFrame):
intensity_df = pd.merge(
intensity_df,
metadata_df[["Sample", metadata_column]],
on="Sample",
metadata_df[[sample_name, metadata_column]],
on=sample_name,
how="left",
)
else:
# Mock a metadata column here so that we can treat dfs with and without metadata the same way
metadata_column = "mock_metadata_column"
intensity_df[metadata_column] = None
intensity_df = intensity_df.drop(columns="Sample")
intensity_df = intensity_df.drop(columns=sample_name)

color_col = (
metadata_column if intensity_df[metadata_column].notnull().any() else None
Expand Down
4 changes: 3 additions & 1 deletion backend/protzilla/data_analysis/ptm_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def ptms_per_protein_and_sample(psm_df: pd.DataFrame) -> dict:
modification_df = modification_df[["Sample", "Protein ID", "Modifications"]]

modification_df = (
long_to_wide(modification_df, "Modifications").fillna("").reset_index()
long_to_wide(modification_df, value_name="Modifications")
.fillna("")
.reset_index()
)

return dict(ptm_df=modification_df)
Expand Down
4 changes: 3 additions & 1 deletion backend/protzilla/form_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ def get_choices_for_metadata(
instance_identifier: StepID,
output_key: DataKey,
include_sample: bool = True,
required: bool = False,
) -> list[Option]:
metadata_df = run.steps.get_step_output(
output_key=output_key, instance_identifier=instance_identifier
Expand All @@ -43,7 +44,8 @@ def get_choices_for_metadata(
if include_sample
else [column for column in metadata_df.columns.unique() if column != "Sample"]
)
return to_choices(columns)

return to_choices(columns, required=required)


def get_choices_for_groups(
Expand Down
22 changes: 22 additions & 0 deletions backend/protzilla/methods/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,11 @@ class DimensionReductionMetric(Enum):
cosine = "cosine"


# Selectable options for which long-format column ("Sample" or "Protein ID")
# a dimension-reduction step operates on; offered as the choices of the
# dropdown fields that pass `options=DimensionReductionValue`.
class DimensionReductionValue(Enum):
# Member values are the literal dataframe column names.
sample = "Sample"
protein = "Protein ID"


class DataAnalysisStep(Step, ABC):
section = Section.DATA_ANALYSIS

Expand Down Expand Up @@ -899,6 +904,12 @@ def create_form(self):
name="metadata_column",
label="Choose the column of the metadata dataframe that should be used for coloring",
),
DropdownField(
name="sample_name",
label="Choose the column that contains the sample information",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling it "sample" in the label and naming the variable sample_name might confuse the user: both are too close to the actual "Sample" column, and it might not be clear that "Protein ID" could also be a valid "sample column" in this case.

value=DimensionReductionValue.sample.value,
options=DimensionReductionValue,
),
],
)

Expand All @@ -914,6 +925,7 @@ def modify_form(self, run: Run) -> None:
run,
instance_identifier=metadata_source,
include_sample=False,
required=False,
output_key=source_handle,
)
)
Expand Down Expand Up @@ -1993,6 +2005,11 @@ def create_form(self):
label="Distance metric",
options=DimensionReductionMetric,
),
DropdownField(
name="value",
label="Values for dimension reduction",
options=DimensionReductionValue,
),
NumberField(
name="random_state",
label="Seed for random number generation",
Expand Down Expand Up @@ -2065,6 +2082,11 @@ def create_form(self):
label="Distance metric",
options=DimensionReductionMetric,
),
DropdownField(
name="value",
label="Values for dimension reduction",
options=DimensionReductionValue,
),
NumberField(
name="random_state",
label="Seed for random number generation",
Expand Down
19 changes: 15 additions & 4 deletions backend/protzilla/utilities/transform_dfs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@
from backend.protzilla.utilities.utilities import default_intensity_column


def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
def long_to_wide(
intensity_df: pd.DataFrame,
index: str = "Sample",
columns: str = "Protein ID",
value_name: str | None = None,
):
"""
This function transforms the dataframe to a wide format that
can be more easily handled by packages such as sklearn.
Expand All @@ -12,6 +17,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
:param intensity_df: the dataframe that should be transformed into
wide format
:type intensity_df: pd.DataFrame
:param index: the column that should be used as index in the wide format
dataframe. This should be either "Sample" or "Protein ID".
:type index: str
:param columns: the column that should be used as columns in the wide format
dataframe. This should be either "Sample" or "Protein ID".
:type columns: str
:param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used.
:type value_name: str | None

:return: returns dataframe in wide format suitable for use by
packages such as sklearn
Expand All @@ -20,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None):
values_name = (
default_intensity_column(intensity_df) if value_name is None else value_name
)
return pd.pivot(
intensity_df, index="Sample", columns="Protein ID", values=values_name
)
return pd.pivot(intensity_df, index=index, columns=columns, values=values_name)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should add some guardrails to make sure that index and columns are not the same, because passing the same column for both will lead to a cryptic error message. (This should ideally never happen, but if I had a penny for every time I didn't check assumptions because they could never possibly happen...)



def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame):
Expand Down
2 changes: 1 addition & 1 deletion backend/protzilla/utilities/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def default_intensity_column(
return matched_columns[0]

raise ValueError(
"No intensity column name provided and no default intensity column could be determined."
"No intensity column name provided and no default intensity column could be determined. "
"Please provide the intensity column name manually to the function call."
)

Expand Down
Loading
Loading