-
Notifications
You must be signed in to change notification settings - Fork 0
206 dimensionality reduction on proteins #390
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: dev
Are you sure you want to change the base?
Changes from all commits
ef746c1
de1f012
5470a5a
06e708b
c0dc7da
215ec10
953b986
156274c
46f5cee
1c28001
5e23b4e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,7 @@ def scatter_plot( | |
| input_df: pd.DataFrame, | ||
| metadata_df: pd.DataFrame | None = None, | ||
| metadata_column: str | None = None, | ||
| sample_name: str = "Sample", | ||
| ) -> dict: | ||
| """ | ||
| Function to create a scatter plot from data. | ||
|
|
@@ -49,6 +50,7 @@ def scatter_plot( | |
| :param metadata_column: the name of the column in `metadata_df` that contains the | ||
| group information for each sample. This parameter is required if `metadata_df` | ||
| is provided. | ||
| :param sample_name: the name of the sample column, should be the same name for the input_df and the metadata_df | ||
|
|
||
| :return: returns a dictionary containing a list with a plotly figure and/or a list of messages | ||
| """ | ||
|
|
@@ -58,19 +60,24 @@ def scatter_plot( | |
| "The column selected for annotation is not present in the corresponding metadata dataframe.", | ||
| ) | ||
|
|
||
| if sample_name not in input_df.columns: | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Would like to see a test that checks the raising of this `ValueError`. |
||
| raise ValueError( | ||
| f"The column {sample_name} selected for annotation is not present in the corresponding input dataframe.", | ||
| ) | ||
|
|
||
| intensity_df = input_df.copy() | ||
| if isinstance(metadata_df, pd.DataFrame): | ||
| intensity_df = pd.merge( | ||
| intensity_df, | ||
| metadata_df[["Sample", metadata_column]], | ||
| on="Sample", | ||
| metadata_df[[sample_name, metadata_column]], | ||
| on=sample_name, | ||
| how="left", | ||
| ) | ||
| else: | ||
| # Mock a metadata column here so that we can treat dfs with and without metadata the same way | ||
| metadata_column = "mock_metadata_column" | ||
| intensity_df[metadata_column] = None | ||
| intensity_df = intensity_df.drop(columns="Sample") | ||
| intensity_df = intensity_df.drop(columns=sample_name) | ||
|
|
||
| color_col = ( | ||
| metadata_column if intensity_df[metadata_column].notnull().any() else None | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -213,6 +213,11 @@ class DimensionReductionMetric(Enum): | |
| cosine = "cosine" | ||
|
|
||
|
|
||
| class DimensionReductionValue(Enum): | ||
| sample = "Sample" | ||
| protein = "Protein ID" | ||
|
|
||
|
|
||
| class DataAnalysisStep(Step, ABC): | ||
| section = Section.DATA_ANALYSIS | ||
|
|
||
|
|
@@ -899,6 +904,12 @@ def create_form(self): | |
| name="metadata_column", | ||
| label="Choose the column of the metadata dataframe that should be used for coloring", | ||
| ), | ||
| DropdownField( | ||
| name="sample_name", | ||
| label="Choose the column that contains the sample information", | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Calling it sample and the variable `sample_name` … |
||
| value=DimensionReductionValue.sample.value, | ||
| options=DimensionReductionValue, | ||
| ), | ||
| ], | ||
| ) | ||
|
|
||
|
|
@@ -914,6 +925,7 @@ def modify_form(self, run: Run) -> None: | |
| run, | ||
| instance_identifier=metadata_source, | ||
| include_sample=False, | ||
| required=False, | ||
| output_key=source_handle, | ||
| ) | ||
| ) | ||
|
|
@@ -1993,6 +2005,11 @@ def create_form(self): | |
| label="Distance metric", | ||
| options=DimensionReductionMetric, | ||
| ), | ||
| DropdownField( | ||
| name="value", | ||
| label="Values for dimension reduction", | ||
| options=DimensionReductionValue, | ||
| ), | ||
| NumberField( | ||
| name="random_state", | ||
| label="Seed for random number generation", | ||
|
|
@@ -2065,6 +2082,11 @@ def create_form(self): | |
| label="Distance metric", | ||
| options=DimensionReductionMetric, | ||
| ), | ||
| DropdownField( | ||
| name="value", | ||
| label="Values for dimension reduction", | ||
| options=DimensionReductionValue, | ||
| ), | ||
| NumberField( | ||
| name="random_state", | ||
| label="Seed for random number generation", | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -3,7 +3,12 @@ | |
| from backend.protzilla.utilities.utilities import default_intensity_column | ||
|
|
||
|
|
||
| def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): | ||
| def long_to_wide( | ||
| intensity_df: pd.DataFrame, | ||
| index: str = "Sample", | ||
| columns: str = "Protein ID", | ||
| value_name: str | None = None, | ||
| ): | ||
| """ | ||
| This function transforms the dataframe to a wide format that | ||
| can be more easily handled by packages such as sklearn. | ||
|
|
@@ -12,6 +17,14 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): | |
| :param intensity_df: the dataframe that should be transformed into | ||
| long format | ||
| :type intensity_df: pd.DataFrame | ||
| :param index: the column that should be used as index in the wide format | ||
| dataframe. This should be either "Sample" or "Protein ID". | ||
| :type index: str | ||
| :param columns: the column that should be used as columns in the wide format | ||
| dataframe. This should be either "Sample" or "Protein ID". | ||
| :type columns: str | ||
| :param value_name: the name of the column that contains the values in the long format dataframe. If None, the default intensity column will be used. | ||
| :type value_name: str | None | ||
|
|
||
| :return: returns dataframe in wide format suitable for use by | ||
| packages such as sklearn | ||
|
|
@@ -20,9 +33,7 @@ def long_to_wide(intensity_df: pd.DataFrame, value_name: str | None = None): | |
| values_name = ( | ||
| default_intensity_column(intensity_df) if value_name is None else value_name | ||
| ) | ||
| return pd.pivot( | ||
| intensity_df, index="Sample", columns="Protein ID", values=values_name | ||
| ) | ||
| return pd.pivot(intensity_df, index=index, columns=columns, values=values_name) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should add some guardrails to make sure that |
||
|
|
||
|
|
||
| def wide_to_long(wide_df: pd.DataFrame, original_long_df: pd.DataFrame): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See comment above, I feel like `sample_name` is misleading. However, it seems like it is only for metadata processing. Since metadata is enforced to include a "Sample" column (different problem), and using "Protein ID" together with metadata will lead to errors anyway, one could probably also revert the whole `sample_name` completely.