diff --git a/protzilla/importing/ms_data_import.py b/protzilla/importing/ms_data_import.py index c3d9136f..002ff823 100644 --- a/protzilla/importing/ms_data_import.py +++ b/protzilla/importing/ms_data_import.py @@ -11,7 +11,10 @@ def max_quant_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in ["Intensity", "iBAQ", "LFQ intensity"] try: @@ -34,15 +37,28 @@ def max_quant_import( c[len(intensity_name) + 1 :] for c in intensity_df.columns ] intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid Max Quant file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def ms_fragger_import( - file_path: str, intensity_name: str, map_to_uniprot=False, aggregation_method: str ="Sum" + file_path: str, + intensity_name: str, + map_to_uniprot=False, + aggregation_method: str = "Sum", ) -> dict: assert intensity_name in [ "Intensity", @@ -87,13 +103,25 @@ def ms_fragger_import( ) intensity_df = intensity_df.assign(**{"Protein ID": protein_groups}) - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid MS Fragger file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) -def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum") -> dict: +def diann_import( + file_path, map_to_uniprot=False, aggregation_method: str = "Sum" +) -> dict: try: df = pd.read_csv( file_path, @@ -117,14 +145,86 @@ def diann_import(file_path, map_to_uniprot=False, aggregation_method: str ="Sum" intensity_name = "Intensity" - return transform_and_clean(intensity_df, intensity_name, map_to_uniprot, aggregation_method) + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) except Exception as e: msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid DIA-NN MS file." - return dict(messages=[dict(level=logging.ERROR, msg=msg, trace=format_trace(traceback.format_exception(e)))]) + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) + + +def simple_csv_import( + file_path: str, map_to_uniprot=False, aggregation_method: str = "Sum" +) -> dict: + """ + Imports a simple CSV file with protein IDs in the first column and intensity values in the remaining columns. + + :param file_path: Path to the CSV file + :type file_path: str + :param map_to_uniprot: Whether to map protein IDs to UniProt IDs + :type map_to_uniprot: bool + :param aggregation_method: Method to aggregate duplicate protein groups ("Sum", "Mean", or "Median") + :type aggregation_method: str + :return: Dictionary containing the processed dataframe and metadata + """ + try: + df = pd.read_csv( + file_path, + sep=",", + low_memory=False, + na_values=["", 0], + keep_default_na=True, + ) + + # Check if "Protein ID" column exists + if "Protein ID" not in df.columns: + msg = "Column 'Protein ID' not found in the provided file. Please check your file format." + return dict(messages=[dict(level=logging.ERROR, msg=msg)]) + + # Get sample columns (all columns except "Protein ID") + sample_columns = [col for col in df.columns if col != "Protein ID"] + + if not sample_columns: + msg = "No sample columns found in the provided file. Please check your file format." + return dict(messages=[dict(level=logging.ERROR, msg=msg)]) + + # Create a dataframe with only the protein IDs and sample columns + intensity_df = df[["Protein ID"] + sample_columns] + + # Use a fixed intensity name for the output + intensity_name = "Intensity" + + # Pass to the common transform and clean function + return transform_and_clean( + intensity_df, intensity_name, map_to_uniprot, aggregation_method + ) + + except Exception as e: + msg = f"An error occurred while reading the file: {e.__class__.__name__} {e}. Please provide a valid CSV file." + return dict( + messages=[ + dict( + level=logging.ERROR, + msg=msg, + trace=format_trace(traceback.format_exception(e)), + ) + ] + ) def transform_and_clean( - df: pd.DataFrame, intensity_name: str, map_to_uniprot: bool, aggregation_method: str ="Sum" + df: pd.DataFrame, + intensity_name: str, + map_to_uniprot: bool, + aggregation_method: str = "Sum", ) -> dict: """ Transforms a dataframe that is read from a file in wide format into long format, @@ -158,7 +258,9 @@ def transform_and_clean( # applies the selected aggregation to duplicate protein groups, NaN if all are NaN, aggregation of numbers otherwise aggregation_method = aggregation_method.lower() agg_kwargs = {"sum": {"min_count": 1}, "median": {}, "mean": {}} - df = df.groupby("Protein ID", as_index=False).agg(aggregation_method, **agg_kwargs[aggregation_method]) + df = df.groupby("Protein ID", as_index=False).agg( + aggregation_method, **agg_kwargs[aggregation_method] + ) df = df.assign(Gene=lambda _: np.nan) # add deprecated genes column @@ -222,6 +324,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): for group in found_ids_per_group: all_ids_of_group = [] for old_id in group: + # Issue 574: ENSEMBL ids are not mapped to uniprot if uniprot_regex.search(old_id): all_ids_of_group.append(old_id) elif map_to_uniprot: @@ -230,7 +333,7 @@ def clean_protein_groups(protein_groups, map_to_uniprot=True): all_ids_of_group.extend(new_ids) else: all_ids_of_group.append(old_id) - new_groups.append(all_ids_of_group[0] if all_ids_of_group else '') + new_groups.append(all_ids_of_group[0] if all_ids_of_group else "") return new_groups, removed_protein_ids diff --git a/protzilla/methods/importing.py b/protzilla/methods/importing.py index 6a2f6835..7ab44c17 100644 --- a/protzilla/methods/importing.py +++ b/protzilla/methods/importing.py @@ -9,6 +9,7 @@ diann_import, max_quant_import, ms_fragger_import, + simple_csv_import, ) from protzilla.importing.peptide_import import peptide_import, evidence_import from protzilla.steps import Step, StepManager @@ -51,7 +52,9 @@ def method(self, inputs): class MsFraggerImport(ImportingStep): display_name = "MS Fragger Combined Protein Import" operation = "Protein Data Import" - method_description = "Import the combined_protein.tsv file form output of MS Fragger" + method_description = ( + "Import the combined_protein.tsv file form output of MS Fragger" + ) input_keys = ["file_path", "intensity_name", "map_to_uniprot", "aggregation_method"] output_keys = ["protein_df"] @@ -60,6 +63,21 @@ def method(self, inputs): return ms_fragger_import(**inputs) +class SimpleCSVImport(ImportingStep): + display_name = "Simple CSV Intensities Import" + operation = "Protein Data Import" + method_description = ( + "Import protein intensities from a csv file. The csv requires a column 'Protein ID', " + "the remaining column names should be the sample names. The values should be the intensities." + ) + + input_keys = ["file_path", "map_to_uniprot", "aggregation_method"] + output_keys = ["protein_df"] + + def method(self, inputs): + return simple_csv_import(**inputs) + + class MetadataImport(ImportingStep): display_name = "Metadata Import" operation = "metadataimport" @@ -139,4 +157,4 @@ class EvidenceImport(ImportingStep): output_keys = ["peptide_df"] def method(self, inputs): - return evidence_import(**inputs) \ No newline at end of file + return evidence_import(**inputs) diff --git a/tests/protzilla/importing/test_ms_data_import.py b/tests/protzilla/importing/test_ms_data_import.py index 457fe145..73a509c1 100644 --- a/tests/protzilla/importing/test_ms_data_import.py +++ b/tests/protzilla/importing/test_ms_data_import.py @@ -9,6 +9,117 @@ from protzilla.importing import ms_data_import +def simple_csv_import_intensity_df(): + """Create expected dataframe for simple CSV import test""" + # fmt: off + sample_data = { 'Sample': ['Sample1', 'Sample1', 'Sample1', 'Sample1', 'Sample1', 'Sample2', 'Sample2', 'Sample2', 'Sample2', 'Sample2'], + 'Protein ID': ['A2A5R2', 'A2A7S8', 'A2A863', 'A2AGT5', 'A2AJ76', 'A2A5R2', 'A2A7S8', 'A2A863', 'A2AGT5', 'A2AJ76'], + 'Gene': [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan], + 'Intensity': [18210618.0, 4133918.5, 144354336.0, 5645782.0, 9055790.0, 25468630.0, 7812505.5, 139428224.0, 3202878.8, 19467296.0]} + # fmt: on + df = pd.DataFrame(data=sample_data) + df.sort_values(by=["Sample", "Protein ID"], ignore_index=True, inplace=True) + return df + + +def test_simple_csv_import(): + """Test basic functionality of simple_csv_import""" + outputs = ms_data_import.simple_csv_import( + file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data.csv", + ) + + expected_protein_df = simple_csv_import_intensity_df() + + # Drop Gene column for comparison as it's just placeholders + expected_protein_df = expected_protein_df.drop(columns=["Gene"]) + result_protein_df = outputs["protein_df"].drop(columns=["Gene"]) + + pd.testing.assert_frame_equal(expected_protein_df, result_protein_df) + + +def test_simple_csv_import_file_not_exist(): + """Test error handling when file doesn't exist""" + outputs = ms_data_import.simple_csv_import( + file_path="non_existent_file_path", + ) + + assert "protein_df" not in outputs + assert "messages" in outputs + assert any(message["level"] == logging.ERROR for message in outputs["messages"]) + assert any("found" in message["msg"].lower() for message in outputs["messages"]) + + +def test_simple_csv_import_no_protein_id_column(): + """Test error handling when Protein ID column is missing""" + outputs = ms_data_import.simple_csv_import( + file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_no_protein_column.csv", + ) + + assert "protein_df" not in outputs + assert "messages" in outputs + assert any(message["level"] == logging.ERROR for message in outputs["messages"]) + assert any("Protein ID" in message["msg"] for message in outputs["messages"]) + + +def test_simple_csv_import_no_sample_columns(): + """Test error handling when no sample columns are present""" + outputs = ms_data_import.simple_csv_import( + file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_no_samples.csv", + ) + + assert "protein_df" not in outputs + assert "messages" in outputs + assert any(message["level"] == logging.ERROR for message in outputs["messages"]) + assert any("No sample columns" in message["msg"] for message in outputs["messages"]) + + +def test_simple_csv_import_aggregation_methods(): + """Test different aggregation methods""" + for method in ["Sum", "Mean", "Median"]: + outputs = ms_data_import.simple_csv_import( + file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_duplicates.csv", + aggregation_method=method, + ) + + assert "protein_df" in outputs + # The exact values would depend on the test data and method, + # but we can at least check that processing completes + assert outputs["protein_df"] is not None + + +def test_simple_csv_import_filters_contaminants(): + """Test that contaminant proteins are filtered""" + outputs = ms_data_import.simple_csv_import( + file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv", + ) + + protein_ids = outputs["protein_df"]["Protein ID"].unique().tolist() + + # All instances of CON__ should be filtered out + assert all( + not any(id_.startswith("CON__") for id_ in group.split(";")) + for group in protein_ids + ) + + # Check that contaminants list is not empty + assert len(outputs["contaminants"]) > 0 + + +# Issue 574: ENSEMBL ids are not mapped to uniprot. Once resolved, uncomment this +# @patch("protzilla.importing.ms_data_import.map_ids_to_uniprot") +# def test_simple_csv_import_with_mapping(ids_to_uniprot_mock): +# """Test UniProt ID mapping functionality""" +# ids_to_uniprot_mock.return_value = {"ENSP12345678901": ["P54321"]} +# +# outputs = ms_data_import.simple_csv_import( +# file_path=f"{PROJECT_PATH}/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv", +# map_to_uniprot=True) +# +# # Check that the mapped IDs are present in the output +# protein_ids = outputs["protein_df"]["Protein ID"].unique().tolist() +# assert "P54321" in protein_ids + + def ms_fragger_import_intensity_df(intensity_name): ms_fragger_list = ( ["DDM_0pt1_01", "A2A5R2", "Arfgef2"], @@ -218,7 +329,9 @@ def test_max_quant_import_no_protein_ids_column(): assert "protein_df" not in outputs assert "messages" in outputs assert any(message["level"] == logging.ERROR for message in outputs["messages"]) - assert any("Majority protein IDs" in message["msg"] for message in outputs["messages"]) + assert any( + "Majority protein IDs" in message["msg"] for message in outputs["messages"] + ) def test_max_quant_import_invalid_data(): @@ -310,9 +423,7 @@ def test_transform_and_clean(): ["C", "Q11111", np.nan], ] df = pd.DataFrame(data, columns=columns) - outputs = ms_data_import.transform_and_clean( - df, "intensity", map_to_uniprot=False - ) + outputs = ms_data_import.transform_and_clean(df, "intensity", map_to_uniprot=False) expected_df = pd.DataFrame(expected_output, columns=out_col) # we do not care about the genes column, it is deprecated (and replaced by nan) diff --git a/tests/test_data/simple_csv_data/simple_protein_data.csv b/tests/test_data/simple_csv_data/simple_protein_data.csv new file mode 100644 index 00000000..87d0e235 --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data.csv @@ -0,0 +1,6 @@ +Protein ID,Sample1,Sample2 +A2A5R2,18210618,25468630 +A2A7S8,4133918.5,7812505.5 +A2A863,144354336,139428224 +A2AGT5,5645782,3202878.8 +A2AJ76,9055790,19467296 \ No newline at end of file diff --git a/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv b/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv new file mode 100644 index 00000000..f56dcc90 --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data_contaminants.csv @@ -0,0 +1,6 @@ +Protein ID,Sample1,Sample2 +A2A5R2,18210618,25468630 +A2A7S8,4133918.5,7812505.5 +A2A863,144354336,139428224 +A2AGT5,5645782,3202878.8 +CON__A2AJ76,9055790,19467296 \ No newline at end of file diff --git a/tests/test_data/simple_csv_data/simple_protein_data_duplicates.csv b/tests/test_data/simple_csv_data/simple_protein_data_duplicates.csv new file mode 100644 index 00000000..96574c6c --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data_duplicates.csv @@ -0,0 +1,8 @@ +Protein ID,Sample1,Sample2 +A2A5R2,18210618,25468630 +A2A7S8,4133918.5,7812505.5 +A2A5R2,18210618,25468630 +A2A863,144354336,139428224 +A2AGT5,5645782,3202878.8 +A2AGT5,5645782,3202878.8 +A2AJ76,9055790,19467296 \ No newline at end of file diff --git a/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv b/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv new file mode 100644 index 00000000..6067b931 --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data_ensembl.csv @@ -0,0 +1,8 @@ +Protein ID,Sample1,Sample2 +ENSP12345678901,18210618,25468630 +A2A7S8,4133918.5,7812505.5 +A2A5R2,18210618,25468630 +A2A863,144354336,139428224 +A2AGT5,5645782,3202878.8 +A2AGT5,5645782,3202878.8 +A2AJ76,9055790,19467296 \ No newline at end of file diff --git a/tests/test_data/simple_csv_data/simple_protein_data_no_protein_column.csv b/tests/test_data/simple_csv_data/simple_protein_data_no_protein_column.csv new file mode 100644 index 00000000..3c11d1b9 --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data_no_protein_column.csv @@ -0,0 +1,6 @@ +BadlyNamedColumn,Sample1,Sample2 +A2A5R2,18210618,25468630 +A2A7S8,4133918.5,7812505.5 +A2A863,144354336,139428224 +A2AGT5,5645782,3202878.8 +A2AJ76,9055790,19467296 \ No newline at end of file diff --git a/tests/test_data/simple_csv_data/simple_protein_data_no_samples.csv b/tests/test_data/simple_csv_data/simple_protein_data_no_samples.csv new file mode 100644 index 00000000..05563230 --- /dev/null +++ b/tests/test_data/simple_csv_data/simple_protein_data_no_samples.csv @@ -0,0 +1,6 @@ +Protein ID +A2A5R2 +A2A7S8 +A2A863 +A2AGT5 +A2AJ76 \ No newline at end of file diff --git a/ui/runs/form_mapping.py b/ui/runs/form_mapping.py index 7221bba6..cbf4525c 100644 --- a/ui/runs/form_mapping.py +++ b/ui/runs/form_mapping.py @@ -17,6 +17,7 @@ importing.MaxQuantImport: importing_forms.MaxQuantImportForm, importing.DiannImport: importing_forms.DiannImportForm, importing.MsFraggerImport: importing_forms.MSFraggerImportForm, + importing.SimpleCSVImport: importing_forms.SimpleCSVImportForm, importing.MetadataImport: importing_forms.MetadataImportForm, importing.MetadataImportMethodDiann: importing_forms.MetadataImportMethodDiannForm, importing.MetadataColumnAssignment: importing_forms.MetadataColumnAssignmentForm, diff --git a/ui/runs/forms/importing.py b/ui/runs/forms/importing.py index 8975e3ba..5caea507 100644 --- a/ui/runs/forms/importing.py +++ b/ui/runs/forms/importing.py @@ -67,6 +67,18 @@ class DiannImportForm(MethodForm): ) +class SimpleCSVImportForm(MethodForm): + file_path = CustomFileField(label="CSV file containing the intensities") + map_to_uniprot = CustomBooleanField( + label="Map to Uniprot IDs using Biomart (online)", required=False + ) + aggregation_method = CustomChoiceField( + choices=AggregationMethods, + label="Aggregation method used to aggregate duplicate values for protein groups", + initial="Sum", + ) + + class MSFraggerImportForm(MethodForm): file_path = CustomFileField( label="MSFragger intensities file (combined_proteins.tsv)"