From 7dacab12482c980d8bd1b61df08289e6b7d2e463 Mon Sep 17 00:00:00 2001
From: 3dot141592 <robin.luepke@gmail.com>
Date: Wed, 22 Apr 2026 09:11:48 +0200
Subject: [PATCH 1/2] test: reimplement example dataset runner test

---
 backend/tests/protzilla/test_runner.py | 50 ++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/backend/tests/protzilla/test_runner.py b/backend/tests/protzilla/test_runner.py
index bd5be85b1..536992cb4 100644
--- a/backend/tests/protzilla/test_runner.py
+++ b/backend/tests/protzilla/test_runner.py
@@ -1,8 +1,10 @@
 import json
+import os
 import shutil
 from pathlib import Path
 from unittest import mock
 
+import pandas as pd
 import pytest
 import yaml
 
@@ -10,6 +12,7 @@
 from backend.protzilla.runner import _serialize_graphs
 from backend.protzilla.utilities.utilities import random_string
 from backend.tests.paths import (
+    TEST_AML_DATA_PATH,
     TEST_MSDATA_PATH,
     TEST_METADATA_PATH,
     TEST_WORKFLOWS_PATH,
@@ -487,6 +490,53 @@ def test_integration_runner(
     assert_runner_finished_successfully(runner)
 
 
+@pytest.mark.skipif(
+    os.getenv("GITHUB_ACTIONS") == "true",
+    reason="Avoid downloading the example dataset files every time CI is run",
+)
+def test_example_dataset_runner(tests_folder_name, monkeypatch):
+    name = tests_folder_name + "/test_aml_paper_integration_" + random_string()
+    runner = Runner(
+        workflow="example_dataset",
+        ms_data_path=None,
+        meta_data_path=None,
+        peptides_path=None,
+        run_name=name,
+        df_mode="memory",
+        all_plots=True,
+        verbose=False,
+    )
+
+    mock_write = mock.MagicMock()
+    monkeypatch.setattr(runner.run, "_run_write", mock_write)
+    mock_plot_safe = mock.MagicMock()
+    monkeypatch.setattr(runner, "_save_plots_html", mock_plot_safe)
+    runner.compute_workflow()
+    assert_runner_finished_successfully(runner)
+
+    preprocessing_output_df = runner.run.steps.get_step_output(
+        output_key="protein_df",
+        instance_identifier="s00020_FilterProteinsByNumberOfValuesPerGroup",
+    )
+
+    assert len(preprocessing_output_df["Protein ID"].unique()) == 5309
+
+    protein_list = pd.read_csv(TEST_AML_DATA_PATH / "preprocessed_protein_list.csv")
+
+    # Account for different secondary protein IDs by comparing the leading entry.
+    protein_list_1 = protein_list["Protein IDs"].str.split(";").str[0]
+    preprocessing_output_df_1 = (
+        preprocessing_output_df["Protein ID"].str.split(";").str[0].unique()
+    )
+    assert set(protein_list_1) == set(preprocessing_output_df_1)
+
+    significant_protein_df = runner.run.steps.get_step_output(
+        output_key="significant_proteins_df",
+        instance_identifier="s00022_DifferentialExpressionTTest",
+    )
+    assert significant_protein_df["Protein ID"].nunique() == 359
+
+
 @pytest.mark.parametrize(
     "mock_workflow,ms_data_file_path,metadata_file_path",
     [

From 72456c6e87674164a7a677412fd25f0aa695a924 Mon Sep 17 00:00:00 2001
From: 3dot141592 <robin.luepke@gmail.com>
Date: Wed, 22 Apr 2026 09:30:24 +0200
Subject: [PATCH 2/2] test: pass Runner arguments as an unpacked dict and reword comment

---
 backend/tests/protzilla/test_runner.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/backend/tests/protzilla/test_runner.py b/backend/tests/protzilla/test_runner.py
index 536992cb4..c23cedc5d 100644
--- a/backend/tests/protzilla/test_runner.py
+++ b/backend/tests/protzilla/test_runner.py
@@ -497,14 +497,16 @@ def test_integration_runner(
 def test_example_dataset_runner(tests_folder_name, monkeypatch):
     name = tests_folder_name + "/test_aml_paper_integration_" + random_string()
     runner = Runner(
-        workflow="example_dataset",
-        ms_data_path=None,
-        meta_data_path=None,
-        peptides_path=None,
-        run_name=name,
-        df_mode="memory",
-        all_plots=True,
-        verbose=False,
+        **{
+            "workflow": "example_dataset",
+            "ms_data_path": None,
+            "meta_data_path": None,
+            "peptides_path": None,
+            "run_name": name,
+            "df_mode": "memory",
+            "all_plots": True,
+            "verbose": False,
+        }
     )
 
     mock_write = mock.MagicMock()
@@ -523,7 +525,7 @@ def test_example_dataset_runner(tests_folder_name, monkeypatch):
 
     protein_list = pd.read_csv(TEST_AML_DATA_PATH / "preprocessed_protein_list.csv")
 
-    # Account for different secondary protein IDs by comparing the leading entry.
+    # Compare only the leading protein ID, as the additional IDs may differ
     protein_list_1 = protein_list["Protein IDs"].str.split(";").str[0]
     preprocessing_output_df_1 = (
         preprocessing_output_df["Protein ID"].str.split(";").str[0].unique()