From 4b9fc13f317a10ba285080d953b413558f6999d9 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Mon, 30 Jun 2025 11:36:40 +1000 Subject: [PATCH 1/4] ENH Only import pandas when needed Importing pandas is slow. This makes calling the tool when it is not needed noticeably faster --- spirepy/sample.py | 2 +- tests/unit/test_sample.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/spirepy/sample.py b/spirepy/sample.py index 354f3ca..efb8f75 100644 --- a/spirepy/sample.py +++ b/spirepy/sample.py @@ -2,7 +2,6 @@ import os.path as path import urllib.request -import pandas as pd import polars as pl from spirepy.data import cluster_metadata @@ -83,6 +82,7 @@ def get_eggnog_data(self) -> pl.dataframe.DataFrame: :rtype: :class:`polars.dataframe.DataFrame` """ if self._eggnog_data is None: + import pandas as pd egg = pd.read_csv( f"https://spire.embl.de/download_eggnog/{self.id}", sep="\t", diff --git a/tests/unit/test_sample.py b/tests/unit/test_sample.py index 7acf74c..a3231f4 100644 --- a/tests/unit/test_sample.py +++ b/tests/unit/test_sample.py @@ -90,7 +90,7 @@ def test_get_mags( mock_cluster_metadata.assert_called_once() assert_frame_equal(result2, expected_mags) - @patch("spirepy.sample.pd.read_csv") + @patch("pandas.read_csv") def test_get_eggnog_data(self, mock_read_csv: MagicMock): """Tests get_eggnog_data for data retrieval and caching.""" mock_df = pd.DataFrame({"gene": ["gene1"], "annotation": ["annot1"]}) From 6e4d027493d6142340ccae5d1982d0e6b0d17aff Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Mon, 30 Jun 2025 11:37:40 +1000 Subject: [PATCH 2/4] DOC Better docstrings Mention the caching & return values --- spirepy/data.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/spirepy/data.py b/spirepy/data.py index 160a605..77762ba 100644 --- a/spirepy/data.py +++ b/spirepy/data.py @@ -8,7 +8,13 @@ @memory.cache def cluster_metadata(): - """Fetches and caches the SPIRE cluster metadata from the 
remote server.""" + """Fetches and caches the SPIRE cluster metadata from the remote server. + + This is slow the first time, as it downloads a large file, but + subsequent calls will use the cached version. + + :return: A DataFrame with the SPIRE cluster metadata. + """ return pl.read_csv( "https://swifter.embl.de/~fullam/spire/metadata/spire_v1_cluster_metadata.tsv.gz", separator="\t", @@ -17,7 +23,13 @@ def cluster_metadata(): @memory.cache def genome_metadata(): - """Fetches and caches the SPIRE genome metadata from the remote server.""" + """Fetches and caches the SPIRE genome metadata from the remote server. + + This is slow the first time, as it downloads a large file, but + subsequent calls will use the cached version. + + :return: A DataFrame with the SPIRE genome metadata. + """ return pl.read_csv( "https://swifter.embl.de/~fullam/spire/metadata/spire_v1_genome_metadata.tsv.gz", separator="\t", From 5ec51a7454aa23c5b2cef87c1c21dd00f9fef431 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Mon, 30 Jun 2025 11:37:49 +1000 Subject: [PATCH 3/4] MIN Use SPIREpy as logger name --- spirepy/logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spirepy/logger.py b/spirepy/logger.py index d4cf142..006f473 100644 --- a/spirepy/logger.py +++ b/spirepy/logger.py @@ -5,5 +5,5 @@ handler = RichHandler(level=logging.WARNING, markup=True) handler.setFormatter(formatter) -logger = logging.getLogger("micro_fr_pred") +logger = logging.getLogger("SPIREpy") logger.addHandler(handler) From 35e460d3e4ce7f5f02589e52f9f38a5371d4f1a4 Mon Sep 17 00:00:00 2001 From: Luis Pedro Coelho Date: Mon, 30 Jun 2025 11:51:10 +1000 Subject: [PATCH 4/4] RFCT Use more polars-style code Use expressions to filter --- spirepy/sample.py | 8 ++++++-- spirepy/study.py | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/spirepy/sample.py b/spirepy/sample.py index efb8f75..e56950f 100644 --- a/spirepy/sample.py +++ b/spirepy/sample.py @@ -62,9 +62,11 
@@ def get_mags(self) -> pl.dataframe.DataFrame: if self._mags is None: cluster_meta = cluster_metadata() metadata = self.get_metadata() - clusters = metadata.filter(metadata["spire_cluster"] != "null") + clusters = metadata.filter( + pl.col("spire_cluster").is_not_null() + ) mags = cluster_meta.filter( - cluster_meta["spire_cluster"].is_in(clusters["spire_cluster"]) + pl.col("spire_cluster").is_in(clusters["spire_cluster"]) ) mags = mags.join(clusters, on="spire_cluster") mags = mags.select( @@ -82,6 +84,8 @@ def get_eggnog_data(self) -> pl.dataframe.DataFrame: :rtype: :class:`polars.dataframe.DataFrame` """ if self._eggnog_data is None: + # We need to use pandas because polars does not support reading + # files with a footer import pandas as pd egg = pd.read_csv( f"https://spire.embl.de/download_eggnog/{self.id}", diff --git a/spirepy/study.py b/spirepy/study.py index 65ae27c..f415042 100644 --- a/spirepy/study.py +++ b/spirepy/study.py @@ -67,10 +67,10 @@ def get_mags(self) -> pl.dataframe.DataFrame: if self._mags is None: genomes = genome_metadata() self._mags = genomes.filter( - genomes["derived_from_sample"].is_in( - self.get_metadata()["sample_id"].to_list() + pl.col("derived_from_sample").is_in( + self.get_metadata()["sample_id"].to_list() + ) ) - ) return self._mags def download_mags(self, output: str):