diff --git a/spirepy/data.py b/spirepy/data.py index 160a605..77762ba 100644 --- a/spirepy/data.py +++ b/spirepy/data.py @@ -8,7 +8,13 @@ @memory.cache def cluster_metadata(): - """Fetches and caches the SPIRE cluster metadata from the remote server.""" + """Fetches and caches the SPIRE cluster metadata from the remote server. + + This is slow on the first call, as it downloads a large file, but + subsequent calls will use the cached version. + + :return: A DataFrame with the SPIRE cluster metadata. + """ return pl.read_csv( "https://swifter.embl.de/~fullam/spire/metadata/spire_v1_cluster_metadata.tsv.gz", separator="\t", @@ -17,7 +23,13 @@ def cluster_metadata(): @memory.cache def genome_metadata(): - """Fetches and caches the SPIRE genome metadata from the remote server.""" + """Fetches and caches the SPIRE genome metadata from the remote server. + + This is slow on the first call, as it downloads a large file, but + subsequent calls will use the cached version. + + :return: A DataFrame with the SPIRE genome metadata. 
+ """ return pl.read_csv( "https://swifter.embl.de/~fullam/spire/metadata/spire_v1_genome_metadata.tsv.gz", separator="\t", diff --git a/spirepy/logger.py b/spirepy/logger.py index d4cf142..006f473 100644 --- a/spirepy/logger.py +++ b/spirepy/logger.py @@ -5,5 +5,5 @@ handler = RichHandler(level=logging.WARNING, markup=True) handler.setFormatter(formatter) -logger = logging.getLogger("micro_fr_pred") +logger = logging.getLogger("SPIREpy") logger.addHandler(handler) diff --git a/spirepy/sample.py b/spirepy/sample.py index 354f3ca..e56950f 100644 --- a/spirepy/sample.py +++ b/spirepy/sample.py @@ -2,7 +2,6 @@ import os.path as path import urllib.request -import pandas as pd import polars as pl from spirepy.data import cluster_metadata @@ -63,9 +62,11 @@ def get_mags(self) -> pl.dataframe.DataFrame: if self._mags is None: cluster_meta = cluster_metadata() metadata = self.get_metadata() - clusters = metadata.filter(metadata["spire_cluster"] != "null") + clusters = metadata.filter( + pl.col("spire_cluster").is_not_null() + ) mags = cluster_meta.filter( - cluster_meta["spire_cluster"].is_in(clusters["spire_cluster"]) + pl.col("spire_cluster").is_in(clusters["spire_cluster"]) ) mags = mags.join(clusters, on="spire_cluster") mags = mags.select( @@ -83,6 +84,9 @@ def get_eggnog_data(self) -> pl.dataframe.DataFrame: :rtype: :class:`polars.dataframe.DataFrame` """ if self._eggnog_data is None: + # We need to use pandas because polars does not support reading + # files with a footer + import pandas as pd egg = pd.read_csv( f"https://spire.embl.de/download_eggnog/{self.id}", sep="\t", diff --git a/spirepy/study.py b/spirepy/study.py index 65ae27c..f415042 100644 --- a/spirepy/study.py +++ b/spirepy/study.py @@ -67,10 +67,10 @@ def get_mags(self) -> pl.dataframe.DataFrame: if self._mags is None: genomes = genome_metadata() self._mags = genomes.filter( - genomes["derived_from_sample"].is_in( - self.get_metadata()["sample_id"].to_list() + pl.col("derived_from_sample").is_in( + 
self.get_metadata()["sample_id"].to_list() + ) ) - ) return self._mags def download_mags(self, output: str): diff --git a/tests/unit/test_sample.py b/tests/unit/test_sample.py index 7acf74c..a3231f4 100644 --- a/tests/unit/test_sample.py +++ b/tests/unit/test_sample.py @@ -90,7 +90,7 @@ def test_get_mags( mock_cluster_metadata.assert_called_once() assert_frame_equal(result2, expected_mags) - @patch("spirepy.sample.pd.read_csv") + @patch("pandas.read_csv") def test_get_eggnog_data(self, mock_read_csv: MagicMock): """Tests get_eggnog_data for data retrieval and caching.""" mock_df = pd.DataFrame({"gene": ["gene1"], "annotation": ["annot1"]})