From dc31383d27c246135bd1ba863ccffc66a852e7ab Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 13:57:55 +0100 Subject: [PATCH 1/5] Implement an API for temporal graphs --- fedivertex/main.py | 99 +++++++++++++++++++++++++++++++++++++++++++- setup.py | 5 ++- tests/test_loader.py | 36 ++++++++++++++++ 3 files changed, 136 insertions(+), 4 deletions(-) diff --git a/fedivertex/main.py b/fedivertex/main.py index 1eb755b..0764257 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -1,8 +1,10 @@ import json from types import NoneType -from typing import List, Optional +from typing import List, Optional, Tuple + import mlcroissant as mlc import networkx as nx +import networkx_temporal as tx from tqdm import tqdm @@ -198,7 +200,7 @@ def get_graph( graph.nodes[host]["domain"] = host.split("[DOT]")[-1] for col, val in record.items(): col_name = col.split("/")[-1] - if type(val) == bytes: + if type(val) is bytes: val = val.decode() if col_name not in ["host", "Id", "Label"]: graph.nodes[host][col_name] = val @@ -222,3 +224,96 @@ def get_graph( graph = graph.subgraph(largest_cc).copy() return graph + + def get_temporal_graph( + self, + software: str, + graph_type: str, + index: Optional[Tuple[int, int]] = None, + date: Optional[Tuple[str, str]] = None, + disable_tqdm: bool = False, + ) -> tx.TemporalGraph: + """Provide a temporal graph (a sequence of graph snapshots) for a given software and graph type. + By default, all the available snapshots are loaded, but a subset can be selected using a date range or an index range. 
+ + :param software: software whose graphs are loaded (e.g., "peertube") + :type software: str + :param graph_type: type of graph to load (e.g., "follow" or "federation") + :type graph_type: str + :param index: index range for the graphs (bounds are included), defaults to None + :type index: Optional[Tuple[int, int]], optional + :param date: date range for the graphs (bounds are included), defaults to None + :type date: Optional[Tuple[str, str]], optional + :param disable_tqdm: disables the TQDM progress bars, defaults to False + :type disable_tqdm: bool, optional + :raises ValueError: if both a date range and an index range are provided, or if the provided range is invalid. + :return: a temporal graph in the NetworkX-Temporal format, with one snapshot per selected date + :rtype: tx.TemporalGraph + """ + self._check_input(software, graph_type) + + if software == "mastodon" and graph_type == "federation": + resp = input( + """Each Mastodon Federation graph is 1GB large.\n + Storing the temporal graph might take a lot of space in memory, + are you sure you want to load it? [yes or no]""" + ) + if resp.lower() not in ["yes", "y", "yeah"]: + raise KeyboardInterrupt + + availables_dates = self.list_available_dates(software, graph_type) + selected_dates = [] + if index is None and date is None: + # Fetch all graphs + selected_dates = availables_dates + elif index is not None and date is not None: + raise ValueError( + "You must provide either the date or the index range of the graph, not both." 
+ ) + elif index is not None: + if len(index) > 2: + raise ValueError("Incorrect format for the index range") + if index[0] > index[1]: + raise ValueError("Incorrect index range") + if index[0] < 0 or index[1] > len(availables_dates) - 1: + raise ValueError( + f"Indices are out of the acceptable range (0,{len(availables_dates) - 1})" + ) + + selected_dates = availables_dates[index[0] : index[1] + 1] + else: # date is not None: + assert date is not None + if len(date) > 2: + raise ValueError("Incorrect format for the date range") + + min_date, max_date = date + try: + min_date = int(min_date) + max_date = int(max_date) + except ValueError as err: + raise ValueError("Invalid date format") from err + + if ( + min_date > int(availables_dates[-1]) + or int(availables_dates[0]) > max_date + ): + raise ValueError( + f"Indices not covering the available dates: ({availables_dates[0]},{availables_dates[-1]})" + ) + + for selected_date in availables_dates: + int_date = int(selected_date) + if min_date <= int_date and int_date <= max_date: + selected_dates.append(selected_date) + + selected_graphs = [] + for selected_date in selected_dates: + graph = self.get_graph( + software=software, + graph_type=graph_type, + date=selected_date, + disable_tqdm=disable_tqdm, + ) + selected_graphs.append(graph) + + return tx.from_snapshots(selected_graphs) diff --git a/setup.py b/setup.py index 3a54212..83ca05d 100644 --- a/setup.py +++ b/setup.py @@ -1,12 +1,13 @@ -from setuptools import setup, find_packages from pathlib import Path +from setuptools import find_packages, setup + this_directory = Path(__file__).parent long_description = (this_directory / "README.md").read_text() setup( name="fedivertex", - version="0.9.9", + version="1.0.0", author="Marc DAMIE", author_email="marc.damie@inria.fr", description="Interface to download and interact with Fedivertex, the Fediverse Graph Dataset", diff --git a/tests/test_loader.py b/tests/test_loader.py index 4fc4b0a..b4b81a5 100644 --- 
a/tests/test_loader.py +++ b/tests/test_loader.py @@ -137,3 +137,39 @@ def test_get_graph(): ) assert bookwyrm_graph.number_of_nodes() == 70 assert bookwyrm_graph.number_of_edges() == 1827 + + +def test_get_temporal_graph(): + loader = GraphLoader() + + with pytest.raises(ValueError): + loader.get_temporal_graph("NON-EXISTING", "federation") + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "NON-EXISTING") + + with pytest.raises(ValueError): + loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250217"), index=(3, 7) + ) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", index=(-1, 7)) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", index=(3, 70000000000)) + + with pytest.raises(ValueError): + loader.get_temporal_graph("peertube", "follow", date=("20210203", "20210217")) + + temporal_graph = loader.get_temporal_graph( + "peertube", "follow", date=("20250203", "20250617") + ) + assert len(temporal_graph.temporal_nodes()) == 1157 + assert len(temporal_graph.temporal_edges()) == 310695 + assert temporal_graph.number_of_snapshots() == 20 + + temporal_graph = loader.get_temporal_graph("peertube", "follow", index=(0, 7)) + assert len(temporal_graph.temporal_nodes()) == 991 + assert len(temporal_graph.temporal_edges()) == 133852 + assert temporal_graph.number_of_snapshots() == 8 From f85d22b3bc5f1bc9af585669bbf09ae083762a83 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 14:13:02 +0100 Subject: [PATCH 2/5] Add a missing dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 83ca05d..c26e97d 100644 --- a/setup.py +++ b/setup.py @@ -20,6 +20,7 @@ "numpy<2.0", # To be compatible with mlcroissant "mlcroissant", "networkx", + "networkx-temporal", "tqdm", ], extras_require={"test": ["pytest", "pytest-coverage"]}, From dc5ae896df40d653f69e8a8848a958b33110357a Mon Sep 17 00:00:00 2001 From: Marc Damie 
Date: Thu, 12 Feb 2026 16:27:52 +0100 Subject: [PATCH 3/5] Add a light dataset option to reduce disk space usage --- fedivertex/main.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fedivertex/main.py b/fedivertex/main.py index 0764257..a57f7e5 100644 --- a/fedivertex/main.py +++ b/fedivertex/main.py @@ -25,10 +25,12 @@ class GraphLoader: } UNDIRECTED_GRAPHS = ["federation"] - def __init__( - self, - url="https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download", - ): + def __init__(self, light_version=True): + self.light_version = light_version + if self.light_version: + url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset-reduced/croissant/download" + else: + url = "https://www.kaggle.com/datasets/marcdamie/fediverse-graph-dataset/croissant/download" try: self.dataset = mlc.Dataset(jsonld=url) except json.JSONDecodeError as err: @@ -58,6 +60,12 @@ def _check_input(self, software: str, graph_type: str) -> NoneType: f"{graph_type} is not a valid graph type for {software}. Valid types: {self.VALID_GRAPH_TYPES[software]}" ) + if self.light_version and software == "mastodon" and graph_type == "federation": + raise ValueError( + f"The graph {software} {graph_type} is not included in the light version of Fedivertex\n" + "To download the full version, generate the dataset loader as follows: `GraphLoader(light_version=False)`" + ) + def _fetch_date_index(self, software: str, graph_type: str, index: int) -> str: """Returns the i-th date available for a given graph type. The dates are sorted increasingly. 
From eb0cdc3fa47ab6ca1cf32be67570288c273219bb Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Thu, 12 Feb 2026 16:33:23 +0100 Subject: [PATCH 4/5] Extend the pytest execution to Python 3.12 --- .github/workflows/pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml index f8bfd56..662165d 100644 --- a/.github/workflows/pytest.yml +++ b/.github/workflows/pytest.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.10.x,3.11.x] + python-version: [3.10.x, 3.11.x, 3.12.x] steps: - uses: actions/checkout@v3 From 5e5b510d4ca9eb7f680374ae086cfa51ee4d0ed6 Mon Sep 17 00:00:00 2001 From: Marc Damie Date: Tue, 17 Feb 2026 10:55:13 +0100 Subject: [PATCH 5/5] Added more extensive tests --- tests/test_loader.py | 82 ++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 26 deletions(-) diff --git a/tests/test_loader.py b/tests/test_loader.py index b4b81a5..7f541cf 100644 --- a/tests/test_loader.py +++ b/tests/test_loader.py @@ -56,7 +56,7 @@ def test_index_selection(): assert loader._fetch_date_index("peertube", "follow", -1) == latest_date -def test_get_graph(): +def test_get_graph_errors(): loader = GraphLoader() with pytest.raises(ValueError): @@ -68,41 +68,71 @@ def test_get_graph(): with pytest.raises(ValueError): loader.get_graph("peertube", "follow", date="20250203", index=3) - # No error with latest date - for software, graph_type_list in loader.VALID_GRAPH_TYPES.items(): - if software == "mastodon": # To avoid parsing the massive mastodon graph + +def _iter_software_graph(): + loader = GraphLoader() + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + if software == "mastodon": continue + for graph_type in graph_types: + if graph_type == "federation": + continue + yield software, graph_type + + +@pytest.mark.parametrize("software,graph_type", list(_iter_software_graph())) +def test_get_graph_selection(software, 
graph_type): + loader = GraphLoader() + + date = loader._fetch_latest_date(software, graph_type) - for graph_type in graph_type_list: - date = loader._fetch_latest_date(software, graph_type) + # Test date selection + graph1 = loader.get_graph(software, graph_type, date=date) - # Test date selection - graph1 = loader.get_graph(software, graph_type, date=date) + if not graph_type == "federation": # Because Federation is undirected + csv_file = f"{software}/{graph_type}/{date}/interactions.csv" + records = loader.dataset.records(csv_file) - if not graph_type == "federation": # Because Federation is undirected - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = loader.dataset.records(csv_file) + assert graph1.number_of_edges() == len(list(records)) - assert graph1.number_of_edges() == len(list(records)) + # Test index selection + graph2 = loader.get_graph(software, graph_type, index=-1) + assert graph1.number_of_edges() == graph2.number_of_edges() - # Test index selection - graph2 = loader.get_graph(software, graph_type, index=-1) - assert graph1.number_of_edges() == graph2.number_of_edges() + available_dates = loader.list_available_dates(software, graph_type) + date = available_dates[0] + graph3 = loader.get_graph(software, graph_type, date=date) - ########### FURTHER TESTS ############### - available_dates = loader.list_available_dates(software, graph_type) - date = available_dates[0] - graph3 = loader.get_graph(software, graph_type, date=date) + graph4 = loader.get_graph(software, graph_type, index=0) + assert graph3.number_of_edges() == graph4.number_of_edges() - if not graph_type == "federation": # Because Federation is undirected - csv_file = f"{software}/{graph_type}/{date}/interactions.csv" - records = loader.dataset.records(csv_file) - assert graph3.number_of_edges() == len(list(records)) +def _iter_software_graph_date(): + loader = GraphLoader() + for software, graph_types in loader.VALID_GRAPH_TYPES.items(): + if software == 
"mastodon": + continue + for graph_type in graph_types: + if graph_type == "federation": + continue + for date in loader.list_available_dates(software, graph_type): + yield software, graph_type, date - # Test index selection - graph4 = loader.get_graph(software, graph_type, index=0) - assert graph3.number_of_edges() == graph4.number_of_edges() + +@pytest.mark.parametrize("software,graph_type,date", list(_iter_software_graph_date())) +def test_get_graph_sizes(software, graph_type, date): + loader = GraphLoader() + + graph = loader.get_graph(software, graph_type, date=date) + csv_file = f"{software}/{graph_type}/{date}/interactions.csv" + records = list(loader.dataset.records(csv_file)) + + assert graph.number_of_edges() == len(records) # Verify that we load all the edges + # NB: an error can also occur in case of data cleaning issue in the dataset + + +def test_graph_consistency(): + loader = GraphLoader() # Check graph consistency peertube_graph = loader.get_graph("peertube", "follow", date="20250324")