From 80401fa5375fc7b0c6b9e0d54c07ab5335d48dab Mon Sep 17 00:00:00 2001 From: Ingo Scholtes Date: Thu, 2 Oct 2025 07:14:47 +0000 Subject: [PATCH 1/5] removed graphtool parser --- src/pathpyG/io/__init__.py | 1 - src/pathpyG/io/graphtool.py | 298 -------------------------------- src/pathpyG/io/netzschleuder.py | 1 - 3 files changed, 300 deletions(-) delete mode 100644 src/pathpyG/io/graphtool.py diff --git a/src/pathpyG/io/__init__.py b/src/pathpyG/io/__init__.py index b11c8a258..30a9bc7ad 100644 --- a/src/pathpyG/io/__init__.py +++ b/src/pathpyG/io/__init__.py @@ -1,5 +1,4 @@ from pathpyG.io.netzschleuder import read_netzschleuder_graph, read_netzschleuder_record, list_netzschleuder_records -from pathpyG.io.graphtool import read_graphtool from pathpyG.io.pandas import ( df_to_graph, df_to_temporal_graph, diff --git a/src/pathpyG/io/graphtool.py b/src/pathpyG/io/graphtool.py deleted file mode 100644 index 0cc410af0..000000000 --- a/src/pathpyG/io/graphtool.py +++ /dev/null @@ -1,298 +0,0 @@ -from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union - -import pickle -import struct - -from numpy import array -import torch - -from pathpyG.core.graph import Graph -from pathpyG.utils.config import config - - -def _parse_property_value(data: bytes, ptr: int, type_index: int, endianness: str) -> Tuple[Optional[Any], int]: - """ - Parse a property value as well as the number of processed bytes. - - Args: - data: byte array containing the data to be decoded - ptr: index of the first byte to be parsed - type_index: integer representing the type of the property value to be parsed - endianness: string representation of endianness, where `>` represents Big Endian - and `<` represents Little Endian - - Returns: - Tuple $(v, n)$ consisting of the property value $v$ and the number of bytes $n$ processed - """ - if type_index == 0: - return (bool(data[ptr]), 1) - elif type_index == 1: - return (struct.unpack(endianness + "h", data[ptr : ptr + 2])[0], 2) - elif type_index == 2: - return (struct.unpack(endianness + "i", data[ptr : ptr + 4])[0], 4) - elif type_index == 3: - return (struct.unpack(endianness + "q", data[ptr : ptr + 8])[0], 8) - elif type_index == 4: - return (struct.unpack(endianness + "d", data[ptr : ptr + 8])[0], 8) - elif type_index == 5: - print("pathpy does not support properties with type long double. Properties have been dropped.") - return (None, 16) - elif type_index == 6: - str_len = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - str = data[ptr + 8 : ptr + 8 + str_len].decode("utf-8") - return (str, 8 + str_len) - elif type_index == 7: - num_values = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - vals = [] - for i in range(num_values): - vals.append(bool(data[ptr + offset : ptr + offset + 1])) - offset += 1 - return (array(vals), 8 + num_values) - elif type_index == 8: - num_values = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - vals = [] - for i in range(num_values): - vals.append(struct.unpack(endianness + "h", data[ptr + offset : ptr + offset + 2])[0]) - offset += 4 - return (array(vals), 8 + 2 * num_values) - elif type_index == 9: - num_values = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - vals = [] - for i in range(num_values): - vals.append(struct.unpack(endianness + "i", data[ptr + offset : ptr + offset + 4])[0]) - offset += 4 - return (array(vals), 8 + 4 * num_values) - elif type_index == 10: - num_values = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - vals = [] - for i in range(num_values): - vals.append(struct.unpack(endianness + "Q", data[ptr + offset : ptr + offset + 8])[0]) - offset += 8 - return (None, 8 + 8 * num_values) - elif type_index == 11: - num_values = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - vals = [] - for i in range(num_values): - vals.append(struct.unpack(endianness + "d", data[ptr + offset : ptr + offset + 8])[0]) - offset += 8 - return (array(vals), 8 + 8 * num_values) - elif type_index == 12: - val_len = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - print("pathpyG does not support properties with type vector. Properties have been dropped.") - return (None, 8 + 16 * val_len) - elif type_index == 13: - num_strings = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - offset = 8 - strs = [] - for i in range(num_strings): - str_len = struct.unpack(endianness + "Q", data[ptr + offset : ptr + offset + 8])[0] - offset += 8 - strs.append(data[ptr + offset : ptr + offset + str_len].decode("utf-8")) - offset += str_len - - return (strs, offset) - elif type_index == 14: - val_len = struct.unpack(endianness + "Q", data[ptr : ptr + 8])[0] - return (pickle.loads(data[ptr + 8 : ptr + 8 + val_len]), 8 + val_len) - else: - msg = "Unknown type index {0} while parsing graphtool file".format(type_index) - print(msg) - raise Exception(msg) - - -def parse_graphtool_format(data: bytes, id_node_attr=None) -> Graph: - """ - Decodes data in graphtool binary format and returns a [`Graph`][pathpyG.Graph]. For a documentation of - the graphtool binary format, see see doc at https://graph-tool.skewed.de/static/doc/gt_format.html - - Args: - data: Array of bytes to be decoded - - Returns: - Graph: a static graph - """ - - # check magic bytes - if data[0:6] != b"\xe2\x9b\xbe\x20\x67\x74": - print("Invalid graphtool file. Wrong magic bytes.") - raise Exception("Invalid graphtool file. Wrong magic bytes.") - ptr = 6 - - # read graphtool version byte - graphtool_version = int(data[ptr]) - ptr += 1 - - # read endianness - if bool(data[ptr]): - graphtool_endianness = ">" - else: - graphtool_endianness = "<" - ptr += 1 - - # read length of comment - str_len = struct.unpack(graphtool_endianness + "Q", data[ptr : ptr + 8])[0] - ptr += 8 - - # read string comment - comment = data[ptr : ptr + str_len].decode("ascii") - ptr += str_len - - # read network directedness - directed = bool(data[ptr]) - ptr += 1 - - # read number of nodes - n_nodes = struct.unpack(graphtool_endianness + "Q", data[ptr : ptr + 8])[0] - ptr += 8 - - # create pandas dataframe - network_dict = {} - # n = Network(directed = directed, multiedges=True) - - # determine binary representation of neighbour lists - if n_nodes < 2**8: - fmt = "B" - d = 1 - elif n_nodes < 2**16: - fmt = "H" - d = 2 - elif n_nodes < 2**32: - fmt = "I" - d = 4 - else: - fmt = "Q" - d = 8 - - sources = [] - targets = [] - # parse lists of out-neighbors for all n nodes - n_edges = 0 - for v in range(n_nodes): - # read number of neighbors - num_neighbors = struct.unpack(graphtool_endianness + "Q", data[ptr : ptr + 8])[0] - ptr += 8 - - # add edges to record - for _ in range(num_neighbors): - w = struct.unpack(graphtool_endianness + fmt, data[ptr : ptr + d])[0] - ptr += d - sources.append(v) - targets.append(w) - n_edges += 1 - - # collect attributes from property maps - graph_attr = dict() - node_attr = dict() - edge_attr = dict() - - # parse property maps - property_maps = struct.unpack(graphtool_endianness + "Q", data[ptr : ptr + 8])[0] - ptr += 8 - - for _ in range(property_maps): - key_type = struct.unpack(graphtool_endianness + "B", data[ptr : ptr + 1])[0] - ptr += 1 - - property_len = struct.unpack(graphtool_endianness + "Q", data[ptr : ptr + 8])[0] - ptr += 8 - - property_name = data[ptr : ptr + property_len].decode("ascii") - ptr += property_len - - property_type = struct.unpack(graphtool_endianness + "B", data[ptr : ptr + 1])[0] - ptr += 1 - - if key_type == 0: # graph-level property - res = _parse_property_value(data, ptr, property_type, graphtool_endianness) - graph_attr[property_name] = res[0] - ptr += res[1] - elif key_type == 1: # node-level property - if property_name not in node_attr: - node_attr[property_name] = [] - for v in range(n_nodes): - res = _parse_property_value(data, ptr, property_type, graphtool_endianness) - node_attr[property_name].append([res[0]]) - ptr += res[1] - elif key_type == 2: # edge-level property - if property_name not in edge_attr: - edge_attr[property_name] = [] - for e in range(n_edges): - res = _parse_property_value(data, ptr, property_type, graphtool_endianness) - edge_attr[property_name].append(res[0]) - ptr += res[1] - else: - print("Unknown key type {0}".format(key_type)) - - # LOG.info('Version \t= {0}'.format(graphtool_version)) - # LOG.info('Endianness \t= {0}'.format(graphtool_endianness)) - # LOG.info('comment size \t= {0}'.format(str_len)) - # LOG.info('comment \t= {0}'.format(comment)) - # LOG.info('directed \t= {0}'.format(directed)) - # LOG.info('nodes \t\t= {0}'.format(n_nodes)) - - # add edge properties to data frame - # for p in edge_attribute_names: - # # due to use of default_dict, this will add NA values to edges which have missing properties - # network_data[p] = [ edge_attributes[e][p] for e in range(n_edges) ] - - # create graph from pandas dataframe - - # if 'time' in edge_attribute_names and not ignore_temporal: - # raise Exception('') - # n = to_temporal_network(network_data, directed=directed, **network_attributes) - # else: - - if id_node_attr: - mapping = pp.IndexMap(node_attr[id_node_attr]) - else: - mapping = None - - g = Graph.from_edge_index(torch.LongTensor([sources, targets]).to(config["torch"]["device"]), mapping=mapping) - for a in node_attr: - if not a.startswith("node_"): - # print(node_attr[a]) - g.data["node_{0}".format(a)] = node_attr[a] - for a in edge_attr: - if not a.startswith("edge_"): - g.data["edge_{0}".format(a)] = torch.tensor(edge_attr[a], dtype=torch.float).to(config["torch"]["device"]) - for a in graph_attr: - g.data[a] = graph_attr[a] - - if not directed: - return g.to_undirected() - return g - - # for v in node_attributes: - # for p in node_attributes[v]: - # # for now we remove _pos for temporal networks due to type being incompatible with plotting - # if p != '_pos' or ('time' not in edge_attribute_names or ignore_temporal): - # n.nodes[v][p] = node_attributes[v][p] - - -def read_graphtool(file: str, multiedges: bool = False) -> Graph: - """ - Read a file in graphtool binary format. - - Args: - file: Path to graphtool file to be read - """ - with open(file, "rb") as f: - if ".zst" in file: - try: - import zstandard as zstd - - dctx = zstd.ZstdDecompressor() - data = f.read() - return parse_graphtool_format(dctx.decompress(data, max_output_size=len(data))) - except ModuleNotFoundError: - msg = 'Package zstandard is required to decompress graphtool files. Please install module, e.g., using "pip install zstandard".' - # LOG.error(msg) - raise Exception(msg) - else: - return parse_graphtool_format(f.read(), multiedges) diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index 6024b3917..316aa27b6 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -12,7 +12,6 @@ from pathpyG.core.graph import Graph from pathpyG.io.pandas import df_to_graph, df_to_temporal_graph -from pathpyG.io.graphtool import parse_graphtool_format from pathpyG.io.pandas import add_node_attributes From 84f2f68619c1c51f798d36e22e10c3f74da8a0f9 Mon Sep 17 00:00:00 2001 From: Ingo Scholtes Date: Thu, 2 Oct 2025 07:23:20 +0000 Subject: [PATCH 2/5] simplified netzschleuder code --- src/pathpyG/io/netzschleuder.py | 147 +++++++++++++------------------- 1 file changed, 61 insertions(+), 86 deletions(-) diff --git a/src/pathpyG/io/netzschleuder.py b/src/pathpyG/io/netzschleuder.py index 316aa27b6..9c7afeebd 100644 --- a/src/pathpyG/io/netzschleuder.py +++ b/src/pathpyG/io/netzschleuder.py @@ -1,5 +1,5 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Optional, Union import json from urllib import request @@ -11,6 +11,7 @@ import pandas as pd from pathpyG.core.graph import Graph +from pathpyG.core.temporal_graph import TemporalGraph from pathpyG.io.pandas import df_to_graph, df_to_temporal_graph from pathpyG.io.pandas import add_node_attributes @@ -91,127 +92,101 @@ def read_netzschleuder_record(name: str, base_url: str = "https://networks.skewe def read_netzschleuder_graph( name: str, - net: Optional[str] = None, + network: Optional[str] = None, multiedges: bool = False, time_attr: Optional[str] = None, - base_url: str = "https://networks.skewed.de", - format="csv", -) -> Graph: - """Read a pathpyG graph or temporal graph from the netzschleuder repository. + base_url: str = "https://networks.skewed.de" +) -> Union[Graph, TemporalGraph]: + """Read a graph or temporal graph from the netzschleuder repository. Args: name: Name of the network data set to read from - net: Identifier of the network within the data set to read. For data sets + network: Identifier of the network within the data set to read. For data sets containing a single network only, this can be set to None. ignore_temporal: If False, this function will return a static or temporal network depending on whether edges contain a time attribute. If True, pathpy will not interpret time attributes and thus always return a static network. base_url: Base URL of netzschleuder repository - format: for 'csv' a zipped csv file will be downloaded, for 'gt' the binary graphtool format will be retrieved via the API Examples: Read network '77' from karate club data set >>> import pathpyG as pp - >>> n = pp.io.read_netzschleuder_network('karate', '77') + >>> n = pp.io.read_netzschleuder_network(name='karate', network='77') >>> print(type(n)) >>> pp.plot(n) pp.Graph - Returns: - an instance of Graph - + Graph or TemporalGraph object """ # build URL - try: # retrieve properties of data record via API properties = json.loads(request.urlopen(f"{base_url}/api/net/{name}").read()) - # print(properties) timestamps = not (time_attr is None) - if not net: + if not network: analyses = properties["analyses"] - net = name + network = name else: - analyses = properties["analyses"][net] + analyses = properties["analyses"][network] try: is_directed = analyses["is_directed"] num_nodes = analyses["num_vertices"] except KeyError: raise Exception(f"Record {name} contains multiple networks, please specify network name.") - - if format == "csv": - url = f"{base_url}/net/{name}/files/{net}.csv.zip" - try: - response = request.urlopen(url) - - # decompress zip into temporary folder - data = BytesIO(response.read()) - - with zipfile.ZipFile(data, "r") as zip_ref: - with tempfile.TemporaryDirectory() as temp_dir: - zip_ref.extractall(path=temp_dir) - - # the gprop file contains lines with property name/value pairs - # gprops = pd.read_csv(f'{temp_dir}/gprops.csv', header=0, sep=',', skip_blank_lines=True, skipinitialspace=True) - - # nodes.csv contains node indices with node properties (like name) - edges = pd.read_csv( - f"{temp_dir}/edges.csv", header=0, sep=",", skip_blank_lines=True, skipinitialspace=True - ) - - # rename columns - edges.rename(columns={"# source": "v", "target": "w"}, inplace=True) - if timestamps and time_attr: - edges.rename(columns={time_attr: "t"}, inplace=True) - - # construct graph and assign edge attributes - if timestamps: - g = df_to_temporal_graph(df=edges, multiedges=multiedges, num_nodes=num_nodes) - else: - g = df_to_graph(df=edges, multiedges=multiedges, is_undirected=not is_directed, num_nodes=num_nodes) - - node_attrs = pd.read_csv( - f"{temp_dir}/nodes.csv", header=0, sep=",", skip_blank_lines=True, skipinitialspace=True - ) - node_attrs.rename(columns={"# index": "index"}, inplace=True) - - add_node_attributes(node_attrs, g) - - # add graph-level attributes - for x in analyses: - g.data["analyses_" + x] = analyses[x] - - return g - except HTTPError: - msg = f"Could not retrieve netzschleuder record at {url}" - raise Exception(msg) - - elif format == "gt": - try: - import zstandard as zstd - - url = f"/net/{name}/files/{net}.gt.zst" - try: - f = request.urlopen(base_url + url) - # decompress data - dctx = zstd.ZstdDecompressor() - reader = dctx.stream_reader(f) - decompressed = reader.readall() - - # parse graphtool binary format - return parse_graphtool_format(bytes(decompressed)) - except HTTPError: - msg = f"Could not retrieve netzschleuder record at {url}" - raise Exception(msg) - except ModuleNotFoundError: - msg = 'Package zstandard is required to decompress graphtool files. Please install module, e.g., using "pip install zstandard.' - # LOG.error(msg) - raise Exception(msg) + + # Retrieve CSV data + url = f"{base_url}/net/{name}/files/{network}.csv.zip" + try: + response = request.urlopen(url) + + # decompress zip into temporary folder + data = BytesIO(response.read()) + + with zipfile.ZipFile(data, "r") as zip_ref: + with tempfile.TemporaryDirectory() as temp_dir: + zip_ref.extractall(path=temp_dir) + + # the gprop file contains lines with property name/value pairs + # gprops = pd.read_csv(f'{temp_dir}/gprops.csv', header=0, sep=',', skip_blank_lines=True, skipinitialspace=True) + + # nodes.csv contains node indices with node properties (like name) + edges = pd.read_csv( + f"{temp_dir}/edges.csv", header=0, sep=",", skip_blank_lines=True, skipinitialspace=True + ) + + # rename columns + edges.rename(columns={"# source": "v", "target": "w"}, inplace=True) + if timestamps and time_attr: + edges.rename(columns={time_attr: "t"}, inplace=True) + + # construct graph and assign edge attributes + if timestamps: + g = df_to_temporal_graph(df=edges, multiedges=multiedges, num_nodes=num_nodes) + else: + g = df_to_graph(df=edges, multiedges=multiedges, + is_undirected=not is_directed, num_nodes=num_nodes) + + node_attrs = pd.read_csv( + f"{temp_dir}/nodes.csv", header=0, sep=",", skip_blank_lines=True, skipinitialspace=True + ) + node_attrs.rename(columns={"# index": "index"}, inplace=True) + + add_node_attributes(node_attrs, g) + + # add graph-level attributes + for x in analyses: + g.data["analyses_" + x] = analyses[x] + + return g + except HTTPError: + msg = f"Could not retrieve netzschleuder record at {url}" + raise Exception(msg) except HTTPError: msg = f"Could not retrieve netzschleuder record at {base_url}/api/net/{name}" raise Exception(msg) + return None From 4953952d936a907ccb2544512f06151039d3db6d Mon Sep 17 00:00:00 2001 From: Ingo Scholtes Date: Thu, 2 Oct 2025 07:23:30 +0000 Subject: [PATCH 3/5] updated Dockerfile --- .github/workflows/Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/Dockerfile b/.github/workflows/Dockerfile index 888afa27a..bbdeae951 100644 --- a/.github/workflows/Dockerfile +++ b/.github/workflows/Dockerfile @@ -14,5 +14,5 @@ RUN pip install torch==2.4.1+cu121 --index-url https://download.pytorch.org/whl/ RUN pip install torch_geometric>=2.5.0 RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.4.0+cu121.html -# install pathpy -RUN pip install -e '.[dev,test,doc,vis]' \ No newline at end of file +# install pathpyG +RUN pip install -e . \ No newline at end of file From c8f89c4b348332c93e9163e3778c769406f6530e Mon Sep 17 00:00:00 2001 From: Ingo Scholtes Date: Thu, 2 Oct 2025 07:34:53 +0000 Subject: [PATCH 4/5] updated tutorial --- docs/tutorial/netzschleuder.ipynb | 103 ++++++++++++++++++------------ 1 file changed, 61 insertions(+), 42 deletions(-) diff --git a/docs/tutorial/netzschleuder.ipynb b/docs/tutorial/netzschleuder.ipynb index 3ec56bd1b..8fe8c2375 100644 --- a/docs/tutorial/netzschleuder.ipynb +++ b/docs/tutorial/netzschleuder.ipynb @@ -42,7 +42,16 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.11/site-packages/torch/__config__.py:10: UserWarning: CUDA initialization: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 500: named symbol not found (Triggered internally at ../c10/cuda/CUDAFunctions.cpp:108.)\n", + " return torch._C._show_config()\n" + ] + } + ], "source": [ "from matplotlib import pyplot as plt\n", "\n", @@ -81,14 +90,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Undirected graph with 105 nodes and 882 (directed) edges\n", + "Undirected graph with 105 nodes and 441 edges\n", "{ 'Edge Attributes': {},\n", " 'Graph Attributes': { 'analyses_average_degree': \"\",\n", " 'analyses_degree_assortativity': \"\",\n", @@ -114,7 +123,7 @@ } ], "source": [ - "g = pp.io.read_netzschleuder_graph('polbooks')\n", + "g = pp.io.read_netzschleuder_graph(name='polbooks')\n", "print(g)" ] }, @@ -127,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -161,7 +170,7 @@ "\n", "\n", "\n", - "
\n", + "
\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "\n", "