From 4ea729a6590ead40ac800d5592b22e5c7bc0a16a Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Tue, 16 Sep 2025 14:50:20 +0200 Subject: [PATCH 1/7] Add criteria with no object --- tests/datadoc/test_dataset.py | 28 ++++++++++++++++++++++++- tests/input/semdata.yaml | 11 ++++++++++ tripper/datadoc/dataset.py | 39 ++++++++++++++++++++++++++++------- 3 files changed, 69 insertions(+), 9 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index d1bb2294..43f89fb9 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -538,6 +538,30 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": ""}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": None}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -545,10 +569,12 @@ def test_datadoc(): assert set(search(ts, regex={"dcterms:title": "SEM images"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } assert set(search(ts, regex={"dcterms:title": "SEM i[^ ]*s"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } # Get individual with given IRI @@ -627,7 +653,7 @@ def test_validate(): def test_pipeline(): """Test creating OTEAPI pipeline.""" - pytest.skip() + # pytest.skip() from tripper import Triplestore diff --git a/tests/input/semdata.yaml 
b/tests/input/semdata.yaml index ce66d8d1..0835f1c7 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -73,6 +73,17 @@ Dataset: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory + - "@id": semdata:SEM_cement_missingcreator + "@type": sem:SEMImageSeries + title: Nested series of SEM images which is missing a creator + description: ... + + distribution: + downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator + mediaType: inode/directory + + + Parser: - "@id": par:sem_hitachi diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 9542f7d1..a81b1253 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1336,15 +1336,19 @@ def make_query( if not isinstance(typ, str): typ = typ[0] crit.append(f"?iri rdf:type <{ts.expand_iri(typ)}> .") # type: ignore + print(filters) def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n + + print(k, v) key = f"@{k[1:]}" if k.startswith("_") else k if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) return + print(key, k, v) if re.match(r"^[_a-zA-Z0.9]+\.", key): newkey, restkey = key.split(".", 1) if newkey in expanded: @@ -1369,12 +1373,18 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if regex: - filters.append( - f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." - ) - else: - filters.append(f"FILTER(STR(?{var}) = {value}) .") + if value not in ["", None, '""']: + print(f"value is: {value}") + + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." 
+ ) + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + print("-----------------") + print(filters) + print("----------------------") for k, v in criteria.items(): add_crit(k, v) @@ -1393,6 +1403,9 @@ def add_crit(k, v, regex=False, s="iri"): {where_statements} }} """ + print("====================") + print(query) + print("====================") return query @@ -1411,9 +1424,12 @@ def search( Arguments: ts: Triplestore to search. type: Either a [resource type] (ex: "Dataset", "Distribution", ...) - or the IRI of a class to limit the search to. + or the IRI of a class to limit the search to. Can also be given + as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the - IRIs refer to data properties on the resource match. The IRIs + IRIs refer to data properties on the resource match. If more than + one value is desire for a given criteria, values can be provided + in a list. The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. 
@@ -1442,10 +1458,17 @@ def search( search(ts, criteria={"contactPoint.hasName": "John Doe"}) + List IRIs of all resources with John Doe and Jane Doe as `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) + List IRIs of all samples that are liquids: + search(ts, type=[CHAMEO.Sample, EMMO.Liquid] ) + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: From b62fba73698043da74818e81d5328e4744692618 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Tue, 16 Sep 2025 18:13:07 +0200 Subject: [PATCH 2/7] None in key for criteria --- tests/datadoc/test_dataset.py | 35 +++++++++++ tests/input/semdata.yaml | 6 +- tripper/datadoc/dataset.py | 111 ++++++++++++++++++++++++++++++---- 3 files changed, 139 insertions(+), 13 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 43f89fb9..12720903 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -562,6 +562,41 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2"], } + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: ["Named Lab Assistant"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: "Named Lab Assistant"}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on more criteria with any predicate, testlabel tests that + # indirect search through inSeries works. 
+ assert set( + search( + ts, + criteria={None: ["Named Lab Assistant", "testlabel"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml index 0835f1c7..176cf8f4 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -9,6 +9,7 @@ prefixes: dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage# par: http://sintef.no/dlite/parser# gen: http://sintef.no/dlite/generator# + chameo: https://w3id.org/emmo/domain/characterisation-methodology/chameo# # List of documented datasets @@ -68,7 +69,7 @@ Dataset: contactPoint: hasName: Sigurd Wenner hasEmail: - + label: testlabel distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory @@ -77,6 +78,8 @@ Dataset: "@type": sem:SEMImageSeries title: Nested series of SEM images which is missing a creator description: ... 
+ curator: + - name: Named Lab Assistant distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator @@ -85,6 +88,7 @@ Dataset: + Parser: - "@id": par:sem_hitachi "@type": oteio:Parser diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index a81b1253..b91e2b77 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1336,19 +1336,93 @@ def make_query( if not isinstance(typ, str): typ = typ[0] crit.append(f"?iri rdf:type <{ts.expand_iri(typ)}> .") # type: ignore - print(filters) def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - print(k, v) - key = f"@{k[1:]}" if k.startswith("_") else k + def _to_value_token(x): + # Turn a Python value into a SPARQL term + if x in expanded: + return f"<{expanded[x]}>" + if isinstance(x, str): + return ( + f"<{x}>" + if re.match("^[a-z][a-z0-9.+-]*://", x) + else f'"{x}"' + ) + return x + + key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) + + if key is None: + # any predicate on first hop; keep ?s (= ?iri) as the resource + + n += 1 + pvar = f"p{n}" + bn = f"bn{n}" + n += 1 + qvar = f"q{n}" + var = f"v{n}" + + # ?s ?p ?bn . ?bn ?q ?var . + crit.append(f"?{s} ?{pvar} ?{bn} .") + crit.append(f"?{bn} ?{qvar} ?{var} .") + # Only return non-blank subjects + if s == "iri": + filters.append("FILTER(!isBlank(?iri)) .") + + # Support list of values → VALUES (equality) or a single alternation for regex + if isinstance(v, list): + if regex: + pattern = "(" + "|".join(str(p) for p in v) + ")" + filters.append( + f'FILTER REGEX(STR(?{var}), "{pattern}"{flags_arg}) .' 
+ ) + else: + vals = [] + for ele in v: + if ele in expanded: + vals.append(f"<{expanded[ele]}>") + elif isinstance(ele, str): + vals.append( + f"<{ele}>" + if re.match("^[a-z][a-z0-9.+-]*://", ele) + else f'"{ele}"' + ) + elif ele not in ("", None): + vals.append(ele) + if vals: + crit.append(f"VALUES ?{var} {{ {' '.join(vals)} }}") + else: + # single value + if v in expanded: + value = f"<{expanded[v]}>" + elif isinstance(v, str): + value = ( + f"<{v}>" + if re.match("^[a-z][a-z0-9.+-]*://", v) + else f'"{v}"' + ) + else: + value = v + if value not in ["", None, '""']: + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." + ) + else: + # If it's an IRI token, compare directly; otherwise compare STR() + if isinstance(value, str) and value.startswith("<"): + filters.append(f"FILTER(?{var} = {value}) .") + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + return + if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) return - print(key, k, v) if re.match(r"^[_a-zA-Z0.9]+\.", key): newkey, restkey = key.split(".", 1) if newkey in expanded: @@ -1373,8 +1447,8 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") + if value not in ["", None, '""']: - print(f"value is: {value}") if regex: filters.append( @@ -1382,9 +1456,6 @@ def add_crit(k, v, regex=False, s="iri"): ) else: filters.append(f"FILTER(STR(?{var}) = {value}) .") - print("-----------------") - print(filters) - print("----------------------") for k, v in criteria.items(): add_crit(k, v) @@ -1395,6 +1466,8 @@ def add_crit(k, v, regex=False, s="iri"): for k, v in regex.items(): add_crit(k, v, regex=True) + # Make sure that iris are iris (not blank nodes) + filters.append("FILTER(!isBlank(?iri)) .") where_statements = "\n ".join(crit + filters) query = f""" PREFIX rdf: <{RDF}> @@ -1403,9 +1476,6 @@ def add_crit(k, v, regex=False, s="iri"): {where_statements} }} """ - 
print("====================") - print(query) - print("====================") return query @@ -1428,11 +1498,20 @@ def search( as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the IRIs refer to data properties on the resource match. If more than - one value is desire for a given criteria, values can be provided + one value is desired for a given criterion, values can be provided in a list. The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. + + If the object (value) is given as None or "", all matches + that have any value for the given criterion are returned. + + If predicate (key) is given as None search on all objects irrespective + of predicate is performed. + + Note that more than one value broadens the + search, i.e. it is an OR operation. regex: Like `criteria` but the values in the provided dict are regular expressions used for the matching. flags: Flags passed to regular expressions. @@ -1462,6 +1541,14 @@ def search( search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + List IRIs of all resources that have a `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": None}) + + List IRIs of all resources that have Jane Doe or Blue as object (value): + + search(ts, criteria={None: ["Jane Doe", "Blue"]}) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) From d5cb0737d3b43975801052cbc56cfa2d77846a3a Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Wed, 17 Sep 2025 16:11:43 +0200 Subject: [PATCH 3/7] Only None accepted as wildcard in search. 
--- tests/datadoc/test_dataset.py | 12 ------------ tripper/datadoc/dataset.py | 10 +++++----- 2 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 12720903..9f487fdc 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -538,18 +538,6 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } - # Filter on criterion, but without required value - assert set( - search( - ts, - criteria={"creator.name": ""}, - ) - ) == { - SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], - SEMDATA["SEM_cement_batch2/77600-23-001"], - SEMDATA["SEM_cement_batch2"], - } - # Filter on criterion, but without required value assert set( search( diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index b91e2b77..053147dc 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1406,7 +1406,7 @@ def _to_value_token(x): ) else: value = v - if value not in ["", None, '""']: + if value: if regex: filters.append( f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." @@ -1448,7 +1448,7 @@ def _to_value_token(x): var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if value not in ["", None, '""']: + if value: if regex: filters.append( @@ -1504,10 +1504,10 @@ def search( is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. - If the object (value) is given as None or "", all matches - that have any value for the given criterion are returned. + If the object (value) is given as None, all matches + that have any value for the given predicate are returned. - If predicate (key) is given as None search on all objects irrespective + If predicate (key) is given as None, search on all objects irrespective of predicate is performed. 
Note that more than one value broadens the From 609acb4f40e2b9f802e34a35992565e4d584dea6 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Thu, 18 Sep 2025 09:13:10 +0200 Subject: [PATCH 4/7] Added option for adding critiera as tuples --- tests/datadoc/test_dataset.py | 23 ++++++++++++++++ tripper/datadoc/dataset.py | 52 ++++++++++++++++++++++++++--------- 2 files changed, 62 insertions(+), 13 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 9f487fdc..d49d1c92 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -585,6 +585,29 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001"], } + # Filter on two different criteria in a dict) + assert set( + search( + ts, + criteria={"creator.name": "Sigurd Wenner", "label": "testlabel"}, + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + # Filter on two different criteria in a list of tuples + assert set( + search( + ts, + criteria=[ + ("creator.name", "Sigurd Wenner"), + ("label", "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 053147dc..8dd4e09a 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -31,12 +31,14 @@ from __future__ import annotations -# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel -# pylint: disable=too-many-branches import json import logging import re import warnings + +# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel +# pylint: disable=too-many-branches +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING @@ -1272,7 +1274,7 @@ def make_query( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[dict, list[tuple]]]" = None, # 
new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1297,6 +1299,14 @@ def make_query( if criteria is None: criteria = criterias + if isinstance(criteria, list): + criteria.sort(key=lambda x: x[0]) + res = { + key: [value for key, value in group] + for key, group in groupby(criteria, key=lambda x: x[0]) + } + criteria = res + keywords = get_keywords(keywords=keywords) context = get_context(keywords=keywords) context._create_caches() # pylint: disable=protected-access @@ -1317,7 +1327,7 @@ def make_query( cid = criteria.pop("@id", criteria.pop("_id", None)) rid = regex.pop("@id", regex.pop("_id", None)) if cid: - filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') + filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') # type: ignore elif rid: filters.append( f'FILTER REGEX(STR(?iri), "{ts.expand_iri(rid)}"{flags_arg}) .' @@ -1483,7 +1493,7 @@ def search( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[list[tuple], dict]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1499,8 +1509,10 @@ def search( criteria: Exact match criteria. A dict of IRI, value pairs, where the IRIs refer to data properties on the resource match. If more than one value is desired for a given criterion, values can be provided - in a list. The IRIs - may use any prefix defined in `ts`. E.g. if the prefix `dcterms` + in a list. It can also be given as a list of (key, value) tuples. + A combination of tuples and dict is not supported. + + The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. 
@@ -1510,13 +1522,16 @@ def search( If predicate (key) is given as None, search on all objects irrespective of predicate is performed. - Note that more than one value broadens the + Note that more than one value for a given key broadens the search, i.e. it is an OR operation. - regex: Like `criteria` but the values in the provided dict are regular - expressions used for the matching. - flags: Flags passed to regular expressions. - - `s`: Dot-all mode. The . matches any character. The default - doesn't match newline or carriage return. + + The different key-value pairs in the dict are combined with AND. + + regex: Like `criteria` but the values in the provided dict are regular + expressions used for the matching. + flags: Flags passed to regular expressions. + - `s`: Dot-all mode. The . matches any character. The default + doesn't match newline or carriage return. - `m`: Multi-line mode. The ^ and $ characters matches beginning or end of line instead of beginning or end of string. - `i`: Case-insensitive mode. 
@@ -1549,6 +1564,15 @@ def search( search(ts, criteria={None: ["Jane Doe", "Blue"]}) + Search with criteria given as list of tuples: + search( + ts, + criteria=[ + ("contactPoint.hasName", "John Doe"), + ("fromSample", SAMPLE.batch2/sample3), + ], + ) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) @@ -1556,6 +1580,8 @@ def search( List IRIs of all samples that are liquids: search(ts, type=[CHAMEO.Sample, EMMO.Liquid] ) + + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: From 3526b231245bfd6a3522e00e307860309b0ed224 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Thu, 18 Sep 2025 15:55:38 +0200 Subject: [PATCH 5/7] Removed helper function which is not used --- tripper/datadoc/dataset.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 8dd4e09a..72cb3f5d 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1351,18 +1351,6 @@ def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - def _to_value_token(x): - # Turn a Python value into a SPARQL term - if x in expanded: - return f"<{expanded[x]}>" - if isinstance(x, str): - return ( - f"<{x}>" - if re.match("^[a-z][a-z0-9.+-]*://", x) - else f'"{x}"' - ) - return x - key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) if key is None: From 3a1813caa47d4f4a7a12c0fdf418d38834de6d46 Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Fri, 26 Sep 2025 16:05:52 +0200 Subject: [PATCH 6/7] Corrected sorting of tuples with None --- tests/datadoc/test_dataset.py | 14 ++++++++++++++ tripper/datadoc/dataset.py | 3 ++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index d49d1c92..0daf73f7 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -608,6 +608,20 @@ def test_datadoc(): 
SEMDATA["SEM_cement_batch2"], } + assert set( + search( + ts, + criteria=[ + (None, "Sigurd Wenner"), + (None, "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 72cb3f5d..b86cc848 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -1300,7 +1300,8 @@ def make_query( criteria = criterias if isinstance(criteria, list): - criteria.sort(key=lambda x: x[0]) + criteria = sorted(criteria, key=lambda x: (x[0] is None, x[0])) + res = { key: [value for key, value in group] for key, group in groupby(criteria, key=lambda x: x[0]) From a985c815d7404d25d7354ad4b55d8a632f70c8ee Mon Sep 17 00:00:00 2001 From: "Francesca.L.Bleken@sintef.no" Date: Fri, 26 Sep 2025 19:49:16 +0200 Subject: [PATCH 7/7] Added test that shows errors in dataset --- tests/datadoc/test_dataset.py | 120 +++++++++++++++++++++++++++------- 1 file changed, 96 insertions(+), 24 deletions(-) diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index 0daf73f7..dd02e8b5 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -8,6 +8,9 @@ pytest.importorskip("requests") pytest.importorskip("pyld") +GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" +FUSEKI_CHECK_URL = "http://localhost:3030" + def test__get_range(): """Test _get_default_keywords().""" @@ -402,20 +405,20 @@ def test_update_classes(): } in r3["subClassOf"] -def test_datadoc(): +def datasettest(name): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements from dataset_paths import indir # pylint: disable=import-error - from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO, Triplestore + from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO from tripper.datadoc import 
acquire, save_datadoc, search, store from tripper.datadoc.errors import NoSuchTypeError pytest.importorskip("dlite") pytest.importorskip("rdflib") - ts = Triplestore("rdflib") + ts = get_triplestore(name) # Load data documentation into triplestore datadoc = save_datadoc(ts, indir / "semdata.yaml") @@ -427,6 +430,8 @@ def test_datadoc(): SEMDATA = ts.namespaces["semdata"] iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"] d = acquire(ts, iri, use_sparql=False) + print("----") + print(d) assert d["@id"] == iri assert set(d["@type"]) == { DCAT.Dataset, @@ -622,6 +627,31 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + assert set( + search( + ts, + criteria=[ + (None, "http://onto-ns.com/meta/matchmaker/0.2/SEMImage"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + ( + "https://w3id.org/emmo/domain/oteio#hasDatamodel", + "http://onto-ns.com/meta/matchmaker/0.2/SEMImage", + ), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -746,27 +776,6 @@ def test_pipeline(): pipeline.get() -def test_fuseki(): - """Test save and load dataset with Fuseki.""" - import os - - from tripper import Triplestore - - host = os.getenv("TRIPLESTORE_HOST", "localhost") - port = os.getenv("TRIPLESTORE_PORT", "3030") - fuseki_args = { - "backend": "fusekix", - "base_iri": "http://example.com/ontology#", - "triplestore_url": f"http://{host}:{port}", - "database": "openmodel", - } - try: - ts = Triplestore(**fuseki_args) - except ModuleNotFoundError: - pytest.skip("Cannot connect to Fuseki server") - ts.remove_database(**fuseki_args) - - def test_deprecated(): """Test deprecated save_dict(), load_dict() and search_iris().""" from tripper import Triplestore @@ -799,3 +808,66 @@ def test_deprecated(): with 
pytest.warns(DeprecationWarning): iris = search_iris(ts, criterias={"creator.name": "John Doe"}) assert iris == [EX.exdata] + + +def get_triplestore(tsname: str) -> "Triplestore": + """Help function that returns a new triplestore object.""" + from tripper import Triplestore + + if tsname == "GraphDB": + ts = Triplestore( + backend="sparqlwrapper", + base_iri="http://localhost:7200/repositories/test_repo", + update_iri=( + "http://localhost:7200/repositories/test_repo/statements" + ), + ) + elif tsname == "Fuseki": + ts = Triplestore( + backend="sparqlwrapper", + base_iri=f"{FUSEKI_CHECK_URL}/test_repo", + update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", + username="admin", + password="admin0", + ) + elif tsname == "rdflib": + ts = Triplestore("rdflib") + else: + raise ValueError(f"Unsupported triplestore name: {tsname}") + + return ts + + +def test_graphdb_datadoc(): + """ + Test the dataset module using GraphDB. + """ + # Check if GraphDB is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): + pytest.skip("GraphDB instance not available locally; skipping tests.") + + print("Testing graphdb") + datasettest("GraphDB") + + +def test_fuseki_datadoc(): + """ + Test the dataset module using Fuseki. + """ + # Check if Fuseki is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): + pytest.skip("Fuseki instance not available locally; skipping tests.") + + print("Testing fuseki") + datasettest("Fuseki") + + +def test_rdflib_datadoc(): + """ + Test the dataset module using rdflib. + """ + datasettest("rdflib")