diff --git a/tests/datadoc/test_dataset.py b/tests/datadoc/test_dataset.py index d1bb2294..dd02e8b5 100644 --- a/tests/datadoc/test_dataset.py +++ b/tests/datadoc/test_dataset.py @@ -8,6 +8,9 @@ pytest.importorskip("requests") pytest.importorskip("pyld") +GRAPHDB_CHECK_URL = "http://localhost:7200/repositories" +FUSEKI_CHECK_URL = "http://localhost:3030" + def test__get_range(): """Test _get_default_keywords().""" @@ -402,20 +405,20 @@ def test_update_classes(): } in r3["subClassOf"] -def test_datadoc(): +def datasettest(name): """Test save_datadoc() and acquire()/store().""" # pylint: disable=too-many-statements from dataset_paths import indir # pylint: disable=import-error - from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO, Triplestore + from tripper import CHAMEO, DCAT, DCTERMS, EMMO, OTEIO from tripper.datadoc import acquire, save_datadoc, search, store from tripper.datadoc.errors import NoSuchTypeError pytest.importorskip("dlite") pytest.importorskip("rdflib") - ts = Triplestore("rdflib") + ts = get_triplestore(name) # Load data documentation into triplestore datadoc = save_datadoc(ts, indir / "semdata.yaml") @@ -427,6 +430,8 @@ def test_datadoc(): SEMDATA = ts.namespaces["semdata"] iri = SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"] d = acquire(ts, iri, use_sparql=False) + print("----") + print(d) assert d["@id"] == iri assert set(d["@type"]) == { DCAT.Dataset, @@ -538,6 +543,115 @@ def test_datadoc(): SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], } + # Filter on criterion, but without required value + assert set( + search( + ts, + criteria={"creator.name": None}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: ["Named Lab Assistant"]}, + ) + ) == { + 
SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on criterion, but with any predicate + assert set( + search( + ts, + criteria={None: "Named Lab Assistant"}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + } + + # Filter on more criteria with any predicate, testlabel tests that + # indirect search through inSeries works. + assert set( + search( + ts, + criteria={None: ["Named Lab Assistant", "testlabel"]}, + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + SEMDATA["SEM_cement_missingcreator"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + } + + # Filter on two different criteria in a dict) + assert set( + search( + ts, + criteria={"creator.name": "Sigurd Wenner", "label": "testlabel"}, + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + # Filter on two different criteria in a list of tuples + assert set( + search( + ts, + criteria=[ + ("creator.name", "Sigurd Wenner"), + ("label", "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + } + + assert set( + search( + ts, + criteria=[ + (None, "Sigurd Wenner"), + (None, "testlabel"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2"], + SEMDATA["SEM_cement_batch2/77600-23-001"], + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + (None, "http://onto-ns.com/meta/matchmaker/0.2/SEMImage"), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + + assert set( + search( + ts, + criteria=[ + ( + "https://w3id.org/emmo/domain/oteio#hasDatamodel", + "http://onto-ns.com/meta/matchmaker/0.2/SEMImage", + ), + ], + ) + ) == { + SEMDATA["SEM_cement_batch2/77600-23-001/77600-23-001_5kV_400x_m001"], + } + with pytest.raises(NoSuchTypeError): search(ts, type="invalid-type") @@ -545,10 +659,12 @@ def test_datadoc(): assert 
set(search(ts, regex={"dcterms:title": "SEM images"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } assert set(search(ts, regex={"dcterms:title": "SEM i[^ ]*s"})) == { SEMDATA.SEM_cement_batch2, SAMPLE["SEM_cement_batch2/77600-23-001"], + SEMDATA.SEM_cement_missingcreator, } # Get individual with given IRI @@ -627,7 +743,7 @@ def test_validate(): def test_pipeline(): """Test creating OTEAPI pipeline.""" - pytest.skip() + # pytest.skip() from tripper import Triplestore @@ -660,27 +776,6 @@ def test_pipeline(): pipeline.get() -def test_fuseki(): - """Test save and load dataset with Fuseki.""" - import os - - from tripper import Triplestore - - host = os.getenv("TRIPLESTORE_HOST", "localhost") - port = os.getenv("TRIPLESTORE_PORT", "3030") - fuseki_args = { - "backend": "fusekix", - "base_iri": "http://example.com/ontology#", - "triplestore_url": f"http://{host}:{port}", - "database": "openmodel", - } - try: - ts = Triplestore(**fuseki_args) - except ModuleNotFoundError: - pytest.skip("Cannot connect to Fuseki server") - ts.remove_database(**fuseki_args) - - def test_deprecated(): """Test deprecated save_dict(), load_dict() and search_iris().""" from tripper import Triplestore @@ -713,3 +808,66 @@ def test_deprecated(): with pytest.warns(DeprecationWarning): iris = search_iris(ts, criterias={"creator.name": "John Doe"}) assert iris == [EX.exdata] + + +def get_triplestore(tsname: str) -> "Triplestore": + """Help function that returns a new triplestore object.""" + from tripper import Triplestore + + if tsname == "GraphDB": + ts = Triplestore( + backend="sparqlwrapper", + base_iri="http://localhost:7200/repositories/test_repo", + update_iri=( + "http://localhost:7200/repositories/test_repo/statements" + ), + ) + elif tsname == "Fuseki": + ts = Triplestore( + backend="sparqlwrapper", + base_iri=f"{FUSEKI_CHECK_URL}/test_repo", + update_iri=f"{FUSEKI_CHECK_URL}/test_repo/update", + username="admin", + 
password="admin0", + ) + elif tsname == "rdflib": + ts = Triplestore("rdflib") + else: + raise ValueError(f"Unsupported triplestore name: {tsname}") + + return ts + + +def test_graphdb_datadoc(): + """ + Test the dataset module using GraphDB. + """ + # Check if GraphDB is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(GRAPHDB_CHECK_URL, timeout=1): + pytest.skip("GraphDB instance not available locally; skipping tests.") + + print("Testing graphdb") + datasettest("GraphDB") + + +def test_fuseki_datadoc(): + """ + Test the dataset module using Fuseki. + """ + # Check if Fuseki is available and write a warning if it is not. + from tripper.utils import check_service_availability + + if not check_service_availability(FUSEKI_CHECK_URL, timeout=1): + pytest.skip("Fuseki instance not available locally; skipping tests.") + + print("Testing fuseki") + datasettest("Fuseki") + + +def test_rdflib_datadoc(): + """ + Test the dataset module using rdflib. + """ + datasettest("rdflib") diff --git a/tests/input/semdata.yaml b/tests/input/semdata.yaml index ce66d8d1..176cf8f4 100644 --- a/tests/input/semdata.yaml +++ b/tests/input/semdata.yaml @@ -9,6 +9,7 @@ prefixes: dm: http://onto-ns.com/meta/characterisation/0.1/SEMImage# par: http://sintef.no/dlite/parser# gen: http://sintef.no/dlite/generator# + chameo: https://w3id.org/emmo/domain/characterisation-methodology/chameo# # List of documented datasets @@ -68,11 +69,25 @@ Dataset: contactPoint: hasName: Sigurd Wenner hasEmail: - + label: testlabel distribution: downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch2 mediaType: inode/directory + - "@id": semdata:SEM_cement_missingcreator + "@type": sem:SEMImageSeries + title: Nested series of SEM images which is missing a creator + description: ... 
+ curator: + - name: Named Lab Assistant + + distribution: + downloadURL: sftp://nas.aimen.es/P_MATCHMAKER_SHARE_SINTEF/SEM_cement_batch_missingcreator + mediaType: inode/directory + + + + Parser: - "@id": par:sem_hitachi diff --git a/tripper/datadoc/dataset.py b/tripper/datadoc/dataset.py index 9542f7d1..b86cc848 100644 --- a/tripper/datadoc/dataset.py +++ b/tripper/datadoc/dataset.py @@ -31,12 +31,14 @@ from __future__ import annotations -# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel -# pylint: disable=too-many-branches import json import logging import re import warnings + +# pylint: disable=invalid-name,redefined-builtin,import-outside-toplevel +# pylint: disable=too-many-branches +from itertools import groupby from pathlib import Path from typing import TYPE_CHECKING @@ -1272,7 +1274,7 @@ def make_query( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[dict, list[tuple]]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1297,6 +1299,15 @@ def make_query( if criteria is None: criteria = criterias + if isinstance(criteria, list): + criteria = sorted(criteria, key=lambda x: (x[0] is None, x[0])) + + res = { + key: [value for key, value in group] + for key, group in groupby(criteria, key=lambda x: x[0]) + } + criteria = res + keywords = get_keywords(keywords=keywords) context = get_context(keywords=keywords) context._create_caches() # pylint: disable=protected-access @@ -1317,7 +1328,7 @@ def make_query( cid = criteria.pop("@id", criteria.pop("_id", None)) rid = regex.pop("@id", regex.pop("_id", None)) if cid: - filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') + filters.append(f'FILTER(STR(?iri) = "{ts.expand_iri(cid)}") .') # type: ignore elif rid: filters.append( f'FILTER REGEX(STR(?iri), "{ts.expand_iri(rid)}"{flags_arg}) 
.' @@ -1340,7 +1351,73 @@ def make_query( def add_crit(k, v, regex=False, s="iri"): """Add criteria to SPARQL query.""" nonlocal n - key = f"@{k[1:]}" if k.startswith("_") else k + + key = None if k is None else (f"@{k[1:]}" if k.startswith("_") else k) + + if key is None: + # any predicate on first hop; keep ?s (= ?iri) as the resource + + n += 1 + pvar = f"p{n}" + bn = f"bn{n}" + n += 1 + qvar = f"q{n}" + var = f"v{n}" + + # ?s ?p ?bn . ?bn ?q ?var . + crit.append(f"?{s} ?{pvar} ?{bn} .") + crit.append(f"?{bn} ?{qvar} ?{var} .") + # Only return non-blank subjects + if s == "iri": + filters.append("FILTER(!isBlank(?iri)) .") + + # Support list of values → VALUES (equality) or a single alternation for regex + if isinstance(v, list): + if regex: + pattern = "(" + "|".join(str(p) for p in v) + ")" + filters.append( + f'FILTER REGEX(STR(?{var}), "{pattern}"{flags_arg}) .' + ) + else: + vals = [] + for ele in v: + if ele in expanded: + vals.append(f"<{expanded[ele]}>") + elif isinstance(ele, str): + vals.append( + f"<{ele}>" + if re.match("^[a-z][a-z0-9.+-]*://", ele) + else f'"{ele}"' + ) + elif ele not in ("", None): + vals.append(ele) + if vals: + crit.append(f"VALUES ?{var} {{ {' '.join(vals)} }}") + else: + # single value + if v in expanded: + value = f"<{expanded[v]}>" + elif isinstance(v, str): + value = ( + f"<{v}>" + if re.match("^[a-z][a-z0-9.+-]*://", v) + else f'"{v}"' + ) + else: + value = v + if value: + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." 
+ ) + else: + # If it's an IRI token, compare directly; otherwise compare STR() + if isinstance(value, str) and value.startswith("<"): + filters.append(f"FILTER(?{var} = {value}) .") + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") + return + if isinstance(v, list): for ele in v: add_crit(key, ele, regex=regex, s=s) @@ -1369,12 +1446,15 @@ def add_crit(k, v, regex=False, s="iri"): n += 1 var = f"v{n}" crit.append(f"?{s} <{ts.expand_iri(key)}> ?{var} .") - if regex: - filters.append( - f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." - ) - else: - filters.append(f"FILTER(STR(?{var}) = {value}) .") + + if value: + + if regex: + filters.append( + f"FILTER REGEX(STR(?{var}), {value}{flags_arg}) ." + ) + else: + filters.append(f"FILTER(STR(?{var}) = {value}) .") for k, v in criteria.items(): add_crit(k, v) @@ -1385,6 +1465,8 @@ def add_crit(k, v, regex=False, s="iri"): for k, v in regex.items(): add_crit(k, v, regex=True) + # Make sure that iris are iris (not blank nodes) + filters.append("FILTER(!isBlank(?iri)) .") where_statements = "\n ".join(crit + filters) query = f""" PREFIX rdf: <{RDF}> @@ -1400,7 +1482,7 @@ def search( ts: Triplestore, type=None, criterias: "Optional[dict]" = None, # deprecated - criteria: "Optional[dict]" = None, # new preferred name + criteria: "Optional[Union[list[tuple], dict]]" = None, # new preferred name regex: "Optional[dict]" = None, flags: "Optional[str]" = None, keywords: "Optional[Keywords]" = None, @@ -1411,17 +1493,34 @@ def search( Arguments: ts: Triplestore to search. type: Either a [resource type] (ex: "Dataset", "Distribution", ...) - or the IRI of a class to limit the search to. + or the IRI of a class to limit the search to. Can also be given + as a list of resource types or IRIs. criteria: Exact match criteria. A dict of IRI, value pairs, where the - IRIs refer to data properties on the resource match. The IRIs - may use any prefix defined in `ts`. E.g. 
if the prefix `dcterms` + IRIs refer to data properties on the resource match. If more than + one value is desired for a given criterion, values can be provided + in a list. It can also be given as a list of (key, value) tuples. + A combination of tuples and dict is not supported. + + The IRIs may use any prefix defined in `ts`. E.g. if the prefix `dcterms` is in `ts`, it is expanded and the match criteria `dcterms:title` is correctly parsed. - regex: Like `criteria` but the values in the provided dict are regular - expressions used for the matching. - flags: Flags passed to regular expressions. - - `s`: Dot-all mode. The . matches any character. The default - doesn't match newline or carriage return. + + If the object (value) is given as None, all matches + that have any value for the given predicate are returned. + + If predicate (key) is given as None, search on all objects irrespective + of predicate is performed. + + Note that more than one value for a given key broadens the + search, i.e. it is an OR operation. + + The different key-value pairs in the dict are combined with AND. + + regex: Like `criteria` but the values in the provided dict are regular + expressions used for the matching. + flags: Flags passed to regular expressions. + - `s`: Dot-all mode. The . matches any character. The default + doesn't match newline or carriage return. - `m`: Multi-line mode. The ^ and $ characters matches beginning or end of line instead of beginning or end of string. - `i`: Case-insensitive mode. 
@@ -1442,10 +1541,36 @@ def search( search(ts, criteria={"contactPoint.hasName": "John Doe"}) + List IRIs of all resources with John Doe and Jane Doe as `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": ["John Doe", "Jane Doe"]}) + + List IRIs of all resources that have a `contactPoint`: + + search(ts, criteria={"contactPoint.hasName": None}) + + List IRIs of all resources that have Jane Doe or Blue as object (value): + + search(ts, criteria={None: ["Jane Doe", "Blue"]}) + + Search with criteria given as list of tuples: + search( + ts, + criteria=[ + ("contactPoint.hasName", "John Doe"), + ("fromSample", SAMPLE.batch2/sample3), + ], + ) + List IRIs of all samples: search(ts, type=CHAMEO.Sample) + List IRIs of all samples that are liquids: + search(ts, type=[CHAMEO.Sample, EMMO.Liquid]) + + + List IRIs of all datasets with John Doe as `contactPoint` AND are measured on a given sample: