diff --git a/tests/datadoc/test_tabledoc.py b/tests/datadoc/test_tabledoc.py
index 8a494ff7..e2f9c590 100644
--- a/tests/datadoc/test_tabledoc.py
+++ b/tests/datadoc/test_tabledoc.py
@@ -4,13 +4,13 @@
 
 pytest.importorskip("pyld")
 
 
 def test_asdicts():
     """Test the asdicts() method."""
 
     pytest.importorskip("rdflib")
 
-    from tripper import IANA, Triplestore
+    from tripper import IANA, OWL, RDF, RDFS, XSD, Literal, Triplestore
     from tripper.datadoc import TableDoc
 
     td = TableDoc(
@@ -87,6 +87,34 @@ def test_asdicts():
     td.save(ts)
     print(ts.serialize())
 
+    # Test optional arguments `ts` and `strict`
+    DS = ts.namespaces["ds"]
+    ONTO = ts.namespaces["onto"]
+    ts.add_triples(
+        [
+            (ONTO.rel, RDF.type, OWL.ObjectProperty),
+            (ONTO.rel, RDFS.domain, ONTO.T1),
+            (ONTO.rel, RDFS.range, ONTO.T2),
+            (ONTO.val, RDF.type, OWL.DatatypeProperty),
+            (ONTO.val, RDFS.domain, ONTO.T1),
+            (ONTO.val, RDFS.range, XSD.double),
+        ]
+    )
+    td = TableDoc(
+        header=[
+            "@id",
+            "@type",
+            "title",
+            "onto:rel",
+            "onto:val",
+        ],
+        data=[
+            ("ds:a", "onto:A", "a", "ds:d1", "1.1"),
+            ("ds:b", "onto:B", "b", DS.d2, "2.2"),
+        ],
+    )
+    td.save(ts)
+
 
 def test_fromdicts():
     """Test the fromdicts() method."""
@@ -268,6 +296,14 @@ def test_csvsniff():
     assert dialect.lineterminator == "\n"
     assert dialect.quotechar == "'"
 
+    lines = [
+        "1;1.1;2.2",
+        "3.3;4.4;5.5",
+    ]
+    dialect = csvsniff("\r\n".join(lines))
+    assert dialect.delimiter == ";"
+    assert dialect.lineterminator == "\r\n"
+
 
 def test_csv_keywords():
     """Test load CSV with custom keywords file."""
diff --git a/tripper/datadoc/clitool.py b/tripper/datadoc/clitool.py
index a0e671f4..32a1a307 100644
--- a/tripper/datadoc/clitool.py
+++ b/tripper/datadoc/clitool.py
@@ -42,7 +42,7 @@ def subcommand_add(ts, args):
             redefine=args.redefine,
             **kw,
         )
-        td.save(ts)
+        td.save(ts, strict=args.strict)
     else:
         raise ValueError(f"Unknown input format: {fmt}")
 
@@ -210,6 +210,15 @@ def maincommand(argv=None):
         choices=["raise", "allow", "skip"],
         help="How to handle redifinition of existing keywords.",
     )
+    parser_add.add_argument(
+        "--strict",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help=(
+            "Whether to raise an error on cells that cannot be converted "
+            "to the expected datatype (otherwise they are skipped)."
+        ),
+    )
 
     # Subcommand: delete
     parser_delete = subparsers.add_parser(
diff --git a/tripper/datadoc/errors.py b/tripper/datadoc/errors.py
index e14344df..e3f0eb7b 100644
--- a/tripper/datadoc/errors.py
+++ b/tripper/datadoc/errors.py
@@ -68,3 +68,7 @@ class RedefineKeywordWarning(TripperWarning):
 class SkipRedefineKeywordWarning(TripperWarning):
     """Skip redefining an existing keyword in a user-defined keyword
     definition (by mapping it to a new IRI)."""
+
+
+class TypeConversionWarning(TripperWarning):
+    """Cannot convert to the given type."""
diff --git a/tripper/datadoc/keywords.py b/tripper/datadoc/keywords.py
index f1bf95b1..0c642fca 100644
--- a/tripper/datadoc/keywords.py
+++ b/tripper/datadoc/keywords.py
@@ -1148,6 +1148,8 @@ def _load_rdf(
             for ref in ("domain", "range"):
                 if ref in d:
                     for domain in asseq(d[ref]):
+                        if isinstance(domain, dict):  # skip blank nodes
+                            continue
                         expanded = expand_iri(domain, prefixes)
                         if expanded.startswith(str(XSD)):
                             continue
diff --git a/tripper/datadoc/tabledoc.py b/tripper/datadoc/tabledoc.py
index 79b284b6..ea488398 100644
--- a/tripper/datadoc/tabledoc.py
+++ b/tripper/datadoc/tabledoc.py
@@ -4,16 +4,18 @@
 
 import csv
 import re
+import warnings
 from pathlib import Path
 from typing import TYPE_CHECKING
 
 from tripper import Triplestore
 from tripper.datadoc.context import get_context
-from tripper.datadoc.dataset import store, told
+from tripper.datadoc.dataset import acquire, store, told
+from tripper.datadoc.errors import TypeConversionWarning
 from tripper.datadoc.keywords import get_keywords
 from tripper.datadoc.utils import addnested, stripnested
 from tripper.literal import Literal
-from tripper.utils import AttrDict, openfile
+from tripper.utils import AttrDict, is_curie, is_uri, openfile
 
 if TYPE_CHECKING:  # pragma: no cover
     from typing import Iterable, List, Optional, Protocol, Sequence, Union
@@ -96,8 +98,12 @@ def __init__(
         )
         self.strip = strip
 
-    def save(self, ts: Triplestore) -> None:
-        """Save tabular datadocumentation to triplestore."""
+    def save(self, ts: Triplestore, strict: bool = True) -> None:
+        """Save tabular datadocumentation to triplestore.
+
+        If `strict` is false, do not raise an exception if a cell
+        value cannot be converted to the expected datatype.
+        """
         self.context.add_context(
             {prefix: str(ns) for prefix, ns in ts.namespaces.items()}
         )
@@ -105,24 +111,63 @@ def save(self, ts: Triplestore) -> None:
         for prefix, ns in self.context.get_prefixes().items():
             ts.bind(prefix, ns)
 
-        store(ts, self.asdicts(), type=self.type, context=self.context)
+        store(
+            ts,
+            self.asdicts(ts=ts, strict=strict),
+            type=self.type,
+            context=self.context,
+        )
+
+    def asdicts(
+        self, ts: "Optional[Triplestore]" = None, strict: bool = True
+    ) -> "List[dict]":
+        """Return the table as a list of dicts.
 
-    def asdicts(self) -> "List[dict]":
-        """Return the table as a list of dicts."""
+        Arguments:
+            ts: Optional triplestore to look up header names in if they
+                are not found in the json-ld context.
+            strict: Whether to raise an error if a cell contains data that
+                cannot be converted to the expected datatype.  If false,
+                a `TypeConversionWarning` is issued and the cell skipped.
+
+        """
+        # pylint: disable=too-many-nested-blocks
         results = []
         for row in self.data:
             d = AttrDict()
             for i, colname in enumerate(self.header):
                 cell = row[i].strip() if row[i] and self.strip else row[i]
                 if cell:
 
-                    # Convert cell value to correct Python type
                     if not colname.startswith("@"):
-                        leafname = colname.split(".")[-1]
-                        df = self.context.getdef(leafname.split("[")[0])
-                        if "@type" in df and df["@type"] != "@id":
-                            cell = Literal(cell, datatype=df["@type"]).value
-
+                        leafname = colname.split(".")[-1].split("[")[0]
+                        dt = None
+                        if leafname in self.context:
+                            df = self.context.getdef(leafname)
+                            t = df.get("@type")
+                            dt = t if t != "@id" else None
+                        elif is_curie(leafname) or is_uri(leafname):
+                            if ts:
+                                df = acquire(ts, leafname)
+                                dt = df.get("range")
+                            # TODO: if acquire() fails, check if `leafname`
+                            # is a resolvable URI that returns a turtle file.
+                            # If so, extract the definition of `leafname` from
+                            # this file.
+                        # Convert cell value to the Python type of datatype `dt`
+                        if dt:
+                            try:
+                                cell = Literal(cell, datatype=dt).value
+                            except ValueError:
+                                if strict:
+                                    raise
+                                warnings.warn(
+                                    f"Skipping '{cell}' in column '{colname}' "
+                                    "since it cannot be converted to datatype "
+                                    f"'{dt}'",
+                                    TypeConversionWarning,
+                                )
+                                continue
                     addnested(
                         d, colname.strip() if self.strip else colname, cell
                     )