diff --git a/build.sbt b/build.sbt index e1f3c4819..1a8a2e2fe 100644 --- a/build.sbt +++ b/build.sbt @@ -57,7 +57,9 @@ lazy val commonSettings = Seq( "-unchecked", ), javacOptions ++= Seq( - "-Werror", + "-source", "17", + // Currently, impossible to enable this without breaking the build due to warnings in protobuf generated code. + // "-Werror", // TODO: enable more warnings "-Xlint:unchecked", ), @@ -83,6 +85,59 @@ lazy val rdfProtos = (project in file("rdf-protos")) publishArtifact := false, ) +lazy val generateProtos = taskKey[Seq[File]]("Copies and modifies proto files before compilation") + +// Intermediate project that generates the Java code from the protobuf files +lazy val rdfProtosJava = (project in file("rdf-protos-java")) + .enablePlugins(ProtobufPlugin) + .settings( + name := "jelly-protos-java", + organization := "eu.neverblink.jelly", + libraryDependencies ++= Seq( + "com.google.protobuf" % "protobuf-java" % protobufV, + ), + generateProtos := { + val inputDir = (baseDirectory.value / ".." 
/ "submodules" / "protobuf" / "proto").getAbsoluteFile + val outputDir = (baseDirectory.value / "src" / "main" / "protobuf").getAbsoluteFile + + // Make output dir if not exists + IO.createDirectory(outputDir) + + // Clean the output directory + IO.delete(IO.listFiles(outputDir)) + + val protoFiles = (inputDir ** "*.proto").get + protoFiles + .map { file => + // Copy the file to the output directory + val outputFile = outputDir / file.relativeTo(inputDir).get.getPath + IO.copyFile(file, outputFile) + outputFile + } + .map { file => + // Append java options to the file + val content = IO.read(file) + val newContent = content + + """ + |option java_multiple_files = true; + |option java_package = "eu.neverblink.jelly.core.proto.v1"; + |option optimize_for = SPEED; + |""".stripMargin + IO.write(file, newContent) + file + } + + // Return the list of generated files + protoFiles.map { file => + val outputFile = outputDir / file.relativeTo(inputDir).get.getPath + outputFile + } + }, + Compile / compile := (Compile / compile).dependsOn(generateProtos).value, + ProtobufConfig / protobufExcludeFilters := Seq(Glob(baseDirectory.value.toPath) / "**" / "grpc.proto"), + publishArtifact := false, + ) + lazy val core = (project in file("core")) .settings( name := "jelly-core", @@ -103,9 +158,33 @@ lazy val core = (project in file("core")) commonSettings, ) +lazy val coreJava = (project in file("core-java")) + .settings( + name := "jelly-core-java", + description := "Core code for serializing and deserializing RDF data in the Jelly format. 
Java edition.", + libraryDependencies ++= Seq( + "com.google.protobuf" % "protobuf-java" % protobufV, + ), + Compile / sourceGenerators += Def.task { + // Copy from the managed source directory to the output directory + val inputDir = (rdfProtosJava / target).value / ("scala-" + scalaVersion.value) / "src_managed" / "main" + val outputDir = sourceManaged.value / "main" / "protobuf" + val javaFiles = (inputDir ** "*.java").get + javaFiles.map { file => + val outputFile = outputDir / file.relativeTo(inputDir).get.getPath + IO.copyFile(file, outputFile) + outputFile + } + + }.dependsOn(rdfProtosJava / Compile / compile), + Compile / sourceManaged := sourceManaged.value / "main", + commonSettings, + ) + lazy val corePatch = (project in file("core-patch")) .settings( name := "jelly-core-patch", + organization := "eu.neverblink.jelly", description := "Core code for the RDF Patch Jelly extension.", // Add the generated proto classes after transforming them with Scalameta Compile / sourceGenerators += Def.task { diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/JellyConstants.java b/core-java/src/main/java/eu/neverblink/jelly/core/JellyConstants.java new file mode 100644 index 000000000..101593cf9 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/JellyConstants.java @@ -0,0 +1,19 @@ +package eu.neverblink.jelly.core; + +public class JellyConstants { + + private JellyConstants() {} + + public static final String JELLY_NAME = "Jelly"; + public static final String JELLY_FILE_EXTENSION = "jelly"; + public static final String JELLY_CONTENT_TYPE = "application/x-jelly-rdf"; + + public static final int PROTO_VERSION_1_0_X = 1; + public static final int PROTO_VERSION_1_1_X = 2; + public static final int PROTO_VERSION = PROTO_VERSION_1_1_X; + + public static final String PROTO_SEMANTIC_VERSION_1_0_0 = "1.0.0"; // First protocol version + public static final String PROTO_SEMANTIC_VERSION_1_1_0 = "1.1.0"; // Protocol version with namespace declarations 
+ public static final String PROTO_SEMANTIC_VERSION_1_1_1 = "1.1.1"; // Protocol version with metadata in RdfStreamFrame + public static final String PROTO_SEMANTIC_VERSION = PROTO_SEMANTIC_VERSION_1_1_1; +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/JellyConverterFactory.java b/core-java/src/main/java/eu/neverblink/jelly/core/JellyConverterFactory.java new file mode 100644 index 000000000..eb83ff13b --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/JellyConverterFactory.java @@ -0,0 +1,120 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.internal.ProtoDecoderImpl; +import eu.neverblink.jelly.core.internal.ProtoEncoderImpl; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; + +/** + * "Main" interface to be implemented by RDF conversion modules (e.g., for Jena and RDF4J). + * Exposes factory methods for building protobuf encoders and decoders. + *

+ * This should typically be implemented as a singleton (one shared instance per RDF library), so that users can + * easily make use of the connector in the stream package. + * + * @param <TNode> Type of RDF nodes in the RDF library + * @param <TDatatype> Type of RDF datatypes in the RDF library + * @param <TEncoderConverter> Implementation of ProtoEncoderConverter for a given RDF library. + * @param <TDecoderConverter> Implementation of ProtoDecoderConverter for a given RDF library. + */ +public abstract class JellyConverterFactory< + TNode, + TDatatype, + TEncoderConverter extends ProtoEncoderConverter<TNode>, + TDecoderConverter extends ProtoDecoderConverter<TNode, TDatatype> +> { + + /** + * To be implemented by subclasses. Returns an instance of ProtoEncoderConverter for the RDF library. + */ + protected abstract TEncoderConverter encoderConverter(); + + /** + * To be implemented by subclasses. Returns an instance of ProtoDecoderConverter for the RDF library. + */ + protected abstract TDecoderConverter decoderConverter(); + + /** + * Create a new ProtoEncoder. + * @param params Parameters for the encoder. + * @return encoder + */ + public final ProtoEncoder<TNode> encoder(ProtoEncoder.Params params) { + return new ProtoEncoderImpl<>(encoderConverter(), params); + } + + /** + * Create a new TriplesDecoder. + * @param supportedOptions maximum supported options for the decoder. To customize them (e.g., to specify an + * expected logical stream type), start from JellyOptions.DEFAULT_SUPPORTED_OPTIONS and + * derive a modified copy with .toBuilder().setXxx(...).build(). A default-constructed + * RdfStreamOptions has all maximum table sizes set to 0 and is not a usable base. + * @param tripleHandler the handler to use for decoding triples + * @return decoder + */ + public final ProtoDecoder<TNode, TDatatype> triplesDecoder( + RdfHandler.TripleHandler<TNode> tripleHandler, + RdfStreamOptions supportedOptions + ) { + return new ProtoDecoderImpl.TriplesDecoder<>(decoderConverter(), tripleHandler, supportedOptions); + } + + /** + * Create a new QuadsDecoder. 
+ * @param supportedOptions maximum supported options for the decoder. To customize them (e.g., to specify an + * expected logical stream type), start from JellyOptions.DEFAULT_SUPPORTED_OPTIONS and + * derive a modified copy with .toBuilder().setXxx(...).build(). + * @param quadHandler the handler to use for decoding quads + * @return decoder + */ + public final ProtoDecoder<TNode, TDatatype> quadsDecoder( + RdfHandler.QuadHandler<TNode> quadHandler, + RdfStreamOptions supportedOptions + ) { + return new ProtoDecoderImpl.QuadsDecoder<>(decoderConverter(), quadHandler, supportedOptions); + } + + /** + * Create a new GraphsAsQuadsDecoder. + * @param supportedOptions maximum supported options for the decoder. To customize them (e.g., to specify an + * expected logical stream type), start from JellyOptions.DEFAULT_SUPPORTED_OPTIONS and + * derive a modified copy with .toBuilder().setXxx(...).build(). + * @param graphHandler the quad handler to use for the decoded graphs (note: a QuadHandler, not a GraphHandler) + * @return decoder + */ + public final ProtoDecoder<TNode, TDatatype> graphsAsQuadsDecoder( + RdfHandler.QuadHandler<TNode> graphHandler, + RdfStreamOptions supportedOptions + ) { + return new ProtoDecoderImpl.GraphsAsQuadsDecoder<>(decoderConverter(), graphHandler, supportedOptions); + } + + /** + * Create a new GraphsDecoder. + * @param supportedOptions maximum supported options for the decoder. To customize them (e.g., to specify an + * expected logical stream type), start from JellyOptions.DEFAULT_SUPPORTED_OPTIONS and + * derive a modified copy with .toBuilder().setXxx(...).build(). + * @param graphHandler the handler to use for decoding graphs + * @return decoder + */ + public final ProtoDecoder<TNode, TDatatype> graphsDecoder( + RdfHandler.GraphHandler<TNode> graphHandler, + RdfStreamOptions supportedOptions + ) { + return new ProtoDecoderImpl.GraphsDecoder<>(decoderConverter(), graphHandler, supportedOptions); + } + + /** + * Create a new AnyStatementDecoder. 
+ * @param supportedOptions maximum supported options for the decoder. To customize them (e.g., to specify an + * expected logical stream type), start from JellyOptions.DEFAULT_SUPPORTED_OPTIONS and + * derive a modified copy with .toBuilder().setXxx(...).build(). + * @param anyStatementHandler the handler to use for decoding any statements + * @return decoder + */ + public final ProtoDecoder<TNode, TDatatype> anyStatementDecoder( + RdfHandler.AnyStatementHandler<TNode> anyStatementHandler, + RdfStreamOptions supportedOptions + ) { + return new ProtoDecoderImpl.AnyStatementDecoder<>(decoderConverter(), anyStatementHandler, supportedOptions); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/JellyOptions.java b/core-java/src/main/java/eu/neverblink/jelly/core/JellyOptions.java new file mode 100644 index 000000000..cb9e3da36 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/JellyOptions.java @@ -0,0 +1,302 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.proto.v1.LogicalStreamType; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; +import eu.neverblink.jelly.core.utils.LogicalStreamTypeUtils; + +/** + * A collection of convenient streaming option presets. + * None of the presets specifies the stream type – do that with the .toBuilder().setPhysicalType().build() method. + */ +public final class JellyOptions { + + private JellyOptions() {} + + public static final int BIG_NAME_TABLE_SIZE = 4000; + public static final int BIG_PREFIX_TABLE_SIZE = 150; + public static final int BIG_DT_TABLE_SIZE = 32; + + public static final int SMALL_NAME_TABLE_SIZE = 128; + public static final int SMALL_PREFIX_TABLE_SIZE = 16; + public static final int SMALL_DT_TABLE_SIZE = 16; + + /** + * "Big" preset suitable for high-volume streams and larger machines. + * Does not allow generalized RDF statements. 
+ */ + public static final RdfStreamOptions BIG_STRICT = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(BIG_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(BIG_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(BIG_DT_TABLE_SIZE) + .build(); + + /** + * "Big" preset suitable for high-volume streams and larger machines. + * Allows generalized RDF statements. + */ + public static final RdfStreamOptions BIG_GENERALIZED = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(BIG_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(BIG_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(BIG_DT_TABLE_SIZE) + .setGeneralizedStatements(true) + .build(); + + /** + * "Big" preset suitable for high-volume streams and larger machines. + * Allows RDF-star statements. + */ + public static final RdfStreamOptions BIG_RDF_STAR = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(BIG_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(BIG_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(BIG_DT_TABLE_SIZE) + .setRdfStar(true) + .build(); + + /** + * "Big" preset suitable for high-volume streams and larger machines. + * Allows all protocol features (including generalized RDF statements and RDF-star statements). + */ + public static final RdfStreamOptions BIG_ALL_FEATURES = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(BIG_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(BIG_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(BIG_DT_TABLE_SIZE) + .setGeneralizedStatements(true) + .setRdfStar(true) + .build(); + + /** + * "Small" preset suitable for low-volume streams and smaller machines. + * Does not allow generalized RDF statements. + */ + public static final RdfStreamOptions SMALL_STRICT = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(SMALL_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(SMALL_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(SMALL_DT_TABLE_SIZE) + .build(); + + /** + * "Small" preset suitable for low-volume streams and smaller machines. + * Allows generalized RDF statements. 
+ */ + public static final RdfStreamOptions SMALL_GENERALIZED = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(SMALL_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(SMALL_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(SMALL_DT_TABLE_SIZE) + .setGeneralizedStatements(true) + .build(); + /** + * "Small" preset suitable for low-volume streams and smaller machines. + * Allows RDF-star statements. + */ + public static final RdfStreamOptions SMALL_RDF_STAR = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(SMALL_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(SMALL_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(SMALL_DT_TABLE_SIZE) + .setRdfStar(true) + .build(); + + /** + * "Small" preset suitable for low-volume streams and smaller machines. + * Allows all protocol features (including generalized RDF statements and RDF-star statements). + */ + public static final RdfStreamOptions SMALL_ALL_FEATURES = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(SMALL_NAME_TABLE_SIZE) + .setMaxPrefixTableSize(SMALL_PREFIX_TABLE_SIZE) + .setMaxDatatypeTableSize(SMALL_DT_TABLE_SIZE) + .setGeneralizedStatements(true) + .setRdfStar(true) + .build(); + + /** + * Default maximum supported options for Jelly decoders. + *

+ * This means that by default Jelly-JVM will refuse to read streams that exceed these limits (e.g., with a + * name lookup table larger than 4096 entries). + *

+ * To change these defaults, you should pass a different RdfStreamOptions object to the decoder. + * You should start from this constant and then modify a copy of it as needed. + * For example, to disable RDF-star support, you can do this: + * + * final var myOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + * .toBuilder() + * .setRdfStar(false) + * .build(); + * + *

+ * If you were to pass a default RdfStreamOptions object to the decoder, it would simply refuse to read any stream + * as (by default) it will have all max table sizes set to 0. So, you should always use this constant as the base. + */ + public static final RdfStreamOptions DEFAULT_SUPPORTED_OPTIONS = RdfStreamOptions.newBuilder() + .setVersion(JellyConstants.PROTO_VERSION) + .setGeneralizedStatements(true) + .setRdfStar(true) + .setMaxNameTableSize(4096) + .setMaxPrefixTableSize(1024) + .setMaxDatatypeTableSize(256) + .build(); + + /** + * Checks if the requested stream options are supported. Throws an exception if not. + *

+ * This is used in two places: + * - By ProtoDecoder implementations to check if it's safe to decode the stream. + * This MUST be called before any data (besides the stream options) is ingested. Otherwise, the options may + * request something dangerous, like allocating a very large lookup table, which could be used to perform a + * denial-of-service attack. + * - By implementations of the gRPC streaming service from the jelly-grpc module to check if the client is + * requesting stream options that the server can support. + *

+ * We check: + * - version (must be <= JellyConstants.PROTO_VERSION and <= supportedOptions.version) + * - generalized statements (must be <= supportedOptions.generalizedStatements) + * - RDF star (must be <= supportedOptions.rdfStar) + * - max name table size (must be <= supportedOptions.maxNameTableSize and >= 8) + * - max prefix table size (must be <= supportedOptions.maxPrefixTableSize) + * - max datatype table size (must be <= supportedOptions.maxDatatypeTableSize) + * - logical stream type (must be compatible with physical stream type and compatible with expected log. stream type) + *

+ * We don't check: + * - physical stream type (this is done by the implementations of ProtoDecoderImpl) + * - stream name (we don't care about it) + *

+ * See also the stream options handling table in the gRPC spec: + * link + * This is not exactly what we are doing here (the table is about client-server interactions), but it's a good + * reference for the logic used here. + * + * @param requestedOptions Requested options of the stream. + * @param supportedOptions Options that can be safely supported. + * + * @throws RdfProtoDeserializationError if the requested options are not supported. + */ + public static void checkCompatibility(RdfStreamOptions requestedOptions, RdfStreamOptions supportedOptions) { + checkBaseCompatibility(requestedOptions, supportedOptions); + checkLogicalStreamType(requestedOptions, supportedOptions.getLogicalType()); + } + + /** + * Check if the requested options are compatible with the supported options and the system. + * + * @param requestedOptions requested options + * @param supportedOptions supported options + * + * @throws RdfProtoDeserializationError on validation error + */ + private static void checkBaseCompatibility(RdfStreamOptions requestedOptions, RdfStreamOptions supportedOptions) { + if ( + requestedOptions.getVersion() > supportedOptions.getVersion() || + requestedOptions.getVersion() > JellyConstants.PROTO_VERSION + ) { + throw new RdfProtoDeserializationError( + "Unsupported proto version: %s. Was expecting at most version %s. This library version supports up to version %s.".formatted( + requestedOptions.getVersion(), + supportedOptions.getVersion(), + JellyConstants.PROTO_VERSION + ) + ); + } + if (requestedOptions.getGeneralizedStatements() && !supportedOptions.getGeneralizedStatements()) { + throw new RdfProtoDeserializationError( + "The stream uses generalized statements, which are not supported. " + + "Either disable generalized statements or enable them in the supportedOptions." + ); + } + if (requestedOptions.getRdfStar() && !supportedOptions.getRdfStar()) { + throw new RdfProtoDeserializationError( + "The stream uses RDF-star, which is not supported. 
" + + "Either disable RDF-star or enable it in the supportedOptions." + ); + } + + checkTableSize("Name", requestedOptions.getMaxNameTableSize(), supportedOptions.getMaxNameTableSize(), 8); + checkTableSize("Prefix", requestedOptions.getMaxPrefixTableSize(), supportedOptions.getMaxPrefixTableSize()); + checkTableSize( + "Datatype", + requestedOptions.getMaxDatatypeTableSize(), + supportedOptions.getMaxDatatypeTableSize() + ); + } + + /** + * Checks if the table size is within the supported range. + * + * @param name Name of the table (for error messages). + * @param size Size of the table. + * @param supportedSize Maximum supported size of the table. + * @param minSize Minimum supported size of the table. + * + * @throws RdfProtoDeserializationError if the table size is not within the supported range. + */ + private static void checkTableSize(String name, int size, int supportedSize, int minSize) { + if (size > supportedSize) { + throw new RdfProtoDeserializationError( + "The stream uses a %s table size of %s, which is larger than the maximum supported size of %s.".formatted( + name.toLowerCase(), + size, + supportedSize + ) + ); + } + if (size < minSize) { + throw new RdfProtoDeserializationError( + "The stream uses a %s table size of %s, which is smaller than the minimum supported size of %s.".formatted( + name.toLowerCase(), + size, + minSize + ) + ); + } + } + + private static void checkTableSize(String name, int size, int supportedSize) { + checkTableSize(name, size, supportedSize, 0); + } + + /** + * Checks if the logical and physical stream types are compatible. Additionally, if the expected logical stream type + * is provided, checks if the actual logical stream type is a subtype of the expected one. + * + * @param options Options of the stream. + * @param expectedLogicalType Expected logical stream type. If UNSPECIFIED, no check is performed. + * + * @throws RdfProtoDeserializationError if the requested options are not supported. 
+ */ + private static void checkLogicalStreamType(RdfStreamOptions options, LogicalStreamType expectedLogicalType) { + final var logicalType = options.getLogicalType(); + final var baseLogicalType = LogicalStreamTypeUtils.toBaseType(logicalType); + final var physicalType = options.getPhysicalType(); + + final var conflict = + switch (baseLogicalType) { + case LOGICAL_STREAM_TYPE_FLAT_TRIPLES, LOGICAL_STREAM_TYPE_GRAPHS -> switch (physicalType) { + case PHYSICAL_STREAM_TYPE_QUADS, PHYSICAL_STREAM_TYPE_GRAPHS -> true; + default -> false; + }; + case LOGICAL_STREAM_TYPE_FLAT_QUADS, LOGICAL_STREAM_TYPE_DATASETS -> switch (physicalType) { + case PHYSICAL_STREAM_TYPE_TRIPLES -> true; + default -> false; + }; + default -> false; + }; + + if (conflict) { + throw new RdfProtoDeserializationError( + "Logical stream type %s is incompatible with physical stream type %s.".formatted( + logicalType, + physicalType + ) + ); + } + + if (!LogicalStreamTypeUtils.isEqualOrSubtypeOf(logicalType, expectedLogicalType)) { + throw new RdfProtoDeserializationError( + "Expected logical stream type %s, got %s. %s is not a subtype of %s.".formatted( + expectedLogicalType, + logicalType, + logicalType, + expectedLogicalType + ) + ); + } + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/JellyTranscoderFactory.java b/core-java/src/main/java/eu/neverblink/jelly/core/JellyTranscoderFactory.java new file mode 100644 index 000000000..0c332f1b9 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/JellyTranscoderFactory.java @@ -0,0 +1,39 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.internal.ProtoTranscoderImpl; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; + +/** + * Factory for creating ProtoTranscoder instances. + */ +public final class JellyTranscoderFactory { + + private JellyTranscoderFactory() {} + + /** + * Fast transcoder suitable for merging multiple input streams into one. 
+ * This variant DOES NOT check the input options of the consumed streams. This should be therefore only used + * when the input is fully trusted. Otherwise, an attacker could cause a DoS by sending a stream with large lookups. + * + * @param outputOptions options for the output stream. This MUST have the physical stream type set. + * @return ProtoTranscoder + */ + public static ProtoTranscoder fastMergingTranscoderUnsafe(RdfStreamOptions outputOptions) { + return new ProtoTranscoderImpl(null, outputOptions); + } + + /** + * Fast transcoder suitable for merging multiple input streams into one. + * This variant does check the input options of the consumed streams, so it is SAFE to use with untrusted input. + * + * @param supportedInputOptions maximum allowable options for the input streams + * @param outputOptions options for the output stream. This MUST have the physical stream type set. + * @return ProtoTranscoder + */ + public static ProtoTranscoder fastMergingTranscoder( + RdfStreamOptions supportedInputOptions, + RdfStreamOptions outputOptions + ) { + return new ProtoTranscoderImpl(supportedInputOptions, outputOptions); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/NameDecoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/NameDecoder.java new file mode 100644 index 000000000..c61681491 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/NameDecoder.java @@ -0,0 +1,30 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.proto.v1.RdfNameEntry; +import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry; + +/** + * Interface for NameDecoder exposed for Jelly extensions. + * @param <TIri> type of the IRI + */ +public interface NameDecoder<TIri> { + /** + * Update the name table with a new entry. + * @param nameEntry new name entry + */ + void updateNames(RdfNameEntry nameEntry); + + /** + * Update the prefix table with a new entry. 
+ * @param prefixEntry new prefix entry + */ + void updatePrefixes(RdfPrefixEntry prefixEntry); + + /** + * Reconstruct an IRI from its prefix and name ids. + * @param prefixId prefix id of IRI row from the Jelly proto + * @param nameId name id of IRI row from the Jelly proto + * @return full IRI combining the prefix and the name + */ + TIri decode(int prefixId, int nameId); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/NamespaceDeclaration.java b/core-java/src/main/java/eu/neverblink/jelly/core/NamespaceDeclaration.java new file mode 100644 index 000000000..d13f29e0b --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/NamespaceDeclaration.java @@ -0,0 +1,12 @@ +package eu.neverblink.jelly.core; + +/** + * Simple holder for namespace declarations. + *

+ * This isn't actually needed for the core functionality, but it's useful if you want to pass namespace declarations + * around in a type-safe way. It's used for example in the stream module. + * + * @param prefix short name of the namespace (e.g., "rdf"), without a colon + * @param iri namespace IRI (e.g., "http://www.w3.org/1999/02/22-rdf-syntax-ns#") + */ +public record NamespaceDeclaration(String prefix, String iri) {} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/NodeEncoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/NodeEncoder.java new file mode 100644 index 000000000..c2458b8e2 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/NodeEncoder.java @@ -0,0 +1,65 @@ +package eu.neverblink.jelly.core; + +/** + * Interface exposed to RDF library interop modules for encoding RDF terms. + * @param <TNode> The type of RDF nodes used by the RDF library. + */ +public interface NodeEncoder<TNode> { + /** + * Encode an IRI node. + * @param iri The IRI to encode. + * @return The encoded IRI node. + */ + RdfTerm.Iri makeIri(String iri); + + /** + * Encode a blank node. + * @param label The label of the blank node. + * @return The encoded blank node. + */ + RdfTerm.BNode makeBlankNode(String label); + + /** + * Encode a simple literal (of type xsd:string). + * @param lex The lexical form of the literal. + * @return The encoded literal. + */ + RdfTerm.SimpleLiteral makeSimpleLiteral(String lex); + + /** + * Encode a language-tagged literal. + * @param lit The literal node. This is used for caching and deduplication. + * @param lex The lexical form of the literal. + * @param lang The language tag. + * @return The encoded literal. + */ + RdfTerm.LanguageLiteral makeLangLiteral(TNode lit, String lex, String lang); + + /** + * Encode a datatype literal (not xsd:string and not language-tagged). + * @param lit The literal node. This is used for caching and deduplication. + * @param lex The lexical form of the literal. 
+ * @param dt The datatype IRI. + * @return The encoded literal. + */ + RdfTerm.DtLiteral makeDtLiteral(TNode lit, String lex, String dt); + + /** + * Encode a quoted triple node (RDF-star). + * You must first encode the subject, predicate, and object of the triple using the other methods in this interface. + * + * @param s The subject of the triple. + * @param p The predicate of the triple. + * @param o The object of the triple. + * @return The encoded triple node. + */ + RdfTerm.Triple makeQuotedTriple(RdfTerm.SpoTerm s, RdfTerm.SpoTerm p, RdfTerm.SpoTerm o); + + /** + * Encode a default graph node. + * @return The encoded default graph node. + */ + static RdfTerm.GraphTerm makeDefaultGraph() { + return RdfTerm.DefaultGraph.INSTANCE; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoder.java new file mode 100644 index 000000000..05a9a0c85 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoder.java @@ -0,0 +1,38 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.internal.ProtoDecoderBase; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; +import eu.neverblink.jelly.core.proto.v1.RdfStreamRow; + +/** + * Base extendable class for decoders of protobuf RDF streams. + *

+ * See the implementation in ProtoDecoderImpl. + * + * @param <TNode> The type of the node. + * @param <TDatatype> The type of the datatype. + */ +public abstract class ProtoDecoder<TNode, TDatatype> extends ProtoDecoderBase<TNode, TDatatype> { + + /** + * Constructor. + * + * @param converter the converter to use + */ + protected ProtoDecoder(ProtoDecoderConverter<TNode, TDatatype> converter) { + super(converter); + } + + /** + * Options for this stream. + * @return options if the decoder has encountered the stream options; presumably null otherwise (TODO confirm). + */ + protected abstract RdfStreamOptions getStreamOptions(); + + /** + * Ingest a row from the stream. + * + * @param row row to ingest + */ + public abstract void ingestRow(RdfStreamRow row); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoderConverter.java b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoderConverter.java new file mode 100644 index 000000000..f246d2ec5 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoDecoderConverter.java @@ -0,0 +1,20 @@ +package eu.neverblink.jelly.core; + +/** + * Converter interface for translating between Jelly's object representation of RDF and that of RDF libraries. + *

+ * You need to implement this interface to adapt Jelly to a new RDF library. + * + * @param <TNode> type of RDF nodes in the library + * @param <TDatatype> type of RDF datatypes in the library + */ +public interface ProtoDecoderConverter<TNode, TDatatype> { + TNode makeSimpleLiteral(String lex); + TNode makeLangLiteral(String lex, String lang); + TNode makeDtLiteral(String lex, TDatatype dt); + TDatatype makeDatatype(String dt); + TNode makeBlankNode(String label); + TNode makeIriNode(String iri); + TNode makeTripleNode(TNode s, TNode p, TNode o); + TNode makeDefaultGraphNode(); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoder.java new file mode 100644 index 000000000..2992869d2 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoder.java @@ -0,0 +1,65 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.internal.ProtoEncoderBase; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; +import eu.neverblink.jelly.core.proto.v1.RdfStreamRow; +import java.util.Collection; + +/** + * Base class for RDF stream encoders. + * @param <TNode> type of RDF nodes in the library + */ +public abstract class ProtoEncoder<TNode> + extends ProtoEncoderBase<TNode> + implements RowBufferAppender, RdfHandler.AnyRdfHandler<TNode> { + + /** + * Parameters passed to the Jelly encoder. + *

+ * New fields may be added in the future, but always with a default value and in a sequential order. + * WARNING: PLEASE USE .of TO CREATE NEW INSTANCES, otherwise your code will break when new fields are added. + * + * @param options options for this stream (required) + * @param enableNamespaceDeclarations whether to allow namespace declarations in the stream. + * If true, this will raise the stream version to 2 (Jelly 1.1.0). Otherwise, + * the stream version will be 1 (Jelly 1.0.0). + * @param appendableRowBuffer buffer for storing stream rows that should go into a stream frame. + * The encoder will append the rows to this buffer. + */ + public record Params( + RdfStreamOptions options, + boolean enableNamespaceDeclarations, + Collection appendableRowBuffer + ) { + /** + * Creates a new instance of Params. + * @param options options for this stream (required) + * @param enableNamespaceDeclarations whether to allow namespace declarations in the stream. + * @param appendableRowBuffer buffer for storing stream rows that should go into a stream frame. + * @return a new instance of Params + */ + public static Params of( + RdfStreamOptions options, + boolean enableNamespaceDeclarations, + Collection appendableRowBuffer + ) { + return new Params(options, enableNamespaceDeclarations, appendableRowBuffer); + } + } + + /** + * Whether namespace declarations are enabled for this encoder. + */ + protected final boolean enableNamespaceDeclarations; + + /** + * Buffer for storing stream rows that should go into a stream frame. 
+ */ + protected final Collection appendableRowBuffer; + + protected ProtoEncoder(ProtoEncoderConverter converter, Params params) { + super(params.options, converter); + this.enableNamespaceDeclarations = params.enableNamespaceDeclarations; + this.appendableRowBuffer = params.appendableRowBuffer; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoderConverter.java b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoderConverter.java new file mode 100644 index 000000000..97dfc7002 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoEncoderConverter.java @@ -0,0 +1,13 @@ +package eu.neverblink.jelly.core; + +/** + * Converter interface for translating between an RDF library's object representation and Jelly's proto objects. + *

+ * You need to implement this interface to add Jelly encoding support for a new RDF library. + * + * @param type of RDF nodes in the library + */ +public interface ProtoEncoderConverter { + RdfTerm.SpoTerm nodeToProto(NodeEncoder encoder, TNode node); + RdfTerm.GraphTerm graphNodeToProto(NodeEncoder encoder, TNode node); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/ProtoTranscoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoTranscoder.java new file mode 100644 index 000000000..aebeafcdc --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/ProtoTranscoder.java @@ -0,0 +1,29 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.proto.v1.RdfStreamFrame; +import eu.neverblink.jelly.core.proto.v1.RdfStreamRow; + +/** + * Transcoder for Jelly streams. + *

+ * It turns one or more input streams into one output stream. + */ +public interface ProtoTranscoder { + /** + * Ingests a single row and returns zero or more rows. + * + * @param row the row to ingest + * @return zero or more rows + * @throws RdfProtoTranscodingError if the row can't be transcoded + */ + Iterable ingestRow(RdfStreamRow row); + + /** + * Ingests a frame and returns a frame. + * + * @param frame the frame to ingest + * @return the frame + * @throws RdfProtoTranscodingError if the frame can't be transcoded + */ + RdfStreamFrame ingestFrame(RdfStreamFrame frame); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RdfHandler.java b/core-java/src/main/java/eu/neverblink/jelly/core/RdfHandler.java new file mode 100644 index 000000000..57ca57617 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RdfHandler.java @@ -0,0 +1,84 @@ +package eu.neverblink.jelly.core; + +/** + * Interface for handling different types of RDF data structures that flow from the decoder. + * + * @param The type of the nodes in the RDF data structure, as bound by library. + */ +public interface RdfHandler { + /** + * Handle namespace definition. + * @param prefix The prefix of the namespace. + * @param namespace The namespace IRI, as represented by node in the RDF data structure. + */ + default void handleNamespace(String prefix, TNode namespace) { + // No-op + } + + /** + * Extension of the RdfHandler interface to handle triples. + * @param The type of the nodes in the RDF data structure, as bound by library. + */ + interface TripleHandler extends RdfHandler { + /** + * Handle a triple. + * @param subject The subject of the triple, as represented by node in the RDF data structure. + * @param predicate The predicate of the triple, as represented by node in the RDF data structure. + * @param object The object of the triple, as represented by node in the RDF data structure. 
+ */ + void handleTriple(TNode subject, TNode predicate, TNode object); + } + + /** + * Extension of the RdfHandler interface to handle quads. + * @param The type of the nodes in the RDF data structure, as bound by library. + */ + interface QuadHandler extends RdfHandler { + /** + * Handle a quad. + * @param subject The subject of the quad, as represented by node in the RDF data structure. + * @param predicate The predicate of the quad, as represented by node in the RDF data structure. + * @param object The object of the quad, as represented by node in the RDF data structure. + * @param graph The graph of the quad, as represented by node in the RDF data structure. + */ + void handleQuad(TNode subject, TNode predicate, TNode object, TNode graph); + } + + /** + * Extension of the RdfHandler interface to handle graphs. + * @param The type of the nodes in the RDF data structure, as bound by library. + */ + interface GraphHandler extends RdfHandler { + /** + * Handle a graph start. + * @param graph The graph node, as represented by node in the RDF data structure. + */ + void handleGraphStart(TNode graph); + + /** + * Handle a graph-related triple. + * + * @param subject The subject of a triple that belongs to the graph. + * @param predicate The predicate of a triple that belongs to the graph. + * @param object The object of a triple that belongs to the graph. + */ + void handleTriple(TNode subject, TNode predicate, TNode object); + + /** + * Handle a graph end. + */ + void handleGraphEnd(); + } + + /** + * Extension of the RdfHandler interface to handle Triples and Quads. + * @param The type of the nodes in the RDF data structure, as bound by library. + */ + interface AnyStatementHandler extends TripleHandler, QuadHandler {} + + /** + * Extension of the RdfHandler interface to handle any RDF data structure. + * @param The type of the nodes in the RDF data structure, as bound by library. 
+ */ + interface AnyRdfHandler extends AnyStatementHandler, GraphHandler {} +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoDeserializationError.java b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoDeserializationError.java new file mode 100644 index 000000000..e1c65d758 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoDeserializationError.java @@ -0,0 +1,16 @@ +package eu.neverblink.jelly.core; + +/** + * This exception is thrown when there is an error during the deserialization of a + * protocol buffer message into RDF. + */ +public final class RdfProtoDeserializationError extends RuntimeException { + + public RdfProtoDeserializationError(String msg) { + super(msg); + } + + public RdfProtoDeserializationError(String msg, Throwable cause) { + super(msg, cause); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoSerializationError.java b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoSerializationError.java new file mode 100644 index 000000000..570ec75ec --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoSerializationError.java @@ -0,0 +1,16 @@ +package eu.neverblink.jelly.core; + +/** + * This exception is thrown when there is an error during the serialization of + * RDF data into a protocol buffer message. 
+ */ +public final class RdfProtoSerializationError extends RuntimeException { + + public RdfProtoSerializationError(String msg) { + super(msg); + } + + public RdfProtoSerializationError(String msg, Throwable cause) { + super(msg, cause); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoTranscodingError.java b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoTranscodingError.java new file mode 100644 index 000000000..52c7656b1 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RdfProtoTranscodingError.java @@ -0,0 +1,15 @@ +package eu.neverblink.jelly.core; + +/** + * Exception thrown when an error occurs during the transcoding of RDF ProtoBuf data. + */ +public final class RdfProtoTranscodingError extends RuntimeException { + + public RdfProtoTranscodingError(String msg) { + super(msg); + } + + public RdfProtoTranscodingError(String msg, Throwable cause) { + super(msg, cause); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RdfTerm.java b/core-java/src/main/java/eu/neverblink/jelly/core/RdfTerm.java new file mode 100644 index 000000000..3812c0297 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RdfTerm.java @@ -0,0 +1,655 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.proto.v1.RdfDefaultGraph; +import eu.neverblink.jelly.core.proto.v1.RdfGraphEnd; +import eu.neverblink.jelly.core.proto.v1.RdfGraphStart; +import eu.neverblink.jelly.core.proto.v1.RdfIri; +import eu.neverblink.jelly.core.proto.v1.RdfLiteral; +import eu.neverblink.jelly.core.proto.v1.RdfQuad; +import eu.neverblink.jelly.core.proto.v1.RdfTriple; + +/** + * Represents RDF terms in a type-safe manner with conversion capabilities to and from Protocol Buffer messages. + * This interface defines the hierarchy of RDF terms and provides factory methods for creating terms from proto messages. 
+ */ +public sealed interface RdfTerm { + /** + * Creates an IRI term from a Protocol Buffer RDF IRI message. + * @param iri The Protocol Buffer RDF IRI message + * @return An Iri instance, or null if the input is null + */ + static Iri from(RdfIri iri) { + if (iri == null) { + return null; + } + + return new Iri(iri.getPrefixId(), iri.getNameId()); + } + + /** + * Creates a blank node term from a string identifier. + * @param bNode The blank node identifier + * @return A BNode instance, or null if the input is null + */ + static BNode from(String bNode) { + if (bNode == null) { + return null; + } + + return new BNode(bNode); + } + + /** + * Creates a literal term from a Protocol Buffer RDF literal message. + * @param literal The Protocol Buffer RDF literal message + * @return A LiteralTerm instance (SimpleLiteral, LanguageLiteral, or DtLiteral), or null if the input is null + */ + static LiteralTerm from(RdfLiteral literal) { + if (literal == null) { + return null; + } + + if (literal.hasLangtag()) { + return new LanguageLiteral(literal.getLex(), literal.getLangtag()); + } else if (literal.hasDatatype()) { + return new DtLiteral(literal.getLex(), literal.getDatatype()); + } else { + return new SimpleLiteral(literal.getLex()); + } + } + + /** + * Creates a triple term from a Protocol Buffer RDF triple message. 
+ * @param triple The Protocol Buffer RDF triple message + * @return A Triple instance, or null if the input is null + */ + static Triple from(RdfTriple triple) { + if (triple == null) { + return null; + } + + final var subject = + switch (triple.getSubjectCase()) { + case S_IRI -> from(triple.getSIri()); + case S_BNODE -> from(triple.getSBnode()); + case S_LITERAL -> from(triple.getSLiteral()); + case S_TRIPLE_TERM -> from(triple.getSTripleTerm()); + case SUBJECT_NOT_SET -> null; + }; + + final var predicate = + switch (triple.getPredicateCase()) { + case P_IRI -> from(triple.getPIri()); + case P_BNODE -> from(triple.getPBnode()); + case P_LITERAL -> from(triple.getPLiteral()); + case P_TRIPLE_TERM -> from(triple.getPTripleTerm()); + case PREDICATE_NOT_SET -> null; + }; + + final var object = + switch (triple.getObjectCase()) { + case O_IRI -> from(triple.getOIri()); + case O_BNODE -> from(triple.getOBnode()); + case O_LITERAL -> from(triple.getOLiteral()); + case O_TRIPLE_TERM -> from(triple.getOTripleTerm()); + case OBJECT_NOT_SET -> null; + }; + + return new Triple(subject, predicate, object); + } + + /** + * Creates a graph start marker from a Protocol Buffer RDF graph start message. + * @param graphStart The Protocol Buffer RDF graph start message + * @return A GraphStart instance, or null if the input is null + */ + static GraphStart from(RdfGraphStart graphStart) { + if (graphStart == null) { + return null; + } + + final var graph = + switch (graphStart.getGraphCase()) { + case G_IRI -> from(graphStart.getGIri()); + case G_BNODE -> from(graphStart.getGBnode()); + case G_DEFAULT_GRAPH -> from(graphStart.getGDefaultGraph()); + case G_LITERAL -> from(graphStart.getGLiteral()); + case GRAPH_NOT_SET -> null; + }; + + return new GraphStart(graph); + } + + /** + * Creates a graph end marker from a Protocol Buffer RDF graph end message. 
+ * @param ignoredGraphEnd The Protocol Buffer RDF graph end message (ignored) + * @return A new GraphEnd instance + */ + static GraphEnd from(RdfGraphEnd ignoredGraphEnd) { + return new GraphEnd(); + } + + /** + * Creates a default graph marker from a Protocol Buffer RDF default graph message. + * @param ignoredDefaultGraph The Protocol Buffer RDF default graph message (ignored) + * @return A new DefaultGraph instance + */ + static DefaultGraph from(RdfDefaultGraph ignoredDefaultGraph) { + return new DefaultGraph(); + } + + /** + * Creates a quad term from a Protocol Buffer RDF quad message. + * @param quad The Protocol Buffer RDF quad message + * @return A Quad instance, or null if the input is null + */ + static Quad from(RdfQuad quad) { + if (quad == null) { + return null; + } + + final var subject = + switch (quad.getSubjectCase()) { + case S_IRI -> from(quad.getSIri()); + case S_BNODE -> from(quad.getSBnode()); + case S_LITERAL -> from(quad.getSLiteral()); + case S_TRIPLE_TERM -> from(quad.getSTripleTerm()); + case SUBJECT_NOT_SET -> null; + }; + + final var predicate = + switch (quad.getPredicateCase()) { + case P_IRI -> from(quad.getPIri()); + case P_BNODE -> from(quad.getPBnode()); + case P_LITERAL -> from(quad.getPLiteral()); + case P_TRIPLE_TERM -> from(quad.getPTripleTerm()); + case PREDICATE_NOT_SET -> null; + }; + + final var object = + switch (quad.getObjectCase()) { + case O_IRI -> from(quad.getOIri()); + case O_BNODE -> from(quad.getOBnode()); + case O_LITERAL -> from(quad.getOLiteral()); + case O_TRIPLE_TERM -> from(quad.getOTripleTerm()); + case OBJECT_NOT_SET -> null; + }; + + final var graph = + switch (quad.getGraphCase()) { + case G_IRI -> from(quad.getGIri()); + case G_BNODE -> from(quad.getGBnode()); + case G_DEFAULT_GRAPH -> from(quad.getGDefaultGraph()); + case G_LITERAL -> from(quad.getGLiteral()); + case GRAPH_NOT_SET -> null; + }; + + return new Quad(subject, predicate, object, graph); + } + + /** + * Represents terms that can appear 
in subject, predicate, or object positions of a triple. + */ + sealed interface SpoTerm extends RdfTerm { + /** + * Converts the term to a Protocol Buffer RDF triple subject term. + */ + void writeSubject(RdfTriple.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF quad subject term. + */ + void writeSubject(RdfQuad.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF triple predicate term. + */ + void writePredicate(RdfTriple.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF quad predicate term. + */ + void writePredicate(RdfQuad.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF triple object term. + */ + void writeObject(RdfTriple.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF quad object term. + */ + void writeObject(RdfQuad.Builder builder); + } + + /** + * Represents terms that mark graph boundaries in the RDF dataset. + */ + sealed interface GraphMarkerTerm extends RdfTerm {} + + /** + * Represents terms that can appear as graph labels. + */ + sealed interface GraphTerm extends RdfTerm { + /** + * Converts the term to a Protocol Buffer RDF graph start message. + */ + void writeGraph(RdfGraphStart.Builder builder); + + /** + * Converts the term to a Protocol Buffer RDF quad graph message. + */ + void writeGraph(RdfQuad.Builder builder); + } + + /** + * Represents terms that can appear in SPO positions and as graph labels. + */ + sealed interface SpoOrGraphTerm extends SpoTerm, GraphTerm {} + + /** + * Represents literal terms with lexical values. + */ + sealed interface LiteralTerm extends SpoOrGraphTerm { + String lex(); + } + + /** + * Represents terms that can be either graph markers or graph labels. + */ + sealed interface GraphMarkerOrGraphTerm extends GraphMarkerTerm, GraphTerm {} + + /** + * Represents IRI terms with prefix and name identifiers. 
+ * + * @param prefixId The prefix identifier + * @param nameId The name identifier + */ + record Iri(int prefixId, int nameId) implements SpoOrGraphTerm { + public RdfIri toProto() { + return RdfIri.newBuilder().setPrefixId(prefixId).setNameId(nameId).build(); + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSIri(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSIri(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPIri(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPIri(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOIri(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOIri(toProto()); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGIri(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGIri(toProto()); + } + } + + /** + * Represents blank node terms with a string identifier. 
+ * + * @param bNode The blank node identifier + */ + record BNode(String bNode) implements SpoOrGraphTerm { + public String toProto() { + return bNode; + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSBnode(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSBnode(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPBnode(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPBnode(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOBnode(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOBnode(toProto()); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGBnode(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGBnode(toProto()); + } + } + + /** + * Represents literal terms with lexical values and language tags. 
+ * + * @param lex The lexical value + * @param langtag The language tag + */ + record LanguageLiteral(String lex, String langtag) implements LiteralTerm { + public RdfLiteral toProto() { + return RdfLiteral.newBuilder().setLex(lex).setLangtag(langtag).build(); + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGLiteral(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGLiteral(toProto()); + } + } + + /** + * Represents literal terms with lexical values and datatype identifiers. 
+ * + * @param lex The lexical value + * @param datatype The datatype identifier + */ + record DtLiteral(String lex, int datatype) implements LiteralTerm { + public RdfLiteral toProto() { + return RdfLiteral.newBuilder().setLex(lex).setDatatype(datatype).build(); + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGLiteral(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGLiteral(toProto()); + } + } + + /** + * Represents simple literal terms with lexical values. 
+ * + * @param lex The lexical value + */ + record SimpleLiteral(String lex) implements LiteralTerm { + public RdfLiteral toProto() { + return RdfLiteral.newBuilder().setLex(lex).build(); + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSLiteral(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPLiteral(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOLiteral(toProto()); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGLiteral(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGLiteral(toProto()); + } + } + + /** + * Represents RDF triples with subject, predicate, and object terms. 
+ * + * @param subject The subject term + * @param predicate The predicate term + * @param object The object term + */ + record Triple(SpoTerm subject, SpoTerm predicate, SpoTerm object) implements SpoTerm { + public RdfTriple toProto() { + final var tripleBuilder = RdfTriple.newBuilder(); + + if (subject != null) { + subject.writeSubject(tripleBuilder); + } + + if (predicate != null) { + predicate.writePredicate(tripleBuilder); + } + + if (object != null) { + object.writeObject(tripleBuilder); + } + + return tripleBuilder.build(); + } + + @Override + public void writeSubject(RdfTriple.Builder builder) { + builder.setSTripleTerm(toProto()); + } + + @Override + public void writeSubject(RdfQuad.Builder builder) { + builder.setSTripleTerm(toProto()); + } + + @Override + public void writePredicate(RdfTriple.Builder builder) { + builder.setPTripleTerm(toProto()); + } + + @Override + public void writePredicate(RdfQuad.Builder builder) { + builder.setPTripleTerm(toProto()); + } + + @Override + public void writeObject(RdfTriple.Builder builder) { + builder.setOTripleTerm(toProto()); + } + + @Override + public void writeObject(RdfQuad.Builder builder) { + builder.setOTripleTerm(toProto()); + } + } + + /** + * Represents graph start markers with optional graph labels. + * + * @param graph The graph label term + */ + record GraphStart(GraphTerm graph) implements GraphMarkerTerm { + public RdfGraphStart toProto() { + final var graphBuilder = RdfGraphStart.newBuilder(); + + if (graph != null) { + graph.writeGraph(graphBuilder); + } + + return graphBuilder.build(); + } + } + + /** + * Represents graph end markers. + */ + record GraphEnd() implements GraphMarkerTerm { + public RdfGraphEnd toProto() { + return RdfGraphEnd.getDefaultInstance(); + } + } + + /** + * Represents default graph markers. 
+ */ + record DefaultGraph() implements GraphMarkerOrGraphTerm { + public static final DefaultGraph INSTANCE = new DefaultGraph(); + + public RdfDefaultGraph toProto() { + return RdfDefaultGraph.getDefaultInstance(); + } + + @Override + public void writeGraph(RdfGraphStart.Builder builder) { + builder.setGDefaultGraph(toProto()); + } + + @Override + public void writeGraph(RdfQuad.Builder builder) { + builder.setGDefaultGraph(toProto()); + } + } + + /** + * Represents RDF quads with subject, predicate, object, and graph terms. + * + * @param subject The subject term + * @param predicate The predicate term + * @param object The object term + * @param graph The graph term + */ + record Quad(SpoTerm subject, SpoTerm predicate, SpoTerm object, GraphTerm graph) implements RdfTerm { + public RdfQuad toProto() { + final var quadBuilder = RdfQuad.newBuilder(); + + if (subject != null) { + subject.writeSubject(quadBuilder); + } + + if (predicate != null) { + predicate.writePredicate(quadBuilder); + } + + if (object != null) { + object.writeObject(quadBuilder); + } + + if (graph != null) { + graph.writeGraph(quadBuilder); + } + + return quadBuilder.build(); + } + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/RowBufferAppender.java b/core-java/src/main/java/eu/neverblink/jelly/core/RowBufferAppender.java new file mode 100644 index 000000000..0d7d0b860 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/RowBufferAppender.java @@ -0,0 +1,16 @@ +package eu.neverblink.jelly.core; + +import eu.neverblink.jelly.core.proto.v1.RdfDatatypeEntry; +import eu.neverblink.jelly.core.proto.v1.RdfNameEntry; +import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry; + +/** + * Interface for appending lookup entries to the row buffer. + *

+ * This is used by NodeEncoder. + */ +public interface RowBufferAppender { + void appendNameEntry(RdfNameEntry nameEntry); + void appendPrefixEntry(RdfPrefixEntry prefixEntry); + void appendDatatypeEntry(RdfDatatypeEntry datatypeEntry); +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/DecoderLookup.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/DecoderLookup.java new file mode 100644 index 000000000..7eabf9903 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/DecoderLookup.java @@ -0,0 +1,44 @@ +package eu.neverblink.jelly.core.internal; + +/** + * Simple, array-based lookup for the protobuf decoder. + * @param type of the value + */ +public class DecoderLookup { + + private int lastSetId = -1; + private final T[] lookup; + + /** + * Create a new decoder lookup table. + * @param maxEntries maximum number of entries + */ + @SuppressWarnings("unchecked") + public DecoderLookup(int maxEntries) { + this.lookup = (T[]) new Object[maxEntries]; + } + + /** + * @param id 1-based. 0 signifies an id that is larger by 1 than the last set id. 
+ * @param v value + * @throws ArrayIndexOutOfBoundsException if id < 0 or id > maxEntries + */ + public void update(int id, T v) { + if (id == 0) { + lastSetId += 1; + } else { + lastSetId = id - 1; + } + + lookup[lastSetId] = v; + } + + /** + * @param id 1-based + * @return value + * @throws ArrayIndexOutOfBoundsException if id < 1 or id > maxEntries + */ + public T get(int id) { + return lookup[id - 1]; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/EncoderLookup.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/EncoderLookup.java new file mode 100644 index 000000000..e60c8f503 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/EncoderLookup.java @@ -0,0 +1,211 @@ +package eu.neverblink.jelly.core.internal; + +import java.util.HashMap; +import java.util.Objects; + +/** + * A lookup table for NodeEncoder, used for indexing datatypes, IRI prefixes, and IRI names. + * This is a very efficient implementation of an LRU cache that uses as few allocations as possible. + * The table is implemented as a doubly linked list in an array. + */ +final class EncoderLookup { + + /** + * Represents an entry in the lookup table. + */ + static final class LookupEntry { + + /** The ID of the entry used for referencing it from RdfIri and RdfLiteral objects. */ + public int getId; + /** The ID of the entry used for adding the lookup entry to the RDF stream. */ + public int setId; + /** Whether this entry is a new entry. */ + public boolean newEntry; + + public LookupEntry(int getId, int setId) { + this.getId = getId; + this.setId = setId; + } + + public LookupEntry(int getId, int setId, boolean newEntry) { + this.getId = getId; + this.setId = setId; + this.newEntry = newEntry; + } + } + + /** The lookup hash map */ + private final HashMap map = new HashMap<>(); + + /** + * The doubly-linked list of entries, with 1-based indexing. + * Each entry is represented by two integers: left and right. 
+ * The head pointer is in table[1]. + * The first valid entry is in table[2] – table[3]. + */ + private final int[] table; + + /** + * The serial numbers of the entries, incremented each time the entry is replaced in the table. + * This could theoretically overflow and cause bogus cache hits, but it's enormously + * unlikely to happen in practice. I can buy a beer for anyone who can construct an RDF dataset that + * causes this to happen. + */ + final int[] serials; + + // Tail pointer for the table. + private int tail; + // Maximum size of the lookup. + final int size; + // Current size of the lookup (how many entries are used). + // This will monotonically increase until it reaches the maximum size. + private int used; + // The last id that was set in the table. + private int lastSetId = -1000; + // Names of the entries. Entry 0 is always null. + private final String[] names; + // Whether to maintain serial numbers for the entries. + private final boolean useSerials; + + private final LookupEntry entryForReturns = new LookupEntry(0, 0, true); + + public EncoderLookup(int size, boolean useSerials) { + this.size = size; + table = new int[(size + 1) * 2]; + names = new String[size + 1]; + this.useSerials = useSerials; + if (useSerials) { + serials = new int[size + 1]; + // Set the head's serial to non-zero value, so that default-initialized DependentNodes are not + // accidentally considered as valid entries. + serials[0] = -1; + } else { + serials = null; + } + } + + /** + * To be called after an entry is accessed (used). + * This moves the entry to the front of the list to prevent it from being evicted. + * @param id The ID of the entry that was accessed. 
+ */ + public void onAccess(int id) { + int base = id * 2; + if (base == tail) { + return; + } + int left = table[base]; + int right = table[base + 1]; + // Set our left to the tail + table[base] = tail; + // Set left's right to our right + table[left + 1] = right; + // Set right's left to our left + table[right] = left; + // Set the tail's right to us + table[tail + 1] = base; + // Update the tail + tail = base; + } + + /** + * One branch of the getOrAddEntry method. Should be inlined by the JIT. + * @param key The key of the entry. + * @param id The ID of the entry. + */ + private void addEntrySequential(String key, int id) { + int base = id * 2; + // Set the left to the tail + table[base] = tail; + // Right is already 0 + // table[base + 1] = 0; + // Set the tail's right to us + table[tail + 1] = base; + tail = base; + names[id] = key; + map.put(key, new LookupEntry(id, id)); + } + + /** + * Another branch of the getOrAddEntry method. Should be inlined by the JIT. + * @param key The key of the entry. + * @param id The ID of the entry. + */ + private void addEntryEvicting(String key, int id) { + // Remove the entry from the map + LookupEntry oldEntry = map.remove(names[id]); + // Insert the new entry + names[id] = key; + map.put(key, oldEntry); + // Update the table + onAccess(id); + entryForReturns.setId = lastSetId + 1 == id ? 0 : id; + // We only update lastSetId in this case, because in the sequential case we don't check it anyway + lastSetId = id; + } + + /** + * Adds a new entry to the lookup table or retrieves it if it already exists. + * @param key The key of the entry. + * @return The entry. + */ + public LookupEntry getOrAddEntry(String key) { + final var value = map.get(key); + if (value != null) { + // The entry is already in the table, just update the access order + onAccess(value.getId); + return value; + } + int id; + if (used < size) { + // We still have space in the table, add a new entry to the end of the table. 
+ id = ++used; + addEntrySequential(key, id); + } else { + // The table is full, evict the least recently used entry. + id = table[1] / 2; + addEntryEvicting(key, id); + } + if (this.useSerials) { + // Increment the serial number + // We save some memory accesses by not doing this if the serials are not used. + // The if should be very predictable and have no negative performance impact. + ++Objects.requireNonNull(serials)[id]; + } + entryForReturns.getId = id; + return entryForReturns; + } + + /** + * A variant of getOrAddEntry that is used for transcoders. + * This method does not update the serial number of the entry because serials are not used by transcoders. + * @param key The key of the entry. + * @param evictHint A hint for the entry to evict. If 0, the least recently used entry is evicted. + * @return The entry. + */ + public LookupEntry getOrAddEntryTranscoder(String key, int evictHint) { + final var value = map.get(key); + if (value != null) { + onAccess(value.getId); + return value; + } + int id; + if (used < size) { + id = ++used; + addEntrySequential(key, id); + } else { + // The table is full + if (evictHint != 0) { + // We have a hint for the entry to evict + id = evictHint; + } else { + // Evict the least recently used entry. + id = table[1] / 2; + } + addEntryEvicting(key, id); + } + // Serials are not used for transcoders + entryForReturns.getId = id; + return entryForReturns; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/LastNodeHolder.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/LastNodeHolder.java new file mode 100644 index 000000000..4cc080a70 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/LastNodeHolder.java @@ -0,0 +1,13 @@ +package eu.neverblink.jelly.core.internal; + +/** + * Tiny mutable holder for the last node that occurred as S, P, O, or G. 
+ * @param the type of the node + */ +public class LastNodeHolder { + + /** + * null indicates that there was no value for this node yet. + */ + TNode node = null; +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/NameDecoderImpl.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/NameDecoderImpl.java new file mode 100644 index 000000000..8055153b5 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/NameDecoderImpl.java @@ -0,0 +1,184 @@ +package eu.neverblink.jelly.core.internal; + +import eu.neverblink.jelly.core.NameDecoder; +import eu.neverblink.jelly.core.RdfProtoDeserializationError; +import eu.neverblink.jelly.core.proto.v1.RdfNameEntry; +import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry; +import java.util.function.Function; + +/** + * Class for decoding RDF IRIs from their Jelly representation. + * + * @param The type of the IRI in the target RDF library. + */ +final class NameDecoderImpl implements NameDecoder { + + private static final class NameLookupEntry { + + // Primary: the actual name + public String name; + // Secondary values (may be mutated without invalidating the primary value) + // Reference to the last prefix ID used to encode the IRI with this name + public int lastPrefixId; + // Serial number of the last prefix ID used to encode the IRI with this name + public int lastPrefixSerial; + // Last IRI encoded with this name + public Object lastIri; + } + + private static final class PrefixLookupEntry { + + public String prefix; + public int serial = -1; + } + + private final NameLookupEntry[] nameLookup; + private final PrefixLookupEntry[] prefixLookup; + + private int lastPrefixIdReference = 0; + private int lastNameIdReference = 0; + + private int lastPrefixIdSet = 0; + private int lastNameIdSet = 0; + + private final Function iriFactory; + + /** + * Creates a new NameDecoder. + * + * @param prefixTableSize The size of the prefix lookup table. 
+ * @param nameTableSize The size of the name lookup table. + * @param iriFactory A function that creates an IRI from a string. + */ + public NameDecoderImpl(int prefixTableSize, int nameTableSize, Function iriFactory) { + this.iriFactory = iriFactory; + nameLookup = new NameLookupEntry[nameTableSize + 1]; + prefixLookup = new PrefixLookupEntry[prefixTableSize + 1]; + + for (int i = 1; i < nameTableSize + 1; i++) { + nameLookup[i] = new NameLookupEntry(); + } + for (int i = 1; i < prefixTableSize + 1; i++) { + prefixLookup[i] = new PrefixLookupEntry(); + } + } + + /** + * Update the name table with a new entry. + * + * @param nameEntry name row + * @throws RdfProtoDeserializationError if the identifier is out of bounds + */ + @Override + public void updateNames(RdfNameEntry nameEntry) { + int id = nameEntry.getId(); + // Branchless! Equivalent to: + // if (id == 0) lastNameIdSet++; + // else lastNameIdSet = id; + // Same code is used in the methods below. + lastNameIdSet = ((lastNameIdSet + 1) & ((id - 1) >> 31)) + id; + try { + NameLookupEntry entry = nameLookup[lastNameIdSet]; + entry.name = nameEntry.getValue(); + // Enough to invalidate the last IRI – we don't have to touch the serial number. + entry.lastPrefixId = 0; + // Set to null is required to avoid a false positive in the decode method for cases without a prefix. + entry.lastIri = null; + } catch (ArrayIndexOutOfBoundsException | NullPointerException e) { + throw new RdfProtoDeserializationError( + "Name entry with ID %d is out of bounds of the name lookup table.".formatted(id) + ); + } + } + + /** + * Update the prefix table with a new entry. 
+ * + * @param prefixEntry prefix row + * @throws RdfProtoDeserializationError if the identifier is out of bounds + */ + @Override + public void updatePrefixes(RdfPrefixEntry prefixEntry) { + int id = prefixEntry.getId(); + lastPrefixIdSet = ((lastPrefixIdSet + 1) & ((id - 1) >> 31)) + id; + try { + PrefixLookupEntry entry = prefixLookup[lastPrefixIdSet]; + entry.prefix = prefixEntry.getValue(); + entry.serial++; + } catch (ArrayIndexOutOfBoundsException | NullPointerException e) { + throw new RdfProtoDeserializationError( + "Prefix entry with ID %d is out of bounds of the prefix lookup table.".formatted(id) + ); + } + } + + /** + * Reconstruct an IRI from its prefix and name ids. + * + * @param prefixId prefix ID + * @param nameId name ID + * @return full IRI combining the prefix and the name + * @throws RdfProtoDeserializationError if the IRI reference is invalid + * @throws NullPointerException if the IRI reference is invalid + */ + @SuppressWarnings("unchecked") + @Override + public TIri decode(int prefixId, int nameId) { + final var originalPrefixId = prefixId; + + lastNameIdReference = ((lastNameIdReference + 1) & ((nameId - 1) >> 31)) + nameId; + NameLookupEntry nameEntry; + try { + nameEntry = nameLookup[lastNameIdReference]; + } catch (ArrayIndexOutOfBoundsException e) { + throw new RdfProtoDeserializationError( + ("Encountered an invalid name table reference (out of bounds). 
" + + "Name ID: %d, Prefix ID: %d").formatted(nameId, originalPrefixId) + ); + } + + // Branchless way to update the prefix ID + // Equivalent to: + // if (prefixId == 0) prefixId = lastPrefixIdReference; + // else lastPrefixIdReference = prefixId; + lastPrefixIdReference = prefixId = (((prefixId - 1) >> 31) & lastPrefixIdReference) + prefixId; + if (prefixId != 0) { + // Name and prefix + PrefixLookupEntry prefixEntry; + try { + prefixEntry = prefixLookup[prefixId]; + } catch (ArrayIndexOutOfBoundsException e) { + throw new RdfProtoDeserializationError( + ("Encountered an invalid prefix table reference (out of bounds). " + + "Prefix ID: %d, Name ID: %d").formatted(prefixId, nameId) + ); + } + if (nameEntry.lastPrefixId != prefixId || nameEntry.lastPrefixSerial != prefixEntry.serial) { + // Update the last prefix + nameEntry.lastPrefixId = prefixId; + nameEntry.lastPrefixSerial = prefixEntry.serial; + // And compute a new IRI + nameEntry.lastIri = iriFactory.apply(prefixEntry.prefix.concat(nameEntry.name)); + return (TIri) nameEntry.lastIri; + } + if (nameEntry.lastIri == null) { + throw new RdfProtoDeserializationError( + "Encountered an invalid IRI reference. Prefix ID: %d, Name ID: %d".formatted( + originalPrefixId, + nameId + ) + ); + } + } else if (nameEntry.lastIri == null) { + if (nameEntry.name == null) { + throw new RdfProtoDeserializationError( + "Encountered an invalid IRI reference. 
No prefix, Name ID: %d".formatted(nameId) + ); + } + // Name only, no need to check the prefix lookup + nameEntry.lastIri = iriFactory.apply(nameEntry.name); + } + + return (TIri) nameEntry.lastIri; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/NodeEncoderImpl.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/NodeEncoderImpl.java new file mode 100644 index 000000000..79c42a3ac --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/NodeEncoderImpl.java @@ -0,0 +1,297 @@ +package eu.neverblink.jelly.core.internal; + +import eu.neverblink.jelly.core.NodeEncoder; +import eu.neverblink.jelly.core.RdfProtoSerializationError; +import eu.neverblink.jelly.core.RdfTerm; +import eu.neverblink.jelly.core.RowBufferAppender; +import eu.neverblink.jelly.core.proto.v1.RdfDatatypeEntry; +import eu.neverblink.jelly.core.proto.v1.RdfNameEntry; +import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; +import java.util.LinkedHashMap; +import java.util.Objects; + +/** + * Encodes RDF nodes native to the used RDF library (e.g., Apache Jena, RDF4J) into Jelly's protobuf objects. + * This class performs a lot of caching to avoid encoding the same node multiple times. It is absolutely NOT + * thread-safe, and should only be ever used by a single instance of ProtoEncoder. + * + * @param The type of RDF nodes used by the RDF library. + */ +final class NodeEncoderImpl implements NodeEncoder { + + /** + * A cached node that depends on other lookups (RdfIri and RdfLiteral in the datatype variant). + */ + static final class DependentNode { + + // The actual cached node + public RdfTerm encoded; + // 1: datatypes and IRI names + // The pointer is the index in the lookup table, the serial is the serial number of the entry. + // The serial in the lookup table must be equal to the serial here for the entry to be valid. 
+ public int lookupPointer1; + public int lookupSerial1; + // 2: IRI prefixes + public int lookupPointer2; + public int lookupSerial2; + } + + /** + * A simple LRU cache for already encoded nodes. + * @param Key type + * @param Value type + */ + private static final class NodeCache extends LinkedHashMap { + + private final int maxSize; + + public NodeCache(int maxSize) { + this.maxSize = maxSize; + } + + @Override + protected boolean removeEldestEntry(java.util.Map.Entry eldest) { + return size() > maxSize; + } + } + + private final int maxPrefixTableSize; + private int lastIriNameId; + private int lastIriPrefixId = -1000; + + private final EncoderLookup datatypeLookup; + private final EncoderLookup prefixLookup; + private final EncoderLookup nameLookup; + + private final RowBufferAppender bufferAppender; + + // We split the node caches in three – the first two are for nodes that depend on the lookups + // (IRIs and datatype literals). The third one is for nodes that don't depend on the lookups. + private final NodeCache iriNodeCache; + private final NodeCache dtLiteralNodeCache; + private final NodeCache nodeCache; + + // Pre-allocated IRI that has prefixId=0 and nameId=0 + static final RdfTerm.Iri zeroIri = new RdfTerm.Iri(0, 0); + // Pre-allocated IRIs that have prefixId=0 + private final RdfTerm.Iri[] nameOnlyIris; + + /** + * Creates a new NodeEncoder. 
+ * @param prefixTableSize The size of the prefix lookup table + * @param nameTableSize The size of the name lookup table + * @param dtTableSize The size of the datatype lookup table + * @param nodeCacheSize The size of the node cache (for nodes that don't depend on lookups) + * @param iriNodeCacheSize The size of the IRI dependent node cache (for prefix+name encoding) + * @param dtLiteralNodeCacheSize The size of the datatype literal dependent node cache + * @param bufferAppender consumer of the lookup entry rows + */ + public NodeEncoderImpl( + int prefixTableSize, + int nameTableSize, + int dtTableSize, + int nodeCacheSize, + int iriNodeCacheSize, + int dtLiteralNodeCacheSize, + RowBufferAppender bufferAppender + ) { + datatypeLookup = new EncoderLookup(dtTableSize, true); + this.maxPrefixTableSize = prefixTableSize; + if (maxPrefixTableSize > 0) { + prefixLookup = new EncoderLookup(maxPrefixTableSize, true); + iriNodeCache = new NodeCache<>(iriNodeCacheSize); + } else { + prefixLookup = null; + iriNodeCache = null; + } + nameOnlyIris = new RdfTerm.Iri[nameTableSize + 1]; + for (int i = 0; i < nameOnlyIris.length; i++) { + nameOnlyIris[i] = new RdfTerm.Iri(0, i); + } + dtLiteralNodeCache = new NodeCache<>(dtLiteralNodeCacheSize); + nameLookup = new EncoderLookup(nameTableSize, maxPrefixTableSize > 0); + nodeCache = new NodeCache<>(nodeCacheSize); + this.bufferAppender = bufferAppender; + } + + /** + * Create a new NodeEncoder using the default cache size heuristics from the options. 
+ * @param options The options to use + * @param bufferAppender The buffer appender to use + */ + public static NodeEncoder create(RdfStreamOptions options, RowBufferAppender bufferAppender) { + return new NodeEncoderImpl<>( + options.getMaxPrefixTableSize(), + options.getMaxNameTableSize(), + options.getMaxDatatypeTableSize(), + Math.max(Math.min(options.getMaxNameTableSize(), 1024), 256), + options.getMaxNameTableSize(), + Math.max(Math.min(options.getMaxNameTableSize(), 1024), 256), + bufferAppender + ); + } + + /** + * Encodes an IRI using two layers of caching – both for the entire IRI, and the prefix and name tables. + * @param iri The IRI to encode + * @return The encoded IRI + */ + @Override + public RdfTerm.Iri makeIri(String iri) { + if (maxPrefixTableSize == 0) { + // Fast path for no prefixes + final var nameEntry = nameLookup.getOrAddEntry(iri); + if (nameEntry.newEntry) { + bufferAppender.appendNameEntry(RdfNameEntry.newBuilder().setId(nameEntry.setId).setValue(iri).build()); + } + int nameId = nameEntry.getId; + if (lastIriNameId + 1 == nameId) { + lastIriNameId = nameId; + return zeroIri; + } else { + lastIriNameId = nameId; + return nameOnlyIris[nameId]; + } + } + + // Slow path, with splitting out the prefix + final var cachedNode = Objects.requireNonNull(iriNodeCache).computeIfAbsent(iri, k -> new DependentNode()); + // Check if the value is still valid + if ( + cachedNode.encoded != null && + cachedNode.lookupSerial1 == Objects.requireNonNull(nameLookup.serials)[cachedNode.lookupPointer1] && + cachedNode.lookupSerial2 == + Objects.requireNonNull(Objects.requireNonNull(prefixLookup).serials)[cachedNode.lookupPointer2] + ) { + nameLookup.onAccess(cachedNode.lookupPointer1); + prefixLookup.onAccess(cachedNode.lookupPointer2); + return outputIri(cachedNode); + } + + int i = iri.indexOf('#', 8); + String prefix; + String postfix; + if (i == -1) { + i = iri.lastIndexOf('/'); + if (i != -1) { + prefix = iri.substring(0, i + 1); + postfix = 
iri.substring(i + 1); + } else { + prefix = ""; + postfix = iri; + } + } else { + prefix = iri.substring(0, i + 1); + postfix = iri.substring(i + 1); + } + + final var prefixEntry = Objects.requireNonNull(prefixLookup).getOrAddEntry(prefix); + final var nameEntry = nameLookup.getOrAddEntry(postfix); + if (prefixEntry.newEntry) { + bufferAppender.appendPrefixEntry( + RdfPrefixEntry.newBuilder().setId(prefixEntry.setId).setValue(prefix).build() + ); + } + if (nameEntry.newEntry) { + bufferAppender.appendNameEntry(RdfNameEntry.newBuilder().setId(nameEntry.setId).setValue(postfix).build()); + } + int nameId = nameEntry.getId; + int prefixId = prefixEntry.getId; + cachedNode.lookupPointer1 = nameId; + cachedNode.lookupSerial1 = Objects.requireNonNull(nameLookup.serials)[nameId]; + cachedNode.lookupPointer2 = prefixId; + cachedNode.lookupSerial2 = Objects.requireNonNull(prefixLookup.serials)[prefixId]; + cachedNode.encoded = new RdfTerm.Iri(prefixId, nameId); + return outputIri(cachedNode); + } + + @Override + public RdfTerm.BNode makeBlankNode(String label) { + return (RdfTerm.BNode) nodeCache.computeIfAbsent(label, k -> new RdfTerm.BNode(label)); + } + + @Override + public RdfTerm.SimpleLiteral makeSimpleLiteral(String lex) { + return (RdfTerm.SimpleLiteral) nodeCache.computeIfAbsent(lex, k -> new RdfTerm.SimpleLiteral(lex)); + } + + @Override + public RdfTerm.LanguageLiteral makeLangLiteral(TNode lit, String lex, String lang) { + return (RdfTerm.LanguageLiteral) nodeCache.computeIfAbsent(lit, k -> new RdfTerm.LanguageLiteral(lex, lang)); + } + + /** + * Encodes a datatype literal using two layers of caching – both for the entire literal, and the datatype name. 
+ * @param key The literal key (the unencoded literal node) + * @param lex The lexical form of the literal + * @param datatypeName The name of the datatype + * @return The encoded literal + */ + @Override + public RdfTerm.DtLiteral makeDtLiteral(TNode key, String lex, String datatypeName) { + if (datatypeLookup.size == 0) { + throw new RdfProtoSerializationError( + "Datatype literals cannot be " + + "encoded when the datatype table is disabled. Set the datatype table size " + + "to a positive value." + ); + } + final var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode()); + // Check if the value is still valid + if ( + cachedNode.encoded != null && + cachedNode.lookupSerial1 == Objects.requireNonNull(datatypeLookup.serials)[cachedNode.lookupPointer1] + ) { + datatypeLookup.onAccess(cachedNode.lookupPointer1); + return (RdfTerm.DtLiteral) cachedNode.encoded; + } + + // The node is not encoded, but we may already have the datatype encoded + final var dtEntry = datatypeLookup.getOrAddEntry(datatypeName); + if (dtEntry.newEntry) { + bufferAppender.appendDatatypeEntry( + RdfDatatypeEntry.newBuilder().setId(dtEntry.setId).setValue(datatypeName).build() + ); + } + int dtId = dtEntry.getId; + cachedNode.lookupPointer1 = dtId; + cachedNode.lookupSerial1 = Objects.requireNonNull(datatypeLookup.serials)[dtId]; + cachedNode.encoded = new RdfTerm.DtLiteral(lex, dtId); + + return (RdfTerm.DtLiteral) cachedNode.encoded; + } + + @Override + public RdfTerm.Triple makeQuotedTriple(RdfTerm.SpoTerm s, RdfTerm.SpoTerm p, RdfTerm.SpoTerm o) { + return new RdfTerm.Triple(s, p, o); + } + + /** + * Helper function to output an IRI from a cached node using same-prefix and next-name optimizations. 
+ * @param cachedNode The cached node + * @return The encoded IRI + */ + private RdfTerm.Iri outputIri(DependentNode cachedNode) { + int nameId = cachedNode.lookupPointer1; + int prefixId = cachedNode.lookupPointer2; + if (lastIriPrefixId == prefixId) { + if (lastIriNameId + 1 == nameId) { + lastIriNameId = nameId; + return zeroIri; + } else { + lastIriNameId = nameId; + return nameOnlyIris[nameId]; + } + } else { + lastIriPrefixId = prefixId; + if (lastIriNameId + 1 == nameId) { + lastIriNameId = nameId; + return new RdfTerm.Iri(prefixId, 0); + } else { + lastIriNameId = nameId; + return (RdfTerm.Iri) cachedNode.encoded; + } + } + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderBase.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderBase.java new file mode 100644 index 000000000..70d5204ce --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderBase.java @@ -0,0 +1,158 @@ +package eu.neverblink.jelly.core.internal; + +import eu.neverblink.jelly.core.NameDecoder; +import eu.neverblink.jelly.core.ProtoDecoderConverter; +import eu.neverblink.jelly.core.RdfProtoDeserializationError; +import eu.neverblink.jelly.core.RdfTerm; + +/** + * Base trait for Jelly proto decoders. Only for internal use. 
+ * @param type of RDF nodes in the library + * @param type of the datatype in the library + */ +public abstract class ProtoDecoderBase { + + protected final ProtoDecoderConverter converter; + protected final NameDecoder nameDecoder; + protected final DecoderLookup datatypeLookup; + + protected final LastNodeHolder lastSubject = new LastNodeHolder<>(); + protected final LastNodeHolder lastPredicate = new LastNodeHolder<>(); + protected final LastNodeHolder lastObject = new LastNodeHolder<>(); + protected final LastNodeHolder lastGraph = new LastNodeHolder<>(); + + protected ProtoDecoderBase(ProtoDecoderConverter converter) { + this.converter = converter; + this.nameDecoder = new NameDecoderImpl<>(getPrefixTableSize(), getNameTableSize(), converter::makeIriNode); + this.datatypeLookup = new DecoderLookup<>(getDatatypeTableSize()); + } + + protected abstract int getNameTableSize(); + + protected abstract int getPrefixTableSize(); + + protected abstract int getDatatypeTableSize(); + + /** + * Convert a GraphTerm message to a node. 
+ * @param graph graph term to convert + * @return converted node + * @throws RdfProtoDeserializationError if the graph term can't be decoded + */ + protected final TNode convertGraphTerm(RdfTerm.GraphTerm graph) { + try { + if (graph == null) { + throw new RdfProtoDeserializationError("Empty graph term encountered in a GRAPHS stream."); + } else if (graph instanceof RdfTerm.Iri iri) { + return nameDecoder.decode(iri.prefixId(), iri.nameId()); + } else if (graph instanceof RdfTerm.DefaultGraph) { + return converter.makeDefaultGraphNode(); + } else if (graph instanceof RdfTerm.BNode bnode) { + return converter.makeBlankNode(bnode.bNode()); + } else if (graph instanceof RdfTerm.LanguageLiteral languageLiteral) { + return converter.makeLangLiteral(languageLiteral.lex(), languageLiteral.langtag()); + } else if (graph instanceof RdfTerm.DtLiteral dtLiteral) { + return converter.makeDtLiteral(dtLiteral.lex(), datatypeLookup.get(dtLiteral.datatype())); + } else if (graph instanceof RdfTerm.SimpleLiteral simpleLiteral) { + return converter.makeSimpleLiteral(simpleLiteral.lex()); + } else { + throw new RdfProtoDeserializationError("Unknown graph term type."); + } + } catch (Exception e) { + throw new RdfProtoDeserializationError("Error while decoding graph term %s".formatted(e), e); + } + } + + /** + * Convert a SpoTerm message to a node. 
+ * @param term term to convert + * @throws RdfProtoDeserializationError if the term can't be decoded + */ + protected final TNode convertTerm(RdfTerm.SpoTerm term) { + try { + if (term == null) { + throw new RdfProtoDeserializationError("Term value is not set inside a quoted triple."); + } else if (term instanceof RdfTerm.Iri iri) { + return nameDecoder.decode(iri.prefixId(), iri.nameId()); + } else if (term instanceof RdfTerm.BNode bnode) { + return converter.makeBlankNode(bnode.bNode()); + } else if (term instanceof RdfTerm.LanguageLiteral languageLiteral) { + return converter.makeLangLiteral(languageLiteral.lex(), languageLiteral.langtag()); + } else if (term instanceof RdfTerm.DtLiteral dtLiteral) { + return converter.makeDtLiteral(dtLiteral.lex(), datatypeLookup.get(dtLiteral.datatype())); + } else if (term instanceof RdfTerm.SimpleLiteral simpleLiteral) { + return converter.makeSimpleLiteral(simpleLiteral.lex()); + } else if (term instanceof RdfTerm.Triple triple) { + return converter.makeTripleNode( + convertTerm(triple.subject()), + convertTerm(triple.predicate()), + convertTerm(triple.object()) + ); + } else { + throw new RdfProtoDeserializationError("Unknown term type."); + } + } catch (Exception e) { + throw new RdfProtoDeserializationError("Error while decoding term %s".formatted(e), e); + } + } + + /** + * Convert a subject SpoTerm message to a node, while respecting repeated terms. + * @param subject term to convert + * @return converted node + */ + protected final TNode convertSubjectTermWrapped(RdfTerm.SpoTerm subject) { + return convertSpoTermWrapped(subject, lastSubject); + } + + /** + * Convert a predicate SpoTerm message to a node, while respecting repeated terms. 
+ * @param predicate term to convert + * @return converted node + */ + protected final TNode convertPredicateTermWrapped(RdfTerm.SpoTerm predicate) { + return convertSpoTermWrapped(predicate, lastPredicate); + } + + /** + * Convert an object SpoTerm message to a node, while respecting repeated terms. + * @param object term to convert + * @return converted node + */ + protected final TNode convertObjectTermWrapped(RdfTerm.SpoTerm object) { + return convertSpoTermWrapped(object, lastObject); + } + + /** + * Convert a GraphTerm message to a node, while respecting repeated terms. + * @param graph graph term to convert + * @return converted node + */ + protected final TNode convertGraphTermWrapped(RdfTerm.GraphTerm graph) { + if (graph == null && lastGraph.node == null) { + throw new RdfProtoDeserializationError("Empty term without previous graph term."); + } + + if (graph == null) { + return lastGraph.node; + } + + final var node = convertGraphTerm(graph); + lastGraph.node = node; + return node; + } + + private TNode convertSpoTermWrapped(RdfTerm.SpoTerm term, LastNodeHolder lastNodeHolder) { + if (term == null && lastNodeHolder.node == null) { + throw new RdfProtoDeserializationError("Empty term without previous term."); + } + + if (term == null) { + return lastNodeHolder.node; + } + + final var node = convertTerm(term); + lastNodeHolder.node = node; + return node; + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderImpl.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderImpl.java new file mode 100644 index 000000000..b4b98852f --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoDecoderImpl.java @@ -0,0 +1,429 @@ +package eu.neverblink.jelly.core.internal; + +import static eu.neverblink.jelly.core.JellyOptions.*; + +import eu.neverblink.jelly.core.*; +import eu.neverblink.jelly.core.proto.v1.LogicalStreamType; +import eu.neverblink.jelly.core.proto.v1.PhysicalStreamType; 
+import eu.neverblink.jelly.core.proto.v1.RdfDatatypeEntry; +import eu.neverblink.jelly.core.proto.v1.RdfGraphStart; +import eu.neverblink.jelly.core.proto.v1.RdfNamespaceDeclaration; +import eu.neverblink.jelly.core.proto.v1.RdfQuad; +import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions; +import eu.neverblink.jelly.core.proto.v1.RdfStreamRow; +import eu.neverblink.jelly.core.proto.v1.RdfTriple; + +/** + * Base class for stateful decoders of protobuf RDF streams. + * + * @param the type of the node + * @param the type of the datatype + * @see ProtoDecoder the base (extendable) interface. + * @see ProtoDecoderBase for common methods shared by all decoders. + */ +public sealed class ProtoDecoderImpl extends ProtoDecoder { + + protected final RdfHandler protoHandler; + protected final RdfStreamOptions supportedOptions; + + private RdfStreamOptions currentOptions = null; + + public ProtoDecoderImpl( + ProtoDecoderConverter converter, + RdfHandler protoHandler, + RdfStreamOptions supportedOptions + ) { + super(converter); + this.protoHandler = protoHandler; + this.supportedOptions = supportedOptions; + } + + /** + * Returns the size of the name table. + * + * @return the size of the name table if options are set, otherwise the default size + */ + @Override + protected int getNameTableSize() { + if (currentOptions == null) { + return SMALL_NAME_TABLE_SIZE; + } + + return currentOptions.getMaxNameTableSize(); + } + + /** + * Returns the size of the prefix table. + * + * @return the size of the prefix table if options are set, otherwise the default size + */ + @Override + protected int getPrefixTableSize() { + if (currentOptions == null) { + return SMALL_PREFIX_TABLE_SIZE; + } + + return currentOptions.getMaxPrefixTableSize(); + } + + /** + * Returns the size of the datatype table. 
+ * + * @return the size of the datatype table if options are set, otherwise the default size + */ + @Override + protected int getDatatypeTableSize() { + if (currentOptions == null) { + return SMALL_DT_TABLE_SIZE; + } + + return currentOptions.getMaxDatatypeTableSize(); + } + + /** + * Returns the received stream options from the producer. + * + * @return the stream options if set, otherwise null + */ + @Override + public RdfStreamOptions getStreamOptions() { + return currentOptions; + } + + private void setStreamOptions(RdfStreamOptions options) { + if (currentOptions != null) { + return; + } + + this.currentOptions = options; + } + + @Override + public void ingestRow(RdfStreamRow row) { + if (row == null) { + throw new RdfProtoDeserializationError("Row kind is not set."); + } + + switch (row.getRowCase()) { + case OPTIONS -> handleOptions(row.getOptions()); + case NAME -> nameDecoder.updateNames(row.getName()); + case PREFIX -> nameDecoder.updatePrefixes(row.getPrefix()); + case DATATYPE -> handleDatatype(row.getDatatype()); + case NAMESPACE -> handleNamespace(row.getNamespace()); + case TRIPLE -> handleTriple(row.getTriple()); + case QUAD -> handleQuad(row.getQuad()); + case GRAPH_START -> handleGraphStart(row.getGraphStart()); + case GRAPH_END -> handleGraphEnd(); + case ROW_NOT_SET -> throw new RdfProtoDeserializationError("Row kind is not set."); + } + } + + protected void handleOptions(RdfStreamOptions options) { + checkCompatibility(options, supportedOptions); + setStreamOptions(options); + } + + protected void handleDatatype(RdfDatatypeEntry datatype) { + datatypeLookup.update(datatype.getId(), converter.makeDatatype(datatype.getValue())); + } + + protected void handleNamespace(RdfNamespaceDeclaration namespace) { + final var iri = namespace.getValue(); + protoHandler.handleNamespace(namespace.getName(), nameDecoder.decode(iri.getPrefixId(), iri.getNameId())); + } + + protected void handleTriple(RdfTriple triple) { + throw new 
RdfProtoDeserializationError("Unexpected triple row in stream."); + } + + protected void handleQuad(RdfQuad quad) { + throw new RdfProtoDeserializationError("Unexpected quad row in stream."); + } + + protected void handleGraphStart(RdfGraphStart graphStart) { + throw new RdfProtoDeserializationError("Unexpected start of graph in stream."); + } + + protected void handleGraphEnd() { + throw new RdfProtoDeserializationError("Unexpected end of graph in stream."); + } + + /** + * A decoder that reads TRIPLES streams and outputs a sequence of triples. + *

+ * Do not instantiate this class directly. Instead use factory methods in + * ConverterFactory implementations. + */ + public static final class TriplesDecoder extends ProtoDecoderImpl { + + private final RdfHandler.TripleHandler protoHandler; + + public TriplesDecoder( + ProtoDecoderConverter converter, + RdfHandler.TripleHandler protoHandler, + RdfStreamOptions supportedOptions + ) { + super(converter, protoHandler, supportedOptions); + this.protoHandler = protoHandler; + } + + @Override + protected void handleOptions(RdfStreamOptions opts) { + if (!opts.getPhysicalType().equals(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES)) { + throw new RdfProtoDeserializationError("Incoming stream type is not TRIPLES."); + } + super.handleOptions(opts); + } + + @Override + protected void handleTriple(RdfTriple triple) { + final var tripleTerm = RdfTerm.from(triple); + protoHandler.handleTriple( + convertSubjectTermWrapped(tripleTerm.subject()), + convertPredicateTermWrapped(tripleTerm.predicate()), + convertObjectTermWrapped(tripleTerm.object()) + ); + } + } + + /** + * A decoder that reads QUADS streams and outputs a sequence of quads. + *

+ * Do not instantiate this class directly. Instead use factory methods in + * ConverterFactory implementations. + */ + public static final class QuadsDecoder extends ProtoDecoderImpl { + + private final RdfHandler.QuadHandler protoHandler; + + public QuadsDecoder( + ProtoDecoderConverter converter, + RdfHandler.QuadHandler protoHandler, + RdfStreamOptions supportedOptions + ) { + super(converter, protoHandler, supportedOptions); + this.protoHandler = protoHandler; + } + + @Override + protected void handleOptions(RdfStreamOptions opts) { + if (!opts.getPhysicalType().equals(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS)) { + throw new RdfProtoDeserializationError("Incoming stream type is not QUADS."); + } + super.handleOptions(opts); + } + + @Override + protected void handleQuad(RdfQuad quad) { + final var quadTerm = RdfTerm.from(quad); + protoHandler.handleQuad( + convertSubjectTermWrapped(quadTerm.subject()), + convertPredicateTermWrapped(quadTerm.predicate()), + convertObjectTermWrapped(quadTerm.object()), + convertGraphTermWrapped(quadTerm.graph()) + ); + } + } + + /** + * A decoder that reads GRAPHS streams and outputs a flat sequence of quads. + *

+ * Do not instantiate this class directly. Instead use factory methods in + * ConverterFactory implementations. + */ + public static final class GraphsAsQuadsDecoder extends ProtoDecoderImpl { + + private final RdfHandler.QuadHandler protoHandler; + private TNode currentGraph = null; + + public GraphsAsQuadsDecoder( + ProtoDecoderConverter converter, + RdfHandler.QuadHandler protoHandler, + RdfStreamOptions supportedOptions + ) { + super(converter, protoHandler, supportedOptions); + this.protoHandler = protoHandler; + } + + @Override + protected void handleOptions(RdfStreamOptions opts) { + if (!opts.getPhysicalType().equals(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS)) { + throw new RdfProtoDeserializationError("Incoming stream type is not GRAPHS."); + } + super.handleOptions(opts); + } + + @Override + protected void handleGraphStart(RdfGraphStart graphStart) { + final var graphStartTerm = RdfTerm.from(graphStart); + currentGraph = convertGraphTerm(graphStartTerm.graph()); + } + + @Override + protected void handleGraphEnd() { + currentGraph = null; + } + + @Override + protected void handleTriple(RdfTriple triple) { + if (currentGraph == null) { + throw new RdfProtoDeserializationError("Triple in stream without preceding graph start."); + } + + final var tripleTerm = RdfTerm.from(triple); + protoHandler.handleQuad( + convertSubjectTermWrapped(tripleTerm.subject()), + convertPredicateTermWrapped(tripleTerm.predicate()), + convertObjectTermWrapped(tripleTerm.object()), + currentGraph + ); + } + } + + /** + * A decoder that reads GRAPHS streams and outputs a sequence of graphs. + * Each graph is emitted as soon as the producer signals that it's complete. + *

+ * Do not instantiate this class directly. Instead use factory methods in + * ConverterFactory implementations. + */ + public static final class GraphsDecoder extends ProtoDecoderImpl { + + private final RdfHandler.GraphHandler protoHandler; + private TNode currentGraph = null; + + public GraphsDecoder( + ProtoDecoderConverter converter, + RdfHandler.GraphHandler protoHandler, + RdfStreamOptions supportedOptions + ) { + super(converter, protoHandler, supportedOptions); + this.protoHandler = protoHandler; + } + + @Override + protected void handleOptions(RdfStreamOptions opts) { + if (!opts.getPhysicalType().equals(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS)) { + throw new RdfProtoDeserializationError("Incoming stream type is not GRAPHS."); + } + super.handleOptions(opts); + } + + @Override + protected void handleGraphStart(RdfGraphStart graphStart) { + final var graphStartTerm = RdfTerm.from(graphStart); + currentGraph = convertGraphTerm(graphStartTerm.graph()); + protoHandler.handleGraphStart(currentGraph); + } + + @Override + protected void handleGraphEnd() { + if (currentGraph == null) { + throw new RdfProtoDeserializationError("End of graph encountered before a start."); + } + + currentGraph = null; + protoHandler.handleGraphEnd(); + } + + @Override + protected void handleTriple(RdfTriple triple) { + var tripleTerm = RdfTerm.from(triple); + var subject = convertSubjectTermWrapped(tripleTerm.subject()); + var predicate = convertPredicateTermWrapped(tripleTerm.predicate()); + var object = convertObjectTermWrapped(tripleTerm.object()); + protoHandler.handleTriple(subject, predicate, object); + } + } + + /** + * A decoder that reads streams of any type and outputs a sequence of triples or quads. + *

+ * The type of the stream is detected automatically based on the options row, + * which must be at the start of the stream. If the options row is not present or the stream changes its type + * in the middle, an error is thrown. + *

+     * Do not instantiate this class directly. Instead use factory methods in
+     * ConverterFactory implementations.
+     */
+    public static final class AnyStatementDecoder extends ProtoDecoderImpl {
+
+        private final RdfHandler.AnyStatementHandler protoHandler;
+        private ProtoDecoderImpl delegateDecoder = null; // created by handleOptions once the physical type is known
+
+        public AnyStatementDecoder(
+            ProtoDecoderConverter converter,
+            RdfHandler.AnyStatementHandler protoHandler,
+            RdfStreamOptions supportedOptions
+        ) {
+            super(converter, protoHandler, supportedOptions);
+            this.protoHandler = protoHandler;
+        }
+
+        @Override
+        public RdfStreamOptions getStreamOptions() {
+            if (delegateDecoder != null) {
+                return delegateDecoder.getStreamOptions();
+            }
+
+            return null; // no delegate yet, i.e. no options have been handled so far
+        }
+
+        @Override
+        public void ingestRow(RdfStreamRow row) {
+            if (row.hasOptions()) {
+                handleOptions(row.getOptions()); // checks compatibility and creates the delegate on first use
+                delegateDecoder.ingestRow(row); // the delegate processes the options row itself as well
+                return;
+            }
+
+            if (delegateDecoder == null) {
+                throw new RdfProtoDeserializationError("Stream options are not set.");
+            }
+
+            delegateDecoder.ingestRow(row);
+        }
+
+        @Override
+        protected void handleOptions(RdfStreamOptions options) {
+            final var newSupportedOptions = supportedOptions
+                .toBuilder()
+                .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED)
+                .build();
+
+            checkCompatibility(options, newSupportedOptions);
+            if (delegateDecoder != null) {
+                return; // delegate already chosen; later options rows only get the compatibility check above
+            }
+
+            switch (options.getPhysicalType()) {
+                case PHYSICAL_STREAM_TYPE_TRIPLES -> delegateDecoder = new TriplesDecoder<>(
+                    converter,
+                    protoHandler,
+                    options
+                );
+                case PHYSICAL_STREAM_TYPE_QUADS -> delegateDecoder = new QuadsDecoder<>(
+                    converter,
+                    protoHandler,
+                    options
+                );
+                case PHYSICAL_STREAM_TYPE_GRAPHS -> delegateDecoder = new GraphsAsQuadsDecoder<>( // GRAPHS input is flattened to quads
+                    converter,
+                    protoHandler,
+                    options
+                );
+                default -> throw new RdfProtoDeserializationError("Incoming physical stream type is not recognized.");
+            }
+        }
+
+        @Override
+        protected void handleTriple(RdfTriple triple) {
+            delegateDecoder.handleTriple(triple);
+        }
+
+        @Override
+        protected void handleQuad(RdfQuad quad) {
+            delegateDecoder.handleQuad(quad);
+        }
+    }
+}
diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderBase.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderBase.java
new file mode 100644
index 000000000..bf892bade
--- /dev/null
+++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderBase.java
@@ -0,0 +1,82 @@
+package eu.neverblink.jelly.core.internal;
+
+import eu.neverblink.jelly.core.NodeEncoder;
+import eu.neverblink.jelly.core.ProtoEncoderConverter;
+import eu.neverblink.jelly.core.RdfTerm;
+import eu.neverblink.jelly.core.RowBufferAppender;
+import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions;
+
+/**
+ * Base class for Jelly proto encoders. Only for internal use.
+ * <p>
+ * Keeps the last subject, predicate, object and graph that were encoded, so that the
+ * conversion helpers can return null for a term that repeats the previous one.
+ *
+ * @param <TNode> type of RDF nodes in the library
+ */
+public abstract class ProtoEncoderBase implements RowBufferAppender {
+
+    /**
+     * RdfStreamOptions for this encoder.
+     */
+    protected final RdfStreamOptions options;
+    protected final NodeEncoder nodeEncoder;
+    protected final ProtoEncoderConverter converter;
+
+    protected final LastNodeHolder lastSubject = new LastNodeHolder<>(); // last subject node that was encoded
+    protected final LastNodeHolder lastPredicate = new LastNodeHolder<>(); // last predicate node that was encoded
+    protected final LastNodeHolder lastObject = new LastNodeHolder<>(); // last object node that was encoded
+    protected TNode lastGraph = null; // last graph node that was encoded; may legitimately be null
+
+    protected ProtoEncoderBase(RdfStreamOptions options, ProtoEncoderConverter converter) {
+        this.options = options;
+        this.nodeEncoder = NodeEncoderImpl.create(options, this); // this instance is passed as the RowBufferAppender
+        this.converter = converter;
+    }
+
+    protected final RdfTerm.Triple tripleToProto(TNode subject, TNode predicate, TNode object) {
+        return new RdfTerm.Triple(
+            nodeToProtoWrapped(subject, lastSubject),
+            nodeToProtoWrapped(predicate, lastPredicate),
+            nodeToProtoWrapped(object, lastObject)
+        );
+    }
+
+    protected final RdfTerm.Quad quadToProto(TNode subject, TNode predicate, TNode object, TNode graph) {
+        return new RdfTerm.Quad(
+            nodeToProtoWrapped(subject, lastSubject),
+            nodeToProtoWrapped(predicate, lastPredicate),
+            nodeToProtoWrapped(object, lastObject),
+            graphNodeToProtoWrapped(graph)
+        );
+    }
+
+    /**
+     * Converts a triple to an RdfQuad object with a null graph.
+     *

+ * Used in RDF-Patch for triple add/delete operations. + */ + protected final RdfTerm.Quad tripleInQuadToProto(TNode subject, TNode predicate, TNode object) { + return new RdfTerm.Quad( + nodeToProtoWrapped(subject, lastSubject), + nodeToProtoWrapped(predicate, lastPredicate), + nodeToProtoWrapped(object, lastObject), + null + ); + } + + private RdfTerm.SpoTerm nodeToProtoWrapped(TNode node, LastNodeHolder lastNodeHolder) { + if (node.equals(lastNodeHolder.node)) { + return null; + } else { + lastNodeHolder.node = node; + return converter.nodeToProto(nodeEncoder, node); + } + } + + private RdfTerm.GraphTerm graphNodeToProtoWrapped(TNode node) { + // Graph nodes may be null in Jena for example... so we need to handle that. + if ((node == null && lastGraph == null) || (node != null && node.equals(lastGraph))) { + return null; + } else { + lastGraph = node; + return converter.graphNodeToProto(nodeEncoder, node); + } + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderImpl.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderImpl.java new file mode 100644 index 000000000..b6ffcc0b5 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoEncoderImpl.java @@ -0,0 +1,117 @@ +package eu.neverblink.jelly.core.internal; + +import eu.neverblink.jelly.core.ProtoEncoder; +import eu.neverblink.jelly.core.ProtoEncoderConverter; +import eu.neverblink.jelly.core.RdfProtoSerializationError; +import eu.neverblink.jelly.core.RdfTerm; +import eu.neverblink.jelly.core.proto.v1.RdfDatatypeEntry; +import eu.neverblink.jelly.core.proto.v1.RdfNameEntry; +import eu.neverblink.jelly.core.proto.v1.RdfNamespaceDeclaration; +import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry; +import eu.neverblink.jelly.core.proto.v1.RdfStreamRow; +import java.util.Collection; + +/** + * Stateful encoder of a protobuf RDF stream. + *

+ * This class supports all stream types and options, but usually does not check if the user is conforming to them.
+ * It will, for example, allow the user to send generalized triples in a stream that should not have them.
+ * Take care to ensure the correctness of the transmitted data, or use the specialized wrappers from the stream package.
+ */
+public class ProtoEncoderImpl extends ProtoEncoder {
+
+    /** Whether the stream options row has already been added to the row buffer. */
+    private boolean hasEmittedOptions = false;
+    /** Destination of every produced row; taken from the superclass in the constructor. */
+    private final Collection rowBuffer;
+
+    /**
+     * Creates the encoder and binds it to the row buffer exposed by the superclass.
+     *
+     * @param converter converter for the encoder
+     * @param params parameters object for the encoder
+     */
+    public ProtoEncoderImpl(ProtoEncoderConverter converter, ProtoEncoder.Params params) {
+        super(converter, params);
+        this.rowBuffer = appendableRowBuffer;
+    }
+
+    /** Appends a triple row, preceded by the options row if it was not emitted yet. */
+    @Override
+    public void handleTriple(TNode subject, TNode predicate, TNode object) {
+        emitOptions();
+        final var tripleProto = tripleToProto(subject, predicate, object).toProto();
+        rowBuffer.add(RdfStreamRow.newBuilder().setTriple(tripleProto).build());
+    }
+
+    /** Appends a quad row, preceded by the options row if it was not emitted yet. */
+    @Override
+    public void handleQuad(TNode subject, TNode predicate, TNode object, TNode graph) {
+        emitOptions();
+        final var quadProto = quadToProto(subject, predicate, object, graph).toProto();
+        rowBuffer.add(RdfStreamRow.newBuilder().setQuad(quadProto).build());
+    }
+
+    /** Appends a graph-start row, preceded by the options row if it was not emitted yet. */
+    @Override
+    public void handleGraphStart(TNode graph) {
+        emitOptions();
+        final var startTerm = new RdfTerm.GraphStart(converter.graphNodeToProto(nodeEncoder, graph));
+        rowBuffer.add(RdfStreamRow.newBuilder().setGraphStart(startTerm.toProto()).build());
+    }
+
+    /**
+     * Appends a graph-end row.
+     *
+     * @throws RdfProtoSerializationError if the options row has not been emitted yet
+     */
+    @Override
+    public void handleGraphEnd() {
+        if (!hasEmittedOptions) {
+            throw new RdfProtoSerializationError("Cannot end a delimited graph before starting one");
+        }
+
+        final var endProto = new RdfTerm.GraphEnd().toProto();
+        rowBuffer.add(RdfStreamRow.newBuilder().setGraphEnd(endProto).build());
+    }
+
+    /**
+     * Appends a namespace declaration row.
+     *
+     * @throws RdfProtoSerializationError if namespace declarations are disabled, or the
+     *     namespace node does not convert to an IRI term
+     */
+    @Override
+    public void handleNamespace(String prefix, TNode namespace) {
+        if (!enableNamespaceDeclarations) {
+            throw new RdfProtoSerializationError("Namespace declarations are not enabled in this stream");
+        }
+
+        emitOptions();
+
+        final var namespaceTerm = converter.nodeToProto(nodeEncoder, namespace);
+        if (namespaceTerm instanceof RdfTerm.Iri iriTerm) {
+            final var declaration = RdfNamespaceDeclaration.newBuilder()
+                .setName(prefix)
+                .setValue(iriTerm.toProto())
+                .build();
+            rowBuffer.add(RdfStreamRow.newBuilder().setNamespace(declaration).build());
+            return;
+        }
+
+        throw new RdfProtoSerializationError("Namespace must be an IRI");
+    }
+
+    /** Appends a name lookup entry row. */
+    @Override
+    public void appendNameEntry(RdfNameEntry nameEntry) {
+        final var entryRow = RdfStreamRow.newBuilder().setName(nameEntry).build();
+        rowBuffer.add(entryRow);
+    }
+
+    /** Appends a prefix lookup entry row. */
+    @Override
+    public void appendPrefixEntry(RdfPrefixEntry prefixEntry) {
+        final var entryRow = RdfStreamRow.newBuilder().setPrefix(prefixEntry).build();
+        rowBuffer.add(entryRow);
+    }
+
+    /** Appends a datatype lookup entry row. */
+    @Override
+    public void appendDatatypeEntry(RdfDatatypeEntry datatypeEntry) {
+        final var entryRow = RdfStreamRow.newBuilder().setDatatype(datatypeEntry).build();
+        rowBuffer.add(entryRow);
+    }
+
+    /** Adds the stream options row to the buffer the first time it is called; later calls do nothing. */
+    private void emitOptions() {
+        if (!hasEmittedOptions) {
+            hasEmittedOptions = true;
+            rowBuffer.add(RdfStreamRow.newBuilder().setOptions(options).build());
+        }
+    }
+}
diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoTranscoderImpl.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoTranscoderImpl.java
new file mode 100644
index 000000000..1d17d801b
--- /dev/null
+++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/ProtoTranscoderImpl.java
@@ -0,0 +1,302 @@
+package eu.neverblink.jelly.core.internal;
+
+import eu.neverblink.jelly.core.*;
+import eu.neverblink.jelly.core.proto.v1.RdfDatatypeEntry;
+import eu.neverblink.jelly.core.proto.v1.RdfNameEntry;
+import eu.neverblink.jelly.core.proto.v1.RdfNamespaceDeclaration;
+import eu.neverblink.jelly.core.proto.v1.RdfPrefixEntry;
+import eu.neverblink.jelly.core.proto.v1.RdfStreamFrame;
+import eu.neverblink.jelly.core.proto.v1.RdfStreamOptions;
+import eu.neverblink.jelly.core.proto.v1.RdfStreamRow;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Fast implementation of the ProtoTranscoder interface.
+ *

+ * It does not result in perfect compression (like you would get with full decoding and re-encoding), but it should be
+ * good enough for the vast majority of cases.
+ * <p>
+ * Rows are handled one at a time: lookup entries and the IDs referenced by terms are remapped
+ * through one TranscoderLookup per table (prefixes, names, datatypes), and rows that need no
+ * change are passed through unmodified.
+ */
+public class ProtoTranscoderImpl implements ProtoTranscoder {
+
+    private final RdfStreamOptions supportedInputOptions;
+    private final RdfStreamOptions outputOptions;
+
+    private final TranscoderLookup prefixLookup;
+    private final TranscoderLookup nameLookup;
+    private final TranscoderLookup datatypeLookup;
+
+    private final List rowBuffer = new ArrayList<>(); // reused output buffer, cleared at the start of every ingest call
+
+    private RdfStreamOptions inputOptions = null; // options of the most recent input stream
+    private boolean inputUsesPrefixes = false;
+    private boolean hasChangedTerms = false; // set by the term handlers when a term of the current row was rewritten
+    private boolean hasEmittedOptions = false; // the output options row is written only once
+
+    /**
+     * Constructor for the ProtoTranscoderImpl class.
+     *
+     * @param supportedInputOptions maximum allowable options for the input streams (optional, may be null;
+     *                              when null, incoming options are not checked)
+     * @param outputOptions options for the output stream. This MUST have the physical stream type set.
+     */
+    public ProtoTranscoderImpl(RdfStreamOptions supportedInputOptions, RdfStreamOptions outputOptions) {
+        this.supportedInputOptions = supportedInputOptions;
+        this.outputOptions = outputOptions;
+        prefixLookup = new TranscoderLookup(false, outputOptions.getMaxPrefixTableSize());
+        nameLookup = new TranscoderLookup(true, outputOptions.getMaxNameTableSize()); // only names use the name-lookup mode
+        datatypeLookup = new TranscoderLookup(false, outputOptions.getMaxDatatypeTableSize());
+    }
+
+    @Override
+    public Iterable ingestRow(RdfStreamRow row) {
+        rowBuffer.clear();
+        processRow(row);
+        return rowBuffer; // NOTE: this is the internal buffer; the next ingestRow/ingestFrame call clears it
+    }
+
+    @Override
+    public RdfStreamFrame ingestFrame(RdfStreamFrame frame) {
+        rowBuffer.clear();
+        for (final var row : frame.getRowsList()) {
+            processRow(row);
+        }
+
+        return RdfStreamFrame.newBuilder().addAllRows(rowBuffer).putAllMetadata(frame.getMetadataMap()).build(); // frame metadata is carried over unchanged
+    }
+
+    private void processRow(RdfStreamRow row) { // dispatches on the row kind; output rows are appended to rowBuffer
+        switch (row.getRowCase()) {
+            case OPTIONS -> handleOptions(row.getOptions());
+            case TRIPLE -> handleTriple(row);
+            case QUAD -> handleQuad(row);
+            case GRAPH_START -> handleGraphStart(row);
+            case GRAPH_END -> handleIdentity(row);
+            case NAMESPACE -> handleNamespaceDeclaration(row);
+            case NAME -> handleName(row);
+            case PREFIX -> handlePrefix(row);
+            case DATATYPE -> handleDatatype(row);
+            case ROW_NOT_SET -> throw new RdfProtoTranscodingError("Row kind is not set");
+        }
+    }
+
+    private void handleName(RdfStreamRow row) {
+        final var name = row.getName();
+        final var entry = nameLookup.addEntry(name.getId(), name.getValue());
+        if (!entry.newEntry) {
+            return; // not a new entry in the output lookup: nothing to emit
+        }
+
+        if (entry.setId == name.getId()) {
+            rowBuffer.add(row); // same ID in input and output: reuse the input row
+            return;
+        }
+
+        final var newName = RdfNameEntry.newBuilder().setId(entry.setId).setValue(name.getValue()).build();
+        rowBuffer.add(RdfStreamRow.newBuilder().setName(newName).build());
+    }
+
+    private void handlePrefix(RdfStreamRow row) {
+        final var prefix = row.getPrefix();
+        final var entry = prefixLookup.addEntry(prefix.getId(), prefix.getValue());
+        if (!entry.newEntry) {
+            return; // not a new entry in the output lookup: nothing to emit
+        }
+
+        if (entry.setId == prefix.getId()) {
+            rowBuffer.add(row); // same ID in input and output: reuse the input row
+            return;
+        }
+
+        final var newPrefix = RdfPrefixEntry.newBuilder().setId(entry.setId).setValue(prefix.getValue()).build();
+        rowBuffer.add(RdfStreamRow.newBuilder().setPrefix(newPrefix).build());
+    }
+
+    private void handleDatatype(RdfStreamRow row) {
+        final var datatype = row.getDatatype();
+        final var entry = datatypeLookup.addEntry(datatype.getId(), datatype.getValue());
+        if (!entry.newEntry) {
+            return; // not a new entry in the output lookup: nothing to emit
+        }
+
+        if (entry.setId == datatype.getId()) {
+            rowBuffer.add(row); // same ID in input and output: reuse the input row
+            return;
+        }
+
+        final var newDatatype = RdfDatatypeEntry.newBuilder().setId(entry.setId).setValue(datatype.getValue()).build();
+        rowBuffer.add(RdfStreamRow.newBuilder().setDatatype(newDatatype).build());
+    }
+
+    private void handleIdentity(RdfStreamRow row) {
+        // No changes needed, just add the row to the buffer
+        rowBuffer.add(row);
+    }
+
+    private void handleTriple(RdfStreamRow row) {
+        this.hasChangedTerms = false; // reset per row; the term handlers below set it when they rewrite a term
+        final var triple = RdfTerm.from(row.getTriple());
+
+        final var s1 = handleSpoTerm(triple.subject());
+        final var p1 = handleSpoTerm(triple.predicate());
+        final var o1 = handleSpoTerm(triple.object());
+
+        if (!hasChangedTerms) {
+            rowBuffer.add(row); // no term was rewritten: pass the input row through
+            return;
+        }
+
+        final var newTriple = new RdfTerm.Triple(s1, p1, o1);
+        rowBuffer.add(RdfStreamRow.newBuilder().setTriple(newTriple.toProto()).build());
+    }
+
+    private void handleQuad(RdfStreamRow row) {
+        this.hasChangedTerms = false; // reset per row; the term handlers below set it when they rewrite a term
+        final var quad = RdfTerm.from(row.getQuad());
+
+        final var s1 = handleSpoTerm(quad.subject());
+        final var p1 = handleSpoTerm(quad.predicate());
+        final var o1 = handleSpoTerm(quad.object());
+        final var g1 = handleGraphTerm(quad.graph());
+
+        if (!hasChangedTerms) {
+            rowBuffer.add(row); // no term was rewritten: pass the input row through
+            return;
+        }
+
+        final var newQuad = new RdfTerm.Quad(s1, p1, o1, g1);
+        rowBuffer.add(RdfStreamRow.newBuilder().setQuad(newQuad.toProto()).build());
+    }
+
+    private void handleGraphStart(RdfStreamRow row) {
+        this.hasChangedTerms = false; // reset per row; the term handlers below set it when they rewrite a term
+        final var graphStart = RdfTerm.from(row.getGraphStart());
+
+        final var g1 = handleGraphTerm(graphStart.graph());
+        if (!hasChangedTerms) {
+            rowBuffer.add(row); // graph term unchanged: pass the input row through
+            return;
+        }
+
+        final var newGraphStart = new RdfTerm.GraphStart(g1);
+        rowBuffer.add(RdfStreamRow.newBuilder().setGraphStart(newGraphStart.toProto()).build());
+    }
+
+    private void handleNamespaceDeclaration(RdfStreamRow row) {
+        this.hasChangedTerms = false; // reset per row; handleIri sets it when the IRI is rewritten
+        var nsRow = row.getNamespace();
+        var iriValue = handleIri(RdfTerm.from(nsRow.getValue()));
+
+        if (!hasChangedTerms) {
+            rowBuffer.add(row); // IRI unchanged: pass the input row through
+            return;
+        }
+
+        var namespace = RdfNamespaceDeclaration.newBuilder()
+            .setName(nsRow.getName())
+            .setValue(iriValue.toProto())
+            .build();
+
+        rowBuffer.add(RdfStreamRow.newBuilder().setNamespace(namespace).build());
+    }
+
+    private RdfTerm.SpoTerm handleSpoTerm(RdfTerm.SpoTerm term) { // remaps IRIs, literals and nested triples; anything else is returned as-is
+        if (term instanceof RdfTerm.Iri iri) {
+            return handleIri(iri);
+        } else if (term instanceof RdfTerm.LiteralTerm literalTerm) {
+            return handleLiteral(literalTerm);
+        } else if (term instanceof RdfTerm.Triple triple) {
+            return handleTripleTerm(triple);
+        } else {
+            return term;
+        }
+    }
+
+    private RdfTerm.GraphTerm handleGraphTerm(RdfTerm.GraphTerm graph) { // remaps IRIs and literals; anything else is returned as-is
+        if (graph instanceof RdfTerm.Iri iri) {
+            return handleIri(iri);
+        } else if (graph instanceof RdfTerm.LiteralTerm literalTerm) {
+            return handleLiteral(literalTerm);
+        } else {
+            return graph;
+        }
+    }
+
+    private RdfTerm.Iri handleIri(RdfTerm.Iri iri) {
+        var prefix = iri.prefixId();
+        var name = iri.nameId();
+        var prefix1 = inputUsesPrefixes ? prefixLookup.remap(prefix) : 0; // prefix ID is forced to 0 when the input has no prefix table
+        var name1 = nameLookup.remap(name);
+        if (prefix1 != prefix || name1 != name) {
+            hasChangedTerms = true;
+            return new RdfTerm.Iri(prefix1, name1);
+        }
+        return iri;
+    }
+
+    private RdfTerm.LiteralTerm handleLiteral(RdfTerm.LiteralTerm literal) {
+        if (!(literal instanceof RdfTerm.DtLiteral dtLiteral)) {
+            return literal; // only datatype literals reference a lookup ID
+        }
+
+        var dt = dtLiteral.datatype();
+        var dt1 = datatypeLookup.remap(dt);
+        if (dt1 != dt) {
+            hasChangedTerms = true;
+            return new RdfTerm.DtLiteral(dtLiteral.lex(), dt1);
+        }
+
+        return literal;
+    }
+
+    private RdfTerm.Triple handleTripleTerm(RdfTerm.Triple triple) { // nested triple used as a term: remap its three terms recursively
+        var s1 = handleSpoTerm(triple.subject());
+        var p1 = handleSpoTerm(triple.predicate());
+        var o1 = handleSpoTerm(triple.object());
+        if (!s1.equals(triple.subject()) || !p1.equals(triple.predicate()) || !o1.equals(triple.object())) {
+            hasChangedTerms = true;
+            return new RdfTerm.Triple(s1, p1, o1);
+        }
+        return triple;
+    }
+
+    private void handleOptions(RdfStreamOptions options) {
+        if (supportedInputOptions != null) { // both checks below are skipped when no supported options were given
+            if (outputOptions.getPhysicalType() != options.getPhysicalType()) {
+                throw new RdfProtoTranscodingError(
+                    "Input stream has a different physical type than the output. Input: %s output: %s".formatted(
+                        options.getPhysicalType(),
+                        outputOptions.getPhysicalType()
+                    )
+                );
+            }
+            JellyOptions.checkCompatibility(options, supportedInputOptions);
+        }
+
+        this.inputUsesPrefixes = options.getMaxPrefixTableSize() > 0;
+
+        if (inputUsesPrefixes) {
+            prefixLookup.newInputStream(options.getMaxPrefixTableSize());
+        } else if (outputOptions.getMaxPrefixTableSize() > 0) {
+            throw new RdfProtoTranscodingError("Output stream uses prefixes, but the input stream does not.");
+        }
+
+        nameLookup.newInputStream(options.getMaxNameTableSize());
+        datatypeLookup.newInputStream(options.getMaxDatatypeTableSize());
+
+        // Remember the options of the current input stream
+        inputOptions = options;
+
+        // Emit the output options row only once, no matter how many input options rows arrive
+        if (hasEmittedOptions) {
+            return;
+        }
+
+        hasEmittedOptions = true;
+        var version = inputOptions.getVersion() == JellyConstants.PROTO_VERSION_1_0_X // keep 1.0.x if the first input uses it, otherwise use the current version
+            ? JellyConstants.PROTO_VERSION_1_0_X
+            : JellyConstants.PROTO_VERSION;
+
+        var newOptions = outputOptions.toBuilder().setVersion(version).build();
+        rowBuffer.add(RdfStreamRow.newBuilder().setOptions(newOptions).build());
+    }
+}
diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/internal/TranscoderLookup.java b/core-java/src/main/java/eu/neverblink/jelly/core/internal/TranscoderLookup.java
new file mode 100644
index 000000000..a10b4bcd5
--- /dev/null
+++ b/core-java/src/main/java/eu/neverblink/jelly/core/internal/TranscoderLookup.java
@@ -0,0 +1,120 @@
+package eu.neverblink.jelly.core.internal;
+
+import eu.neverblink.jelly.core.RdfProtoTranscodingError;
+import java.util.Arrays;
+
+/**
+ * A wrapper around EncoderLookup that is used in proto transcoders to remap input stream IDs to output stream IDs.
+ */
+final class TranscoderLookup {
+
+    // The size of the output lookup table
+    private final int outputSize;
+    // Mapping input IDs to output IDs (index = input ID); allocated by newInputStream()
+    private int[] table;
+    // The actual lookup table (output)
+    private final EncoderLookup lookup;
+
+    // 0-compression:
+    // - for prefixes and datatypes: no worries about splicing, because zeroes are not allowed at the start of the
+    //   stream. While splitting, we need to check for zeroes at the start of the stream and remap them.
+    // - IRI names: remap all 0s forcefully
+    private final boolean isNameLookup;
+    private int lastSetId = 0; // last input ID seen by addEntry()
+    private int lastInputGetId = 0; // last input ID seen by remap() (name lookups only)
+    private int lastOutputGetId = 0; // last output ID produced by remap() (name lookups only)
+
+    /**
+     * Create a new TranscoderLookup.
+     * @param isNameLookup Whether this lookup is for IRI names.
+     * @param outputSize The size of the output lookup.
+     */
+    TranscoderLookup(boolean isNameLookup, int outputSize) {
+        this.isNameLookup = isNameLookup;
+        this.outputSize = outputSize;
+        this.lookup = new EncoderLookup(outputSize, false);
+    }
+
+    /**
+     * Remap a lookup entry from the input stream to the output stream.
+     * <p>
+     * This may result in us actually adding a new entry to the output lookup, or not, if it's already there.
+     * <p>
+     * An {@code originalId} of 0 is resolved to "previous input ID + 1" before the mapping is updated.
+     * {@link #newInputStream(int)} must have been called before, because it allocates the ID mapping.
+     *
+     * @param originalId The ID of the entry in the input stream.
+     * @param value The value of the entry.
+     * @return The lookup entry in the output stream.
+     */
+    EncoderLookup.LookupEntry addEntry(int originalId, String value) {
+        if (originalId == 0) {
+            originalId = ++lastSetId;
+        } else {
+            lastSetId = originalId;
+        }
+        // If the input stream is evicting something, and our lookup is already full, we tell the lookup to evict
+        // the exact same entry as the one evicted in the input. This way we are 100% sure that the input and output
+        // streams have the same lookup entries available to each other.
+        //
+        // This has a downside in case where the output's lookup is larger than the input's lookup and we are
+        // concatenating multiple input streams together. Then, we will be evicting sometimes entries that really don't
+        // have to be evicted yet, because instead we could evict something from a previous input stream.
+        // Unfortunately, I don't really have an idea for how to track this efficiently.
+        EncoderLookup.LookupEntry entry = lookup.getOrAddEntryTranscoder(value, table[originalId]);
+        table[originalId] = entry.getId; // remember which output ID this input ID now maps to
+        return entry;
+    }
+
+    /**
+     * Remap a reference to a lookup entry from the input stream ID space to the output stream ID space.
+     * <p>
+     * This automatically handles 0-compression.
+     * <p>
+     * For name lookups, an input ID of 0 is first resolved to "previous input ID + 1", and the result is
+     * re-compressed to 0 when the output ID equals "previous output ID + 1". For other lookups, 0 is
+     * returned unchanged.
+     *
+     * @param id The ID to remap (input stream).
+     * @return The remapped ID (output stream).
+     */
+    int remap(int id) {
+        if (isNameLookup) {
+            if (id == 0) {
+                id = ++lastInputGetId;
+            } else {
+                lastInputGetId = id;
+            }
+            int outputId = table[id];
+            lookup.onAccess(outputId);
+            if (outputId == lastOutputGetId + 1) {
+                lastOutputGetId++;
+                return 0;
+            }
+            lastOutputGetId = outputId;
+            return outputId;
+        }
+        if (id == 0) {
+            // No need to do onAccess here, because this is the same as the last element
+            return 0;
+        }
+        id = table[id];
+        lookup.onAccess(id);
+        return id;
+    }
+
+    /**
+     * Signal that a new input stream is starting.
+     * <p>
+     * Clears (or allocates) the input-to-output ID mapping. For every stream after the first one,
+     * the input-side 0-compression counters are reset as well.
+     *
+     * @param size The size of the input lookup.
+     * @throws RdfProtoTranscodingError if {@code size} is greater than the output lookup size
+     */
+    void newInputStream(int size) {
+        if (size > outputSize) {
+            throw new RdfProtoTranscodingError("Input lookup size cannot be greater than the output lookup size");
+        }
+        if (table != null) {
+            // Only set this for streams 2 and above (counting from 1)
+            lastSetId = 0;
+            lastInputGetId = 0;
+        }
+        if (table == null || table.length < size + 1) {
+            table = new int[size + 1]; // index 0 is never a real entry, hence size + 1 slots
+        } else {
+            // We need to zero the mapping, so that we know when the input stream is doing
+            // an eviction vs just adding a new entry.
+            Arrays.fill(table, 0);
+        }
+    }
+}
diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/utils/IoUtils.java b/core-java/src/main/java/eu/neverblink/jelly/core/utils/IoUtils.java
new file mode 100644
index 000000000..55c6564b3
--- /dev/null
+++ b/core-java/src/main/java/eu/neverblink/jelly/core/utils/IoUtils.java
@@ -0,0 +1,59 @@
+package eu.neverblink.jelly.core.utils;
+
+import com.google.protobuf.CodedOutputStream;
+import java.io.*;
+
+public class IoUtils {
+
+    private IoUtils() {} // static utility holder, not instantiable
+
+    public record AutodetectDelimitingResponse(boolean isDelimited, InputStream newInput) {} // result of autodetectDelimiting
+
+    /**
+     * Autodetects whether the input stream is a non-delimited Jelly file or a delimited Jelly file.
+     *

+ * To do this, the first three bytes in the stream are peeked. + * These bytes are then put back into the stream, and the stream is returned, so the parser won't notice the peeking. + * @param inputStream the input stream + * @return (isDelimited, newInputStream) where isDelimited is true if the stream is a delimited Jelly file + */ + public static AutodetectDelimitingResponse autodetectDelimiting(InputStream inputStream) throws IOException { + final var scout = inputStream.readNBytes(3); + final var scoutIn = new ByteArrayInputStream(scout); + final var newInput = new SequenceInputStream(scoutIn, inputStream); + + // Truth table (notation: 0A = 0x0A, NN = not 0x0A, ?? = don't care): + // NN ?? ?? -> delimited (all non-delimited start with 0A) + // 0A NN ?? -> non-delimited + // 0A 0A NN -> delimited (total message size = 10) + // 0A 0A 0A -> non-delimited (stream options size = 10) + + // A case like "0A 0A 0A 0A" in the delimited variant is impossible. It would mean that the whole message + // is 10 bytes long, while stream options alone are 10 bytes long. + + // It's not possible to have a long varint starting with 0A, because its most significant bit + // would have to be 1 (continuation bit). So, we don't need to worry about that case. + + // Yeah, it's magic. But it works. + + final var isDelimited = scout.length == 3 && (scout[0] != 0x0A || (scout[1] == 0x0A && scout[2] != 0x0A)); + return new AutodetectDelimitingResponse(isDelimited, newInput); + } + + /** + * Utility method to transform a non-delimited Jelly frame (as a byte array) into a delimited one, + * writing it to a byte stream. + *

+ * This is useful if you for example store non-delimited frames in a database, but want to write them to a stream. + * + * @param nonDelimitedFrame EXACTLY one non-delimited Jelly frame + * @param output the output stream to write the frame to + */ + public static void writeFrameAsDelimited(byte[] nonDelimitedFrame, OutputStream output) throws IOException { + // Don't worry, the buffer won't really have 0-size. It will be of minimal size able to fit the varint. + final var codedOutput = CodedOutputStream.newInstance(output, 0); + codedOutput.writeUInt32NoTag(nonDelimitedFrame.length); + codedOutput.flush(); + output.write(nonDelimitedFrame); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtils.java b/core-java/src/main/java/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtils.java new file mode 100644 index 000000000..4045c7eaf --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtils.java @@ -0,0 +1,130 @@ +package eu.neverblink.jelly.core.utils; + +import eu.neverblink.jelly.core.ProtoDecoderConverter; +import eu.neverblink.jelly.core.proto.v1.LogicalStreamType; +import java.util.List; +import java.util.UUID; + +public class LogicalStreamTypeUtils { + + private static final String STAX_PREFIX = "https://w3id.org/stax/ontology#"; + + private LogicalStreamTypeUtils() {} + + /** + * Converts the logical stream type to its base concrete stream type in RDF-STaX. + * For example, LogicalStreamType.TIMESTAMPED_NAMED_GRAPHS will be converted to LogicalStreamType.DATASETS. + * UNSPECIFIED values will be left as-is. + * + * @param logicalType logical stream type + * @return base stream type + */ + public static LogicalStreamType toBaseType(LogicalStreamType logicalType) { + return LogicalStreamType.forNumber(logicalType.getNumber() % 10); + } + + /** + * Checks if the logical stream type is equal to or a subtype of the other logical stream type. 
+ * For example, LogicalStreamType.TIMESTAMPED_NAMED_GRAPHS is a subtype of LogicalStreamType.DATASETS. + * + * @param logicalType the logical stream type to check + * @param other the other logical stream type + * @return true if the logical stream type is equal to or a subtype of the other logical stream type + */ + public static boolean isEqualOrSubtypeOf(LogicalStreamType logicalType, LogicalStreamType other) { + return ( + logicalType.equals(other) || + String.valueOf(logicalType.getNumber()).endsWith(String.valueOf(other.getNumber())) + ); + } + + /** + * Returns the IRI of the RDF-STaX stream type individual for the logical stream type. + * If the logical stream type is not supported or is not specified, None is returned. + * + * @param logicalType the logical stream type + * @return the IRI of the RDF-STaX stream type individual + */ + public static String getRdfStaxType(LogicalStreamType logicalType) { + return switch (logicalType) { + case LOGICAL_STREAM_TYPE_FLAT_TRIPLES -> STAX_PREFIX + "flatTripleStream"; + case LOGICAL_STREAM_TYPE_FLAT_QUADS -> STAX_PREFIX + "flatQuadStream"; + case LOGICAL_STREAM_TYPE_GRAPHS -> STAX_PREFIX + "graphStream"; + case LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS -> STAX_PREFIX + "subjectGraphStream"; + case LOGICAL_STREAM_TYPE_DATASETS -> STAX_PREFIX + "datasetStream"; + case LOGICAL_STREAM_TYPE_NAMED_GRAPHS -> STAX_PREFIX + "namedGraphStream"; + case LOGICAL_STREAM_TYPE_TIMESTAMPED_NAMED_GRAPHS -> STAX_PREFIX + "timestampedNamedGraphStream"; + default -> null; + }; + } + + /** + * Creates a logical stream type from an RDF-STaX stream type individual IRI. 
+ * + * @param iri the IRI of the RDF-STaX stream type individual + * @return the logical stream type, or None if the IRI is not a valid RDF-STaX stream type individual + */ + public static LogicalStreamType fromOntologyIri(String iri) { + if (!iri.startsWith(STAX_PREFIX)) { + return null; + } + + String typeName = iri.substring(STAX_PREFIX.length()); + return switch (typeName) { + case "flatTripleStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES; + case "flatQuadStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_QUADS; + case "graphStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_GRAPHS; + case "subjectGraphStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS; + case "datasetStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS; + case "namedGraphStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_NAMED_GRAPHS; + case "timestampedNamedGraphStream" -> LogicalStreamType.LOGICAL_STREAM_TYPE_TIMESTAMPED_NAMED_GRAPHS; + default -> null; + }; + } + + /** + * Returns an RDF-STaX annotation for the logical stream type, in RDF. The annotation simply states that + * has a stream type usage, and that stream type usage has this stream type. + *

+ * Example in Turtle for a flat triple stream: + * stax:hasStreamTypeUsage [ + * a stax:RdfStreamTypeUsage ; + * stax:hasStreamType stax:flatTripleStream + * ] . + * + * @param logicalType the logical stream type + * @param subjectNode the subject node to annotate + * @param converter the converter to use for creating RDF nodes and triples + * @param the type of RDF nodes + * @param the type of RDF triples + * @param the type of RDF triples + * @throws IllegalArgumentException if the logical stream type is not supported + * @return the RDF-STaX annotation + */ + public static List getRdfStaxAnnotation( + ProtoDecoderConverter converter, + TripleEncoder tripleEncoder, + LogicalStreamType logicalType, + TNode subjectNode + ) { + var typeIri = getRdfStaxType(logicalType); + if (typeIri == null) { + throw new IllegalArgumentException("Unsupported logical stream type: " + logicalType); + } + + TNode bNode = converter.makeBlankNode(UUID.randomUUID().toString()); + return List.of( + tripleEncoder.encode(subjectNode, converter.makeIriNode(STAX_PREFIX + "hasStreamTypeUsage"), bNode), + tripleEncoder.encode( + bNode, + converter.makeIriNode("http://www.w3.org/1999/02/22-rdf-syntax-ns#type"), + converter.makeIriNode(STAX_PREFIX + "RdfStreamTypeUsage") + ), + tripleEncoder.encode( + bNode, + converter.makeIriNode(STAX_PREFIX + "hasStreamType"), + converter.makeIriNode(typeIri) + ) + ); + } +} diff --git a/core-java/src/main/java/eu/neverblink/jelly/core/utils/TripleEncoder.java b/core-java/src/main/java/eu/neverblink/jelly/core/utils/TripleEncoder.java new file mode 100644 index 000000000..de7eb31c4 --- /dev/null +++ b/core-java/src/main/java/eu/neverblink/jelly/core/utils/TripleEncoder.java @@ -0,0 +1,21 @@ +package eu.neverblink.jelly.core.utils; + +/** + * TripleEncoder is a functional interface that encodes a triple into a representation bound to RDF libraries. + * Currently it is only used in getRdfStaxAnnotation method. 
/**
 * Functional interface that encodes a triple, given as three nodes, into the triple
 * representation of a particular RDF library.
 *
 * <p>Being a functional interface, it can be supplied as a lambda or method reference, e.g.
 * {@code (s, p, o) -> new MyTriple(s, p, o)}.
 *
 * @param <TNode> the type of the nodes in the triple
 * @param <TTriple> the type of the encoded triple
 */
@FunctionalInterface
public interface TripleEncoder<TNode, TTriple> {
    /**
     * Encodes a triple into a representation bound to RDF libraries.
     *
     * @param subject the subject of the triple
     * @param predicate the predicate of the triple
     * @param object the object of the triple
     * @return the encoded triple
     */
    TTriple encode(TNode subject, TNode predicate, TNode object);
}
/**
 * Round-trip checks for the generated Protobuf messages: every encoded test frame is serialized
 * to bytes, parsed back with RdfStreamFrame.parseFrom, and compared with the original frame.
 */
class ProtoAuxiliarySpec extends AnyWordSpec, Matchers:
  import ProtoTestCases.*

  val opt = JellyOptions.SMALL_GENERALIZED

  // Each entry: (display name, test case, metadata passed to encodedFull for the frame).
  val testCasesRaw: Seq[(String, TestCase[?], Map[String, ByteString])] = Seq(
    ("Triples1", Triples1, Map.empty),
    ("Triples2NsDecl", Triples2NsDecl, Map("key" -> ByteString.copyFromUtf8("test"))),
    ("Quads1", Quads1, Map.empty),
    (
      "Quads2RepeatDefault",
      Quads2RepeatDefault,
      Map(
        "keyZeros" -> ByteString.copyFrom(Array.ofDim[Byte](10)),
        "keyOnes" -> ByteString.copyFrom(Array.fill[Byte](10)(1)),
      ),
    ),
    ("Graphs1", Graphs1, Map.empty),
  )

  // Only the first frame produced for each case is used.
  val testCases =
    for
      (name, tc, metadata) <- testCasesRaw
    yield (name, tc.encodedFull(opt, 1000, metadata).head)

  "RdfStreamFrame" should {
    // Mainly exercises metadata serialization/deserialization through a full round trip.
    "deserialize from bytes" when {
      testCases.foreach { case (name, expectedFrame) =>
        s"test case $name" in {
          val parsedFrame = RdfStreamFrame.parseFrom(expectedFrame.toByteArray)
          parsedFrame shouldBe expectedFrame
        }
      }
    }
  }
eu.neverblink.jelly.core.internal.ProtoDecoderImpl.* + + private val defaultOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + + "checkLogicalStreamType" should { + val decoderFactories = Seq( + ("TriplesDecoder", (MockConverterFactory.triplesDecoder, PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES)), + ("QuadsDecoder", (MockConverterFactory.quadsDecoder, PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS)), + ("GraphsAsQuadsDecoder", (MockConverterFactory.graphsAsQuadsDecoder, PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS)), + ("GraphsDecoder", (MockConverterFactory.graphsDecoder, PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS)), + ).toMap + val logicalStreamTypeSets = Seq( + ( + Seq(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES), + Seq("TriplesDecoder") + ), + ( + Seq(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_QUADS), + Seq("QuadsDecoder", "GraphsAsQuadsDecoder") + ), + ( + Seq( + LogicalStreamType.LOGICAL_STREAM_TYPE_GRAPHS, + LogicalStreamType.LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS, + ), + Seq("TriplesDecoder") + ), + ( + Seq( + LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS, + LogicalStreamType.LOGICAL_STREAM_TYPE_NAMED_GRAPHS, + LogicalStreamType.LOGICAL_STREAM_TYPE_TIMESTAMPED_NAMED_GRAPHS, + ), + Seq("QuadsDecoder", "GraphsDecoder", "GraphsAsQuadsDecoder") + ), + ( + Seq( + LogicalStreamType.LOGICAL_STREAM_TYPE_NAMED_GRAPHS, + LogicalStreamType.LOGICAL_STREAM_TYPE_TIMESTAMPED_NAMED_GRAPHS, + ), + Seq("GraphsDecoder") + ) + ) + + for + (logicalStreamTypeSet, decoders) <- logicalStreamTypeSets + decoderName <- decoders + do + val lst = logicalStreamTypeSet.head + val (decoderF, pst) = decoderFactories(decoderName) + + f"throw exception when expecting logical type $lst on a stream with no logical type, with $decoderName" in { + val collector = ProtoCollector() + val decoder = decoderF( + collector, + defaultOptions.toBuilder.setLogicalType(lst).build() + ) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(pst) + 
.setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED) + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data.head) + } + + error.getMessage should include("Expected logical stream type") + } + + for lstOfStream <- logicalStreamTypeSet do + f"accept stream with logical type $lstOfStream when expecting $lst, with $decoderName" in { + val collector = ProtoCollector() + val decoder = decoderF( + collector, + defaultOptions.toBuilder.setLogicalType(lst).build() + ) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(pst) + .setLogicalType(lstOfStream) + .build() + )) + + decoder.ingestRow(data.head) + decoder.getStreamOptions.getLogicalType should be (lstOfStream) + } + + for + (pst, decs) <- decoderFactories.groupBy(_._2._2) + (decoderName, (decoderF, _)) <- decs + (lstSet, _) <- logicalStreamTypeSets.take(4).filterNot(x => x._2.exists(y => decs.exists(z => z._1 == y))) + lstOfStream <- lstSet + do + f"throw exception that a stream with logical type $lstOfStream is incompatible with $pst, with $decoderName" in { + val collector = ProtoCollector() + val decoder = decoderF(collector, defaultOptions) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(pst) + .setLogicalType(lstOfStream) + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data.head) + } + + error.getMessage should include("is incompatible with physical stream type") + } + } + + // Test body + "a TriplesDecoder" should { + "decode triple statements" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.triplesDecoder( + collector, + defaultOptions.toBuilder + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + + Triples1 + .encoded( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + 
.setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Triples1.mrl) + } + + "decode triple statements with unset expected logical stream type" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.triplesDecoder(collector) + Triples1 + .encoded( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Triples1.mrl) + } + + "decode triple statements with namespace declarations" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.triplesDecoder( + collector, + defaultOptions.toBuilder + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + + Triples2NsDecl + .encoded( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Triples2NsDecl.mrl.filter(_.isInstanceOf[Triple]).asInstanceOf[Seq[Triple]]) + collector.namespaces.toSeq should be (Seq( + ("test", Iri("https://test.org/test/")), + ("ns2", Iri("https://test.org/ns2/")), + )) + } + + "ignore namespace declarations by default" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder( + collector, + defaultOptions.toBuilder + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + + Triples2NsDecl + .encoded( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + .foreach(row => decoder.ingestRow(row)) + + 
assertDecoded(collector.statements.toSeq, Triples2NsDecl.mrl.filter(_.isInstanceOf[Triple]).asInstanceOf[Seq[Triple]]) + } + + "throw exception on unset logical stream type" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder( + collector, + defaultOptions.toBuilder + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES) + .build() + ) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setLogicalType(LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED) + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data.head) + } + + error.getMessage should include ("Expected logical stream type") + } + + "throw exception on a quad in a TRIPLES stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfQuad("1", "2", "3", "4"), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Unexpected quad row in stream") + } + + // The following cases are for the [[ProtoDecoder]] base class – but tested on the child. + // The code is the same in quads, triples, or graphs decoders, so this is fine. + // Code coverage checks out. 
+ "ignore duplicate stream options" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setRdfStar(true) + .build(), + )) + + decoder.ingestRow(data.head) + decoder.ingestRow(data(1)) + decoder.getStreamOptions should not be null + decoder.getStreamOptions.getRdfStar should be (false) + } + + "throw exception on unset term without preceding value" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfTriple(null, null, null), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Empty term without previous term") + } + + "throw exception on an empty term in a quoted triple" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfTriple("1", "2", rdfTriple(null, null, null)) + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Term value is not set inside a quoted triple") + } + + "throw exception on unset row kind" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(rdfStreamRow()) + } 
+ + error.getMessage should include ("Row kind is not set") + } + + "interpret unset literal kind as a simple literal" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfTriple("1", "2", rdfLiteral("test")), + )) + + decoder.ingestRow(data.head) + decoder.ingestRow(data(1)) + + val r = collector.statements.head.asInstanceOf[Triple] + r.o should be (a[SimpleLiteral]) + } + + // The tests for this logic are in internal.NameDecoderSpec + // Here we are just testing if the exceptions are rethrown correctly. + "throw exception on an invalid IRI term" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.triplesDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfPrefixEntry(0, "a"), + rdfNameEntry(0, "b"), + rdfTriple("1", "2", rdfIri(2, 2)), + )) + + decoder.ingestRow(data.head) + + decoder.ingestRow(data(1)) + decoder.ingestRow(data(2)) + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(3)) + } + + error.getMessage should include ("Error while decoding term") + error.getCause shouldBe a [NullPointerException] + } + } + + "a QuadsDecoder" should { + "decode quad statements" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.quadsDecoder(collector) + + Quads1 + .encoded( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build(), + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Quads1.mrl) + } + + "decode quad statements (repeated default graph)" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.quadsDecoder(collector) + + 
Quads2RepeatDefault + .encoded( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build(), + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Quads2RepeatDefault.mrl) + } + + "throw exception on a triple in a QUADS stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.quadsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build(), + rdfTriple("1", "2", "3"), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Unexpected triple row in stream") + } + + "throw exception on a graph start in a QUADS stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.quadsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build(), + rdfGraphStart(rdfDefaultGraph()), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Unexpected start of graph in stream") + } + + "throw exception on a graph end in a QUADS stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.quadsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build(), + rdfGraphEnd(), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Unexpected end of graph in stream") + } + } + + "a GraphsDecoder" should { + "decode graphs" in { + val collector = 
ProtoCollector() + + val decoder = MockConverterFactory.graphsDecoder(collector) + + Graphs1 + .encoded( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + ) + .foreach(row => decoder.ingestRow(row)) + + for ix <- 0 until collector.statements.size.max(Graphs1.mrl.size) do + val obsRow = collector.statements.applyOrElse(ix, null) + val expRow = Graphs1.mrl.applyOrElse(ix, null) + + withClue(s"Graph row $ix:") { + obsRow should not be null + expRow should not be null + + val obsRowGraph = obsRow.asInstanceOf[Graph] + obsRowGraph.graph should be (expRow._1) + assertDecoded(obsRowGraph.triples.toSeq, expRow._2.toSeq) + } + } + + "throw exception on a quad in a GRAPHS stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.graphsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + rdfQuad("1", "2", "3", "4"), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Unexpected quad row in stream") + } + + "throw exception on a graph end before a graph start" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.graphsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + rdfTriple("1", "2", "3"), + rdfGraphEnd(), + )) + + decoder.ingestRow(data.head) + decoder.ingestRow(data(1)) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(2)) + } + + error.getMessage should include ("End of graph encountered before a start") + } + + // The following cases are for the [[ProtoDecoder]] base class – but tested on the child. 
+ "throw exception on unset graph term in a GRAPHS stream" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.graphsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + rdfGraphStart(), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Empty graph term encountered") + } + } + + "a GraphsAsQuadsDecoder" should { + "decode graphs as quads" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.graphsAsQuadsDecoder(collector) + + Graphs1 + .encoded( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + ) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, Graphs1.mrlQuads) + } + + "throw exception on a triple before a graph start" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.graphsAsQuadsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + rdfTriple("1", "2", "3"), + )) + + decoder.ingestRow(data.head) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(1)) + } + + error.getMessage should include ("Triple in stream without preceding graph start") + } + + // The tests for this logic are in internal.NameDecoderSpec + // Here we are just testing if the exceptions are rethrown correctly. 
+ "throw exception on an invalid IRI term" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.graphsAsQuadsDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build(), + rdfPrefixEntry(0, "a"), + rdfNameEntry(0, "b"), + rdfGraphStart(rdfDefaultGraph()), + rdfTriple("1", "2", rdfIri(2, 2)), + )) + + decoder.ingestRow(data.head) + decoder.ingestRow(data(1)) + decoder.ingestRow(data(2)) + decoder.ingestRow(data(3)) + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data(4)) + } + + error.getMessage should include("Error while decoding term") + error.getCause shouldBe a[NullPointerException] + } + } + + "an AnyStatementDecoder" should { + val cases = Seq( + (Triples1, PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES, "triples", Triples1.mrl), + (Quads1, PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS, "quads", Quads1.mrl), + (Graphs1, PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS, "graphs", Graphs1.mrlQuads), + ) + + for ((testCase, streamType, streamName, expected) <- cases) do + s"decode $streamName" in { + val collector = ProtoCollector() + + val opts = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setVersion(JellyConstants.PROTO_VERSION) + .build() + + val decoder = MockConverterFactory.anyDecoder(collector) + + testCase + .encoded(opts) + .foreach(row => decoder.ingestRow(row)) + + assertDecoded(collector.statements.toSeq, expected) + decoder.getStreamOptions should be (opts) + } + + "should return None when retrieving stream options on an empty stream" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.anyDecoder(collector) + decoder.getStreamOptions should be (null) + } + + "should throw when decoding a row without preceding options" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.anyDecoder(collector) + + val data = 
wrapEncoded(Seq( + rdfTriple("1", "2", "3"), + )) + + val error = intercept[RdfProtoDeserializationError] { + decoder.ingestRow(data.head) + } + + error.getMessage should include ("Stream options are not set") + } + + "should ignore multiple stream options" in { + val collector = ProtoCollector() + + val decoder = MockConverterFactory.anyDecoder(collector) + + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build(), + rdfTriple("1", "2", "3"), + )) + + decoder.ingestRow(data.head) + decoder.ingestRow(data(1)) + decoder.ingestRow(data(2)) + + collector.statements.head should be (a[Triple]) + } + } + + private val streamTypeCases = Seq( + ( + (o: Option[RdfStreamOptions]) => MockConverterFactory.triplesDecoder( + ProtoCollector(), + o.orElse(Some(JellyOptions.DEFAULT_SUPPORTED_OPTIONS)).get + ), + "Triples", + PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES, + PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS + ), + ( + (o: Option[RdfStreamOptions]) => MockConverterFactory.quadsDecoder( + ProtoCollector(), + o.orElse(Some(JellyOptions.DEFAULT_SUPPORTED_OPTIONS)).get + ), + "Quads", + PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS, + PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS + ), + ( + (o: Option[RdfStreamOptions]) => MockConverterFactory.graphsDecoder( + ProtoCollector(), + o.orElse(Some(JellyOptions.DEFAULT_SUPPORTED_OPTIONS)).get + ), + "Graphs", + PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS, + PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS + ), + ( + (o: Option[RdfStreamOptions]) => MockConverterFactory.graphsAsQuadsDecoder( + ProtoCollector(), + o.orElse(Some(JellyOptions.DEFAULT_SUPPORTED_OPTIONS)).get + ), + "GraphsAsQuads", + PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS, + PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES + ), + ( + (o: 
Option[RdfStreamOptions]) => MockConverterFactory.anyDecoder( + ProtoCollector(), + o.orElse(Some(JellyOptions.DEFAULT_SUPPORTED_OPTIONS)).get + ), + "AnyStatement", + PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES, + PhysicalStreamType.PHYSICAL_STREAM_TYPE_UNSPECIFIED + ), + ) + + for (decoderFactory, decName, streamType, invalidStreamType) <- streamTypeCases do + s"a ${decName}Decoder" should { + "throw exception on an empty stream type" in { + val data = wrapEncoded(Seq(JellyOptions.SMALL_GENERALIZED)) + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(None).ingestRow(data.head) + } + + error.getMessage should include ("stream type is not") + } + + "throw exception on an invalid stream type" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(invalidStreamType) + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(None).ingestRow(data.head) + } + + error.getMessage should include ("stream type is not") + } + + "throw exception on an unsupported proto version" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED + .toBuilder + .setPhysicalType(streamType) + .setVersion(JellyConstants.PROTO_VERSION + 1) + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(None).ingestRow(data.head) + } + + error.getMessage should include("Unsupported proto version") + } + + "throw exception on a proto version higher than marked by the user as supported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setVersion(JellyConstants.PROTO_VERSION) + .build() + )) + + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setVersion(JellyConstants.PROTO_VERSION - 1) + .build() + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("Unsupported proto version") + } + + 
"throw exception on a stream with generalized statements if marked as unsupported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .build() + )) + + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setGeneralizedStatements(false) + .build() + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("stream uses generalized statements") + } + + "throw exception on a stream with RDF-star if marked as unsupported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_RDF_STAR.toBuilder + .setPhysicalType(streamType) + .build() + )) + + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setRdfStar(false) + .build() + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("stream uses RDF-star") + } + + "throw exception on a stream with a name table size larger than supported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setMaxNameTableSize(100) + .build() + )) + + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setMaxNameTableSize(80) + .build() + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("name table size of 100") + error.getMessage should include("larger than the maximum supported size of 80") + } + + "throw exception on a stream with a prefix table size larger than supported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setMaxPrefixTableSize(100) + .build() + )) + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setMaxPrefixTableSize(80) + .build() + + val error = intercept[RdfProtoDeserializationError] { + 
decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("prefix table size of 100") + error.getMessage should include("larger than the maximum supported size of 80") + } + + "throw exception on a stream with a datatype table size larger than supported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setMaxDatatypeTableSize(100) + .build() + )) + + val opt = JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setMaxDatatypeTableSize(80) + .build() + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(Some(opt)).ingestRow(data.head) + } + + error.getMessage should include("datatype table size of 100") + error.getMessage should include("larger than the maximum supported size of 80") + } + + "throw exception on a stream with a name table size smaller than supported" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setMaxNameTableSize(2) // 8 is the minimum + .build() + )) + + val error = intercept[RdfProtoDeserializationError] { + decoderFactory(None).ingestRow(data.head) + } + + error.getMessage should include("name table size of 2") + error.getMessage should include("smaller than the minimum supported size of 8") + } + + "accept a datatype table size = 0" in { + val data = wrapEncoded(Seq( + JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(streamType) + .setMaxDatatypeTableSize(0) + .build() + )) + + decoderFactory(None).ingestRow(data.head) // should not throw + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoEncoderSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoEncoderSpec.scala new file mode 100644 index 000000000..f03446f8e --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoEncoderSpec.scala @@ -0,0 +1,188 @@ +package eu.neverblink.jelly.core + +import eu.neverblink.jelly.core.{JellyConstants, JellyOptions, 
NamespaceDeclaration, RdfProtoSerializationError} +import eu.neverblink.jelly.core.helpers.* +import eu.neverblink.jelly.core.helpers.Assertions.* +import eu.neverblink.jelly.core.helpers.Mrl.* +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +import scala.collection.mutable.ListBuffer +import scala.jdk.CollectionConverters.* + +class ProtoEncoderSpec extends AnyWordSpec, Matchers: + import ProtoTestCases.* + import eu.neverblink.jelly.core.ProtoEncoder.Params as Pep + + // Test body + "a ProtoEncoder" should { + "encode triple statements" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + Triples1.mrl.foreach(triple => encoder.handleTriple(triple.s, triple.p, triple.o)) + assertEncoded(buffer.toSeq, Triples1.encoded(options)) + } + + "encode triple statements with ns decls and an external buffer" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setVersion(JellyConstants.PROTO_VERSION) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = true, + appendableRowBuffer = buffer.asJava + )) + + for triple <- Triples2NsDecl.mrl do + triple match + case t: Triple => encoder.handleTriple(t.s, t.p, t.o) + case ns: NamespaceDeclaration => encoder.handleNamespace(ns.prefix, Iri(ns.iri)) + + assertEncoded(buffer.toSeq, Triples2NsDecl.encoded(options)) + } + + "encode quad statements" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + 
.setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + Quads1.mrl.foreach(quad => encoder.handleQuad(quad.s, quad.p, quad.o, quad.g)) + assertEncoded(buffer.toSeq, Quads1.encoded(options)) + } + + "encode quad statements with an external buffer" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + for quad <- Quads1.mrl do + encoder.handleQuad(quad.s, quad.p, quad.o, quad.g) + + assertEncoded(buffer.toSeq, Quads1.encoded(options)) + } + + "encode quad statements (repeated default graph)" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + Quads2RepeatDefault.mrl.foreach(quad => encoder.handleQuad(quad.s, quad.p, quad.o, quad.g)) + assertEncoded(buffer.toSeq, Quads2RepeatDefault.encoded(options)) + } + + "encode graphs with an external buffer" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava 
+ )) + + for (graphName, triples) <- Graphs1.mrl do + encoder.handleGraphStart(graphName) + for triple <- triples do + encoder.handleTriple(triple.s, triple.p, triple.o) + encoder.handleGraphEnd() + + assertEncoded(buffer.toSeq, Graphs1.encoded(options)) + } + + "not allow to end a graph before starting one" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + val error = intercept[RdfProtoSerializationError] { + encoder.handleGraphEnd() + } + + error.getMessage should include ("Cannot end a delimited graph before starting one") + } + + "not allow to use quoted triples as the graph name" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + val error = intercept[RdfProtoSerializationError] { + encoder.handleGraphStart(TripleNode(BlankNode("S"), BlankNode("P"), BlankNode("O"))) + } + + error.getMessage should include ("Cannot encode graph node") + } + + "not allow to use namespace declarations if they are not enabled" in { + val buffer = ListBuffer[RdfStreamRow]() + val options = JellyOptions.SMALL_GENERALIZED.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + + val encoder = MockConverterFactory.encoder(Pep( + options, + enableNamespaceDeclarations = false, + appendableRowBuffer = buffer.asJava + )) + + val error = intercept[RdfProtoSerializationError] { + encoder.handleNamespace("test", Iri("http://example.org/test")) + } + + error.getMessage should include ("Namespace declarations 
are not enabled in this stream") + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTestCases.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTestCases.scala new file mode 100644 index 000000000..39fb1db00 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTestCases.scala @@ -0,0 +1,313 @@ +package eu.neverblink.jelly.core + +import com.google.protobuf.ByteString +import eu.neverblink.jelly.core.{JellyConstants, NamespaceDeclaration} +import eu.neverblink.jelly.core.helpers.Mrl.* +import eu.neverblink.jelly.core.helpers.RdfAdapter.* +import eu.neverblink.jelly.core.proto.v1.* + +object ProtoTestCases: + def wrapEncoded(rows: Seq[RdfStreamRowValue]): Seq[RdfStreamRow] = rows + .map { + case v: RdfStreamOptions => v.getVersion match + // If the version is not set, set it to the current version + case 0 => v.toBuilder + .setVersion(JellyConstants.PROTO_VERSION) + .build() + // Otherwise assume we are checking version compatibility + case _ => v + case v => v + } + .map(rdfStreamRowFromValue) + + trait TestCase[+TStatement]: + def mrl: Seq[TStatement] + def encoded(opt: RdfStreamOptions): Seq[RdfStreamRow] + def encodedFull( + opt: RdfStreamOptions, groupByN: Int, metadata: Map[String, ByteString] = Map.empty + ): Seq[RdfStreamFrame] = + encoded(opt) + .grouped(groupByN) + .map(rows => rdfStreamFrame(rows, metadata = metadata)) + .toSeq + + object Triples1 extends TestCase[Triple]: + val mrl = Seq( + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + ), + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + DtLiteral("123", Datatype("https://test.org/xsd/integer")), + ), + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + TripleNode(Iri("https://test.org/test/subject"), Iri("b"), Iri("c")), + ), + Triple( + Iri("https://test.org/test/predicate"), + 
Iri("https://test.org/test/subject"), + TripleNode(Iri("https://test.org/test/subject"), Iri("b"), Iri("c")), + ), + ) + + def encoded(opt: RdfStreamOptions) = wrapEncoded(Seq( + opt, + rdfPrefixEntry(0, "https://test.org/test/"), + rdfNameEntry(0, "subject"), + rdfNameEntry(0, "predicate"), + rdfPrefixEntry(0, "https://test.org/ns2/"), + rdfNameEntry(0, "object"), + rdfTriple( + rdfIri(1, 0), + rdfIri(0, 0), + rdfIri(2, 0), + ), + rdfDatatypeEntry(0, "https://test.org/xsd/integer"), + rdfTriple( + null, + null, + rdfLiteral("123", 1), + ), + rdfPrefixEntry(0, ""), + rdfNameEntry(0, "b"), + rdfNameEntry(0, "c"), + rdfTriple( + null, + null, + rdfTriple( + rdfIri(1, 1), + rdfIri(3, 4), + rdfIri(0, 0), + ) + ), + rdfTriple( + rdfIri(1, 2), + rdfIri(0, 1), + null, + ), + )) + + object Triples2NsDecl extends TestCase[Triple | NamespaceDeclaration]: + val mrl = Seq( + NamespaceDeclaration("test", "https://test.org/test/"), + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + ), + NamespaceDeclaration("ns2", "https://test.org/ns2/"), + Triple( + Iri("https://test.org/ns2/object"), + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + ), + ) + + def encoded(opt: RdfStreamOptions) = wrapEncoded(Seq( + opt, + rdfPrefixEntry(0, "https://test.org/test/"), + rdfNameEntry(0, ""), + rdfNamespaceDeclaration("test", rdfIri(1, 0)), + rdfNameEntry(0, "subject"), + rdfNameEntry(0, "predicate"), + rdfPrefixEntry(0, "https://test.org/ns2/"), + rdfNameEntry(0, "object"), + rdfTriple( + rdfIri(0, 0), + rdfIri(0, 0), + rdfIri(2, 0), + ), + rdfNamespaceDeclaration("ns2", rdfIri(0, 1)), + rdfTriple( + rdfIri(0, 4), + rdfIri(1, 2), + rdfIri(0, 0), + ), + )) + + object Quads1 extends TestCase[Quad]: + val mrl = Seq( + Quad( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + LangLiteral("test", "en-gb"), + Iri("https://test.org/ns3/graph"), + ), + Quad( 
+ Iri("https://test.org/test/subject"), + BlankNode("blank"), + SimpleLiteral("test"), + Iri("https://test.org/ns3/graph"), + ), + // Generalized quads + Quad( + Iri("https://test.org/test/subject"), + BlankNode("blank"), + SimpleLiteral("test"), + BlankNode("blank"), + ), + Quad( + Iri("https://test.org/test/subject"), + BlankNode("blank"), + SimpleLiteral("test"), + SimpleLiteral("test"), + ), + ) + + def encoded(opt: RdfStreamOptions) = wrapEncoded(Seq( + opt, + rdfPrefixEntry(0, "https://test.org/test/"), + rdfNameEntry(0, "subject"), + rdfNameEntry(0, "predicate"), + rdfPrefixEntry(0, "https://test.org/ns3/"), + rdfNameEntry(0, "graph"), + rdfQuad( + rdfIri(1, 0), + rdfIri(0, 0), + rdfLiteral("test", "en-gb"), + rdfIri(2, 0), + ), + rdfQuad( + null, + "blank", + rdfLiteral("test"), + null, + ), + rdfQuad( + null, + null, + null, + "blank", + ), + rdfQuad( + null, + null, + null, + rdfLiteral("test"), + ), + )) + + object Quads2RepeatDefault extends TestCase[Quad]: + val mrl = Seq( + Quad( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + LangLiteral("test", "en-gb"), + DefaultGraphNode(), + ), + Quad( + Iri("https://test.org/test/subject"), + BlankNode("blank"), + SimpleLiteral("test"), + DefaultGraphNode(), + ), + ) + + def encoded(opt: RdfStreamOptions) = wrapEncoded(Seq( + opt, + rdfPrefixEntry(0, "https://test.org/test/"), + rdfNameEntry(0, "subject"), + rdfNameEntry(0, "predicate"), + rdfQuad( + rdfIri(1, 0), + rdfIri(0, 0), + rdfLiteral("test", "en-gb"), + rdfDefaultGraph(), + ), + rdfQuad( + null, + "blank", + rdfLiteral("test"), + null, + ), + )) + + object Graphs1 extends TestCase[(Node, Iterable[Triple])]: + val mrl = Seq( + ( + DefaultGraphNode(), + Seq( + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + ), + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + DtLiteral("123", 
Datatype("https://test.org/xsd/integer")), + ), + ) + ), + ( + Iri("https://test.org/ns3/graph"), + Seq( + Triple( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + ), + ) + ), + ) + + val mrlQuads = Seq( + Quad( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + DefaultGraphNode() + ), + Quad( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + DtLiteral("123", Datatype("https://test.org/xsd/integer")), + DefaultGraphNode() + ), + Quad( + Iri("https://test.org/test/subject"), + Iri("https://test.org/test/predicate"), + Iri("https://test.org/ns2/object"), + Iri("https://test.org/ns3/graph"), + ), + ) + + def encoded(opt: RdfStreamOptions) = wrapEncoded(Seq( + opt, + rdfGraphStart( + rdfDefaultGraph() + ), + rdfPrefixEntry(0, "https://test.org/test/"), + rdfNameEntry(0, "subject"), + rdfNameEntry(0, "predicate"), + rdfPrefixEntry(0, "https://test.org/ns2/"), + rdfNameEntry(0, "object"), + rdfTriple( + rdfIri(1, 0), + rdfIri(0, 0), + rdfIri(2, 0), + ), + rdfDatatypeEntry(0, "https://test.org/xsd/integer"), + rdfTriple( + null, + null, + rdfLiteral("123", 1), + ), + rdfGraphEnd(), + rdfPrefixEntry(0, "https://test.org/ns3/"), + rdfNameEntry(0, "graph"), + rdfGraphStart( + rdfIri(3, 0) + ), + rdfTriple( + null, + null, + rdfIri(2, 3), + ), + rdfGraphEnd(), + )) diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTranscoderSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTranscoderSpec.scala new file mode 100644 index 000000000..e60c8358b --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/ProtoTranscoderSpec.scala @@ -0,0 +1,393 @@ +package eu.neverblink.jelly.core + +import com.google.protobuf.ByteString +import eu.neverblink.jelly.core.internal.ProtoTranscoderImpl +import eu.neverblink.jelly.core.{JellyConstants, JellyOptions, 
NamespaceDeclaration, RdfProtoDeserializationError, RdfProtoTranscodingError} +import eu.neverblink.jelly.core.ProtoTestCases.* +import eu.neverblink.jelly.core.helpers.RdfAdapter.* +import eu.neverblink.jelly.core.helpers.{MockConverterFactory, Mrl, ProtoCollector} +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.Inspectors +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +import scala.jdk.CollectionConverters.* +import scala.jdk.javaapi.CollectionConverters.asScala +import scala.util.Random + +/** + * Unit tests for the ProtoTranscoder class. + * See also integration tests: [[eu.ostrzyciel.jelly.integration_tests.CrossTranscodingSpec]] + */ +class ProtoTranscoderSpec extends AnyWordSpec, Inspectors, Matchers: + def smallOptions(prefixTableSize: Int) = rdfStreamOptions( + maxNameTableSize = 4, + maxPrefixTableSize = prefixTableSize, + maxDatatypeTableSize = 8, + ) + + val testCases: Seq[(String, PhysicalStreamType, + TestCase[Mrl.Triple | Mrl.Quad | (Mrl.Node, Iterable[Mrl.Triple]) | NamespaceDeclaration] + )] = Seq( + ("Triples1", PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES, Triples1), + ("Triples2NsDecl", PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES, Triples2NsDecl), + ("Quads1", PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS, Quads1), + ("Quads2RepeatDefault", PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS, Quads2RepeatDefault), + ("Graphs1", PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS, Graphs1), + ) + + "ProtoTranscoder" should { + "splice two identical streams" when { + for (caseName, streamType, testCase) <- testCases do + s"input is $caseName" in { + val options: RdfStreamOptions = JellyOptions.SMALL_ALL_FEATURES.toBuilder + .setPhysicalType(streamType) + .build() + val input: RdfStreamFrame = testCase.encodedFull(options, 100).head + val transcoder = new ProtoTranscoderImpl(null, options) + // First frame should be returned as is + val out1 = transcoder.ingestFrame(input) + out1 
shouldEqual input + // What's more, the rows should be the exact same objects (except the options) + forAll(asScala(input.getRowsList).zip(asScala(out1.getRowsList)).drop(1)) { case (in, out) => + in eq out shouldBe true // reference equality + } + + val out2 = transcoder.ingestFrame(input) + out2.getRowsList.size shouldBe < (input.getRowsList.size) + // No row in out2 should be an options row or a lookup entry row + forAll(asScala(out2.getRowsList)) { (row: RdfStreamRow) => + row.hasOptions shouldBe false + row.hasPrefix shouldBe false + row.hasName shouldBe false + row.hasDatatype shouldBe false + } + + // If there is a row in out2 with same content as in input, it should be the same object + var identicalRows = 0 + forAll(asScala(input.getRowsList)) { (row: RdfStreamRow) => + val sameRows = asScala(out2.getRowsList).filter(_ == row) + if sameRows.nonEmpty then + forAtLeast(1, sameRows) { (sameRow: RdfStreamRow) => + sameRow eq row shouldBe true + identicalRows += 1 + } + } + // Something should be identical + identicalRows shouldBe > (0) + + // Decode the output + val collector = ProtoCollector() + val decoder = MockConverterFactory.anyDecoder(collector) + asScala(out1.getRowsList).foreach(decoder.ingestRow) + asScala(out2.getRowsList).foreach(decoder.ingestRow) + + val statements1 = collector.statements.slice(0, collector.statements.size / 2) + val statements2 = collector.statements.slice(collector.statements.size / 2, collector.statements.size) + statements1 shouldEqual statements2 + } + } + + "splice multiple identical streams" when { + for (caseName, streamType, testCase) <- testCases do + s"input is $caseName" in { + val options: RdfStreamOptions = JellyOptions.SMALL_ALL_FEATURES.toBuilder + .setPhysicalType(streamType) + .build() + + val input: RdfStreamFrame = testCase.encodedFull(options, 100).head + val transcoder = new ProtoTranscoderImpl(null, options) + val out1 = transcoder.ingestFrame(input) + var lastOut = out1 + for i <- 1 to 100 do + val outN = 
transcoder.ingestFrame(input) + outN.getRowsList.size shouldBe < (input.getRowsList.size) + // No row in out should be an options row or a lookup entry row + forAll(asScala(outN.getRowsList)) { (row: RdfStreamRow) => + row.hasOptions shouldBe false + row.hasPrefix shouldBe false + row.hasName shouldBe false + row.hasDatatype shouldBe false + } + if i != 1 then + outN shouldBe lastOut + lastOut = outN + } + } + + "splice multiple different streams" when { + for seed <- 1 to 20 do + f"random seed is $seed" in { + val collector = ProtoCollector() + val decoder = MockConverterFactory.quadsDecoder(collector) + val options = JellyOptions.SMALL_ALL_FEATURES.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build() + + val transcoder = new ProtoTranscoderImpl(null, options) + val possibleCases = Seq(Quads1, Quads2RepeatDefault) + val random = Random(seed) + val usedIndices = Array.ofDim[Int](possibleCases.size) + + for i <- 1 to 100 do + val index = random.nextInt(possibleCases.size) + usedIndices(index) += 1 + val testCase = possibleCases(index) + val out = transcoder.ingestFrame(testCase.encodedFull(options, 100).head) + + if usedIndices(index) > 1 then + // No row in out should be an options row or a lookup entry row + forAll(asScala(out.getRowsList)) { (row: RdfStreamRow) => + row.hasOptions shouldBe false + row.hasPrefix shouldBe false + row.hasName shouldBe false + row.hasDatatype shouldBe false + } + + asScala(out.getRowsList).foreach(decoder.ingestRow) + collector.statements shouldBe testCase.mrl + collector.clear() + } + } + + "handle named graphs" in { + val options = JellyOptions.SMALL_STRICT.toBuilder + .setMaxPrefixTableSize(0) + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_GRAPHS) + .setVersion(JellyConstants.PROTO_VERSION) + .build() + + val input: Seq[RdfStreamRow] = Seq[RdfStreamRow]( + rdfStreamRow(options), + rdfStreamRow(rdfNameEntry(0, "some IRI")), + rdfStreamRow(rdfNameEntry(4, "some IRI 2")), + 
rdfStreamRow(rdfGraphStart(rdfIri(0, 0))), + rdfStreamRow(rdfGraphStart(rdfIri(0, 4))), + ) + + val expectedOutput: Seq[RdfStreamRow] = Seq[RdfStreamRow]( + rdfStreamRow(options), + rdfStreamRow(rdfNameEntry(0, "some IRI")), + // ID 4 should be remapped to 2 + rdfStreamRow(rdfNameEntry(0, "some IRI 2")), + rdfStreamRow(rdfGraphStart(rdfIri(0, 0))), + rdfStreamRow(rdfGraphStart(rdfIri(0, 0))), + ) + + val transcoder = new ProtoTranscoderImpl(null, options) + + input.flatMap(entry => transcoder.ingestRow(entry).asScala) shouldBe expectedOutput + } + + "remap prefix, name, and datatype IDs" in { + val options = JellyOptions.SMALL_STRICT.toBuilder + .setVersion(JellyConstants.PROTO_VERSION) + .build() + + val input: Seq[RdfStreamRow] = Seq( + rdfStreamRow(options), + rdfStreamRow(rdfNameEntry(4, "some name")), + rdfStreamRow(rdfPrefixEntry(4, "some prefix")), + rdfStreamRow(rdfDatatypeEntry(4, "some IRI")), + rdfStreamRow(rdfTriple( + rdfTriple( + rdfIri(4, 4), + rdfIri(0, 4), + rdfLiteral("some literal", 4), + ), + rdfIri(0, 4), + rdfLiteral("some literal", 0), + )), + rdfStreamRow(rdfTriple( + rdfTriple("", "", ""), + rdfIri(0, 4), + rdfLiteral("some literal", 0), + )), + ) + + val expectedOutput: Seq[RdfStreamRow] = Seq( + rdfStreamRow(options), + rdfStreamRow(rdfNameEntry(0, "some name")), + rdfStreamRow(rdfPrefixEntry(0, "some prefix")), + rdfStreamRow(rdfDatatypeEntry(0, "some IRI")), + rdfStreamRow(rdfTriple( + rdfTriple( + rdfIri(1, 0), + rdfIri(0, 1), + rdfLiteral("some literal", 1), + ), + rdfIri(0, 1), + rdfLiteral("some literal", 0), + )), + rdfStreamRow(rdfTriple( + rdfTriple("", "", ""), + rdfIri(0, 1), + rdfLiteral("some literal", 0), + )), + ) + + val transcoder = new ProtoTranscoderImpl(null, options) + val output = input.flatMap(entry => transcoder.ingestRow(entry).asScala) + + output.size shouldBe expectedOutput.size + + for (i <- input.indices) do + output(i) shouldBe expectedOutput(i) + } + + "maintain protocol version 1 if input uses it" in { + 
val options = JellyOptions.SMALL_STRICT.toBuilder + .setVersion(JellyConstants.PROTO_VERSION_1_0_X) + .build() + + val input = rdfStreamRow(options) + val transcoder = new ProtoTranscoderImpl( + null, + options.toBuilder + .setVersion(JellyConstants.PROTO_VERSION) + .build() + ) + + val output = transcoder.ingestRow(input).asScala + output.head shouldBe input + } + + "throw an exception on a null row" in { + val transcoder = new ProtoTranscoderImpl(null, JellyOptions.SMALL_STRICT) + val ex = intercept[RdfProtoTranscodingError] { + transcoder.ingestRow(rdfStreamRow()) + } + ex.getMessage should include ("Row kind is not set") + } + + "throw an exception on mismatched physical types if checking is enabled" in { + val transcoder = new ProtoTranscoderImpl( + JellyOptions.DEFAULT_SUPPORTED_OPTIONS, + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + ) + + val ex = intercept[RdfProtoTranscodingError] { + transcoder.ingestRow(rdfStreamRow( + JellyOptions.SMALL_STRICT + .toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build() + )) + } + + ex.getMessage should include ("Input stream has a different physical type than the output") + ex.getMessage should include ("PHYSICAL_STREAM_TYPE_QUADS") + ex.getMessage should include ("PHYSICAL_STREAM_TYPE_TRIPLES") + } + + "not throw an exception on mismatched physical types if checking is disabled" in { + val transcoder = new ProtoTranscoderImpl( + null, + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + ) + + transcoder.ingestRow(rdfStreamRow( + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_QUADS) + .build() + )) + } + + "throw an exception on unsupported options if checking is enabled" in { + val transcoder = new ProtoTranscoderImpl( + // Mark the prefix table as disabled + JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder 
+ .setMaxPrefixTableSize(0) + .build(), + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + ) + + val ex = intercept[RdfProtoDeserializationError] { + transcoder.ingestRow(rdfStreamRow( + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + )) + } + + ex.getMessage should include ("larger than the maximum supported size") + } + + "throw an exception if the input does not use prefixes but the output does" in { + val transcoder = new ProtoTranscoderImpl( + null, + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .build() + ) + + val ex = intercept[RdfProtoTranscodingError] { + transcoder.ingestRow(rdfStreamRow( + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setMaxPrefixTableSize(0) + .build() + )) + } + + ex.getMessage should include ("Output stream uses prefixes, but the input stream does not") + } + + "accept an input stream with valid options if checking is enabled" in { + val transcoder = new ProtoTranscoderImpl( + // Mark the prefix table as disabled + JellyOptions.DEFAULT_SUPPORTED_OPTIONS.toBuilder + .setMaxPrefixTableSize(0) + .build(), + JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setMaxPrefixTableSize(0) + .build(), + ) + + val inputOptions = JellyOptions.SMALL_STRICT.toBuilder + .setPhysicalType(PhysicalStreamType.PHYSICAL_STREAM_TYPE_TRIPLES) + .setMaxPrefixTableSize(0) + .build() + + transcoder.ingestRow(rdfStreamRow(inputOptions)) + } + + "preserve lack of metadata in a frame (1.1.1)" in { + val transcoder = new ProtoTranscoderImpl(null, JellyOptions.SMALL_STRICT) + val input = rdfStreamFrame( + rows = Seq(rdfStreamRow( + JellyOptions.SMALL_STRICT.toBuilder + .setVersion(JellyConstants.PROTO_VERSION_1_1_X) + .build() + )), + ) + val output = 
transcoder.ingestFrame(input) + output.getMetadataMap.size should be (0) + } + + "preserve metadata in a frame (1.1.1)" in { + val transcoder = new ProtoTranscoderImpl(null, JellyOptions.SMALL_STRICT) + val input = rdfStreamFrame( + rows = Seq(rdfStreamRow( + JellyOptions.SMALL_STRICT.toBuilder + .setVersion(JellyConstants.PROTO_VERSION_1_1_X) + .build() + )), + metadata = Map( + "key1" -> ByteString.copyFromUtf8("value"), + "key2" -> ByteString.copyFromUtf8("value2"), + ), + ) + val output = transcoder.ingestFrame(input) + output.getMetadataMap.size should be (2) + output.getMetadataMap.asScala("key1").toStringUtf8 should be ("value") + output.getMetadataMap.asScala("key2").toStringUtf8 should be ("value2") + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Assertions.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Assertions.scala new file mode 100644 index 000000000..1db004b2e --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Assertions.scala @@ -0,0 +1,26 @@ +package eu.neverblink.jelly.core.helpers + +import eu.neverblink.jelly.core.helpers.Mrl.{Statement} +import eu.neverblink.jelly.core.helpers.RdfAdapter.extractRdfStreamRow +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +object Assertions extends AnyWordSpec, Matchers: + def assertEncoded(observed: Seq[RdfStreamRow], expected: Seq[RdfStreamRow]): Unit = + for ix <- 0 until observed.size.min(expected.size) do + withClue(s"Row $ix:") { + val obsRow = extractRdfStreamRow(observed.applyOrElse(ix, null)) + val expRow = extractRdfStreamRow(expected.applyOrElse(ix, null)) + obsRow should be(expRow) + } + observed.size should be(expected.size) + + def assertDecoded(observed: Seq[Statement], expected: Seq[Statement]): Unit = + for ix <- 0 until observed.size.min(expected.size) do + withClue(s"Row $ix:") { + val obsRow = observed.applyOrElse(ix, null) + val 
expRow = expected.applyOrElse(ix, null) + obsRow should be(expRow) + } + observed.size should be(expected.size) diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockConverterFactory.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockConverterFactory.scala new file mode 100644 index 000000000..46fbb97e8 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockConverterFactory.scala @@ -0,0 +1,46 @@ +package eu.neverblink.jelly.core.helpers + +import eu.neverblink.jelly.core.RdfHandler.* +import eu.neverblink.jelly.core.internal.ProtoDecoderImpl.* +import eu.neverblink.jelly.core.internal.ProtoEncoderImpl +import eu.neverblink.jelly.core.{JellyOptions, ProtoDecoderConverter, ProtoEncoder, ProtoEncoderConverter} +import eu.neverblink.jelly.core.helpers.Mrl.* +import eu.neverblink.jelly.core.proto.v1.* + +import scala.jdk.FunctionConverters.* + +object MockConverterFactory extends MockConverterFactory + +trait MockConverterFactory: + + final def encoderConverter: ProtoEncoderConverter[Node] = MockProtoEncoderConverter() + + final def decoderConverter: ProtoDecoderConverter[Node, Datatype] = new MockProtoDecoderConverter() + + final def encoder(params: ProtoEncoder.Params): ProtoEncoder[Node] = + new ProtoEncoderImpl[Node](encoderConverter, params) + + final def triplesDecoder( + handler: TripleHandler[Node], + options: RdfStreamOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + ): TriplesDecoder[Node, Datatype] = TriplesDecoder[Node, Datatype](decoderConverter, handler, options) + + final def quadsDecoder( + handler: QuadHandler[Node], + options: RdfStreamOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + ): QuadsDecoder[Node, Datatype] = QuadsDecoder[Node, Datatype](decoderConverter, handler, options) + + final def graphsDecoder( + handler: GraphHandler[Node], + options: RdfStreamOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + ): GraphsDecoder[Node, Datatype] = GraphsDecoder[Node, 
Datatype](decoderConverter, handler, options) + + final def graphsAsQuadsDecoder( + handler: QuadHandler[Node], + options: RdfStreamOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + ): GraphsAsQuadsDecoder[Node, Datatype] = GraphsAsQuadsDecoder[Node, Datatype](decoderConverter, handler, options) + + final def anyDecoder( + handler: AnyStatementHandler[Node], + options: RdfStreamOptions = JellyOptions.DEFAULT_SUPPORTED_OPTIONS + ): AnyStatementDecoder[Node, Datatype] = AnyStatementDecoder[Node, Datatype](decoderConverter, handler, options) diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoDecoderConverter.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoDecoderConverter.scala new file mode 100644 index 000000000..ff7e7c4d7 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoDecoderConverter.scala @@ -0,0 +1,18 @@ +package eu.neverblink.jelly.core.helpers + +import eu.neverblink.jelly.core.ProtoDecoderConverter +import eu.neverblink.jelly.core.helpers.Mrl.* + +/** + * Mock implementation of [[ProtoDecoderConverter]].
+ */
+class MockProtoDecoderConverter extends ProtoDecoderConverter[Node, Datatype]:
+  // IRIs, blank nodes and the default graph marker
+  def makeIriNode(iri: String) = Iri(iri)
+  def makeBlankNode(label: String) = BlankNode(label)
+  def makeDefaultGraphNode(): Node = DefaultGraphNode()
+
+  // Literals and the datatype wrapper carried by datatype literals
+  def makeDatatype(dt: String) = Datatype(dt)
+  def makeSimpleLiteral(lex: String) = SimpleLiteral(lex)
+  def makeLangLiteral(lex: String, lang: String) = LangLiteral(lex, lang)
+  def makeDtLiteral(lex: String, dt: Datatype) = DtLiteral(lex, dt)
+
+  // Triples used as terms
+  def makeTripleNode(s: Node, p: Node, o: Node) = TripleNode(s, p, o)
diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoEncoderConverter.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoEncoderConverter.scala
new file mode 100644
index 000000000..cc9f6a338
--- /dev/null
+++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/MockProtoEncoderConverter.scala
@@ -0,0 +1,35 @@
+package eu.neverblink.jelly.core.helpers
+
+import eu.neverblink.jelly.core.{NodeEncoder, ProtoEncoderConverter, RdfProtoSerializationError, RdfTerm}
+import eu.neverblink.jelly.core.*
+import eu.neverblink.jelly.core.helpers.Mrl.*
+import eu.neverblink.jelly.core.proto.v1.*
+
+import scala.collection.mutable
+
+/**
+ * Test double for ProtoEncoderConverter: translates Mrl nodes into calls on a NodeEncoder.
+ */
+class MockProtoEncoderConverter extends ProtoEncoderConverter[Node]:
+
+  /**
+   * Encodes a node as an RdfTerm.SpoTerm. A TripleNode is encoded recursively, component
+   * by component. Any node kind not listed below (in Mrl that is DefaultGraphNode) is
+   * rejected with an RdfProtoSerializationError.
+   */
+  override def nodeToProto(encoder: NodeEncoder[Node], node: Node): RdfTerm.SpoTerm =
+    node match
+      case BlankNode(id) => encoder.makeBlankNode(id)
+      case Iri(value) => encoder.makeIri(value)
+      case SimpleLiteral(lexical) => encoder.makeSimpleLiteral(lexical)
+      case LangLiteral(lexical, language) => encoder.makeLangLiteral(node, lexical, language)
+      case DtLiteral(lexical, datatype) => encoder.makeDtLiteral(node, lexical, datatype.dt)
+      case TripleNode(subj, pred, obj) =>
+        // Components are encoded in subject, predicate, object order.
+        val s = nodeToProto(encoder, subj)
+        val p = nodeToProto(encoder, pred)
+        val o = nodeToProto(encoder, obj)
+        encoder.makeQuotedTriple(s, p, o)
+      case _ => throw RdfProtoSerializationError(s"Cannot encode node: $node")
+
+  /**
+   * Encodes a node as an RdfTerm.GraphTerm. Accepts the same IRI, blank node and literal
+   * kinds as nodeToProto, plus DefaultGraphNode. Any other node kind (in Mrl that is
+   * TripleNode) is rejected with an RdfProtoSerializationError.
+   */
+  override def graphNodeToProto(encoder: NodeEncoder[Node], node: Node): RdfTerm.GraphTerm =
+    node match
+      case DefaultGraphNode() => NodeEncoder.makeDefaultGraph
+      case BlankNode(id) => encoder.makeBlankNode(id)
+      case Iri(value) => encoder.makeIri(value)
+      case SimpleLiteral(lexical) => encoder.makeSimpleLiteral(lexical)
+      case LangLiteral(lexical, language) => encoder.makeLangLiteral(node, lexical, language)
+      case DtLiteral(lexical, datatype) => encoder.makeDtLiteral(node, lexical, datatype.dt)
+      case _ => throw RdfProtoSerializationError(s"Cannot encode graph node: $node")
diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Mrl.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Mrl.scala
new file mode 100644
index 000000000..0deca0b0d
--- /dev/null
+++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/Mrl.scala
@@ -0,0 +1,22 @@
+package eu.neverblink.jelly.core.helpers
+
+/**
+ * "Mrl" is short for "mock RDF library": a minimal set of immutable node and
+ * statement types for the tests to work with.
+ */
+object Mrl:
+  /** Datatype identifier carried by a DtLiteral. */
+  final case class Datatype(dt: String)
+
+  /** Closed family of terms; every node kind is a case class below. */
+  sealed trait Node
+  final case class Iri(iri: String) extends Node
+  final case class BlankNode(label: String) extends Node
+  final case class SimpleLiteral(lex: String) extends Node
+  final case class LangLiteral(lex: String, lang: String) extends Node
+  final case class DtLiteral(lex: String, dt: Datatype) extends Node
+  final case class TripleNode(s: Node, p: Node, o: Node) extends Node
+  final case class DefaultGraphNode() extends Node
+
+  /** Closed family of statements: a triple, a quad, or a graph with its triples. */
+  sealed trait Statement
+  final case class Triple(s: Node, p: Node, o: Node) extends Statement
+  final case class Quad(s: Node, p: Node, o: Node, g: Node) extends Statement
+  final case class Graph(graph: Node, triples: Seq[Triple]) extends Statement
+  
\ No newline at end of file
diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/ProtoCollector.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/ProtoCollector.scala
new file mode 100644
index 000000000..825eb5ef3
--- /dev/null
+++ 
b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/ProtoCollector.scala
@@ -0,0 +1,41 @@
+package eu.neverblink.jelly.core.helpers
+
+import eu.neverblink.jelly.core.RdfHandler.AnyRdfHandler
+import eu.neverblink.jelly.core.helpers.Mrl.*
+
+import java.util
+import scala.collection.mutable
+import scala.jdk.javaapi.CollectionConverters
+import scala.jdk.javaapi.CollectionConverters.asScala
+
+/**
+ * RDF handler for tests that records every callback it receives.
+ *
+ * Namespace declarations are appended to `namespaces`; triples, quads and graphs are
+ * appended to `statements` in arrival order. Triples received between handleGraphStart
+ * and handleGraphEnd are buffered and emitted as one Graph statement when the graph ends.
+ */
+final class ProtoCollector extends AnyRdfHandler[Node]:
+  val namespaces: mutable.ListBuffer[(String, Node)] = mutable.ListBuffer.empty
+  val statements: mutable.ListBuffer[Statement] = mutable.ListBuffer.empty
+
+  // Graph currently being collected; None when triples go straight to `statements`.
+  private var currentGraph: Option[Node] = None
+  // Triples received since the last handleGraphStart.
+  private val currentGraphTripleBuffer = mutable.ListBuffer.empty[Triple]
+
+  override def handleNamespace(prefix: String, namespace: Node): Unit =
+    namespaces += ((prefix, namespace))
+
+  override def handleTriple(subject: Node, predicate: Node, `object`: Node): Unit =
+    if currentGraph.isDefined then
+      currentGraphTripleBuffer += Triple(subject, predicate, `object`)
+    else
+      statements += Triple(subject, predicate, `object`)
+
+  override def handleQuad(subject: Node, predicate: Node, `object`: Node, graph: Node): Unit =
+    statements += Quad(subject, predicate, `object`, graph)
+
+  override def handleGraphStart(graph: Node): Unit =
+    currentGraph = Some(graph)
+
+  override def handleGraphEnd(): Unit =
+    // A graph is only recorded if it received at least one triple; empty graphs are dropped.
+    if currentGraphTripleBuffer.nonEmpty then
+      statements += Graph(currentGraph.get, currentGraphTripleBuffer.toSeq)
+    // Always leave graph mode, even for an empty graph, so that later stand-alone
+    // triples are not buffered against a graph that has already ended.
+    currentGraphTripleBuffer.clear()
+    currentGraph = None
+
+  /** Resets the collector, including any graph that was started but not yet ended. */
+  def clear(): Unit =
+    namespaces.clear()
+    statements.clear()
+    currentGraphTripleBuffer.clear()
+    currentGraph = None
diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/RdfAdapter.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/RdfAdapter.scala
new file mode 100644
index 000000000..28254d386
--- /dev/null
+++ b/core-java/src/test/scala/eu/neverblink/jelly/core/helpers/RdfAdapter.scala
@@ -0,0 +1,269 @@
+package eu.neverblink.jelly.core.helpers
+
+import 
com.google.protobuf.ByteString
+import eu.neverblink.jelly.core.proto.v1.*
+
+import scala.jdk.CollectionConverters.*
+
+
+/**
+ * Factory helpers that wrap the generated protobuf builders, so tests can create
+ * messages with a single call instead of spelling out newBuilder()/build() chains.
+ */
+object RdfAdapter:
+
+  def rdfNameEntry(id: Int, value: String): RdfNameEntry =
+    RdfNameEntry.newBuilder()
+      .setId(id)
+      .setValue(value)
+      .build()
+
+  def rdfPrefixEntry(id: Int, value: String): RdfPrefixEntry =
+    RdfPrefixEntry.newBuilder()
+      .setId(id)
+      .setValue(value)
+      .build()
+
+  def rdfDatatypeEntry(id: Int, value: String): RdfDatatypeEntry =
+    RdfDatatypeEntry.newBuilder()
+      .setId(id)
+      .setValue(value)
+      .build()
+
+  def rdfNamespaceDeclaration(name: String, value: RdfIri): RdfNamespaceDeclaration =
+    RdfNamespaceDeclaration.newBuilder()
+      .setName(name)
+      .setValue(value)
+      .build()
+
+  /** Literal with only a lexical form. */
+  def rdfLiteral(lex: String): RdfLiteral =
+    RdfLiteral.newBuilder()
+      .setLex(lex)
+      .build()
+
+  /** Literal with a lexical form and a language tag. */
+  def rdfLiteral(lex: String, langtag: String): RdfLiteral =
+    RdfLiteral.newBuilder()
+      .setLex(lex)
+      .setLangtag(langtag)
+      .build()
+
+  /** Literal with a lexical form and a datatype ID. */
+  def rdfLiteral(lex: String, datatype: Int): RdfLiteral =
+    RdfLiteral.newBuilder()
+      .setLex(lex)
+      .setDatatype(datatype)
+      .build()
+
+  /** Note the parameter order: prefix ID first, then name ID. */
+  def rdfIri(prefixId: Int, nameId: Int): RdfIri =
+    RdfIri.newBuilder()
+      .setNameId(nameId)
+      .setPrefixId(prefixId)
+      .build()
+
+  def rdfStreamFrame(rows: Seq[RdfStreamRow], metadata: Map[String, ByteString] = Map.empty): RdfStreamFrame =
+    RdfStreamFrame.newBuilder()
+      .addAllRows(rows.asJava)
+      .putAllMetadata(metadata.asJava)
+      .build()
+
+  /** Any payload a stream row can carry; null stands for a row with no payload. */
+  type RdfStreamRowValue =
+    RdfStreamOptions
+    | RdfTriple
+    | RdfQuad
+    | RdfGraphStart
+    | RdfGraphEnd
+    | RdfNamespaceDeclaration
+    | RdfNameEntry
+    | RdfPrefixEntry
+    | RdfDatatypeEntry
+    | Null
+
+  /**
+   * Wraps a payload in a stream row, dispatching on its runtime type.
+   * A null payload yields the empty row, mirroring extractRdfStreamRow, which
+   * returns null for a row without payload.
+   */
+  def rdfStreamRowFromValue(value: RdfStreamRowValue): RdfStreamRow =
+    value match {
+      case v: RdfStreamOptions => rdfStreamRow(v)
+      case v: RdfTriple => rdfStreamRow(v)
+      case v: RdfQuad => rdfStreamRow(v)
+      case v: RdfGraphStart => rdfStreamRow(v)
+      case v: RdfGraphEnd => rdfStreamRow(v)
+      case v: RdfNamespaceDeclaration => rdfStreamRow(v)
+      case v: RdfNameEntry => rdfStreamRow(v)
+      case v: RdfPrefixEntry => rdfStreamRow(v)
+      case v: RdfDatatypeEntry => rdfStreamRow(v)
+      // Null is part of RdfStreamRowValue; without this case it raised a MatchError.
+      case null => rdfStreamRow()
+    }
+
+  def rdfStreamRow(row: RdfNameEntry): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setName(row)
+      .build()
+
+  def rdfStreamRow(row: RdfPrefixEntry): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setPrefix(row)
+      .build()
+
+  def rdfStreamRow(row: RdfStreamOptions): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setOptions(row)
+      .build()
+
+  def rdfStreamRow(row: RdfTriple): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setTriple(row)
+      .build()
+
+  def rdfStreamRow(row: RdfQuad): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setQuad(row)
+      .build()
+
+  def rdfStreamRow(row: RdfGraphStart): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setGraphStart(row)
+      .build()
+
+  def rdfStreamRow(row: RdfGraphEnd): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setGraphEnd(row)
+      .build()
+
+  def rdfStreamRow(row: RdfNamespaceDeclaration): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setNamespace(row)
+      .build()
+
+  def rdfStreamRow(row: RdfDatatypeEntry): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .setDatatype(row)
+      .build()
+
+  /** Row with no payload set. */
+  def rdfStreamRow(): RdfStreamRow =
+    RdfStreamRow.newBuilder()
+      .build()
+
+  /** Stream options; every table size defaults to 1 and the stream name to "". */
+  def rdfStreamOptions(
+    streamName: String = "",
+    maxNameTableSize: Int = 1,
+    maxPrefixTableSize: Int = 1,
+    maxDatatypeTableSize: Int = 1,
+  ): RdfStreamOptions =
+    RdfStreamOptions.newBuilder()
+      .setStreamName(streamName)
+      .setMaxNameTableSize(maxNameTableSize)
+      .setMaxPrefixTableSize(maxPrefixTableSize)
+      .setMaxDatatypeTableSize(maxDatatypeTableSize)
+      .build()
+
+  def rdfDefaultGraph(): RdfDefaultGraph =
+    RdfDefaultGraph.newBuilder()
+      .build()
+
+  /** Values accepted in graph position: a String is a blank node label, null leaves the field unset. */
+  type RdfGraphValue =
+    RdfIri
+    | String
+    | RdfDefaultGraph
+    | RdfLiteral
+    | Null
+
+  /**
+   * Graph start row for the given graph term. A null graph leaves the graph field
+   * unset, consistent with how rdfQuad treats a null graph.
+   */
+  def rdfGraphStart(graph: RdfGraphValue): RdfGraphStart = {
+    val builder = RdfGraphStart.newBuilder()
+
+    // Null is part of RdfGraphValue; without this guard it raised a MatchError.
+    if graph != null then
+      graph match
+        case g: RdfIri => builder.setGIri(g)
+        case g: String => builder.setGBnode(g)
+        case g: RdfDefaultGraph => builder.setGDefaultGraph(g)
+        case g: RdfLiteral => builder.setGLiteral(g)
+
+    builder.build()
+  }
+
+  def rdfGraphStart(): RdfGraphStart =
+    RdfGraphStart.newBuilder()
+      .build()
+
+  def rdfGraphEnd(): RdfGraphEnd =
+    RdfGraphEnd.newBuilder()
+      .build()
+
+  /** Builds a quad; any term passed as null is left unset in the message. */
+  def rdfQuad(subject: RdfSpoValue, predicate: RdfSpoValue, `object`: RdfSpoValue, graph: RdfGraphValue): RdfQuad = {
+    var builder = RdfQuad.newBuilder()
+
+    if subject != null then
+      subject match
+        case s: RdfIri => builder = builder.setSIri(s)
+        case s: String => builder = builder.setSBnode(s)
+        case s: RdfLiteral => builder = builder.setSLiteral(s)
+        case s: RdfTriple => builder = builder.setSTripleTerm(s)
+
+    if predicate != null then
+      predicate match
+        case p: RdfIri => builder = builder.setPIri(p)
+        case p: String => builder = builder.setPBnode(p)
+        case p: RdfLiteral => builder = builder.setPLiteral(p)
+        case p: RdfTriple => builder = builder.setPTripleTerm(p)
+
+    if `object` != null then
+      `object` match
+        case o: RdfIri => builder = builder.setOIri(o)
+        case o: String => builder = builder.setOBnode(o)
+        case o: RdfLiteral => builder = builder.setOLiteral(o)
+        case o: RdfTriple => builder = builder.setOTripleTerm(o)
+
+    if graph != null then
+      graph match
+        case g: RdfIri => builder = builder.setGIri(g)
+        case g: String => builder = builder.setGBnode(g)
+        case g: RdfDefaultGraph => builder = builder.setGDefaultGraph(g)
+        case g: RdfLiteral => builder = builder.setGLiteral(g)
+
+    builder.build()
+  }
+
+  /** Values accepted in subject/predicate/object position: a String is a blank node label, null leaves the field unset. */
+  type RdfSpoValue =
+    RdfIri
+    | String
+    | RdfLiteral
+    | RdfTriple
+    | Null
+
+  /** Builds a triple; any term passed as null is left unset in the message. */
+  def rdfTriple(subject: RdfSpoValue, predicate: RdfSpoValue, `object`: RdfSpoValue): RdfTriple = {
+    var builder = RdfTriple.newBuilder()
+
+    if subject != null then
+      subject match
+        case s: RdfIri => builder = builder.setSIri(s)
+        case s: String => builder = builder.setSBnode(s)
+        case s: RdfLiteral => builder = builder.setSLiteral(s)
+        case s: RdfTriple => builder = builder.setSTripleTerm(s)
+
+    if predicate != null then
+      predicate match
+        case p: RdfIri => builder = builder.setPIri(p)
+        case p: String => builder = builder.setPBnode(p)
+        case p: RdfLiteral => builder = builder.setPLiteral(p)
+        case p: RdfTriple => builder = builder.setPTripleTerm(p)
+
+    if `object` != null then
+      `object` match
+        case o: RdfIri => builder = builder.setOIri(o)
+        case o: String => builder = builder.setOBnode(o)
+        case o: RdfLiteral => builder = builder.setOLiteral(o)
+        case o: RdfTriple => builder = builder.setOTripleTerm(o)
+
+    builder.build()
+  }
+
+  /** Returns the payload of a row, or null when none of the fields checked below is set. */
+  def extractRdfStreamRow(row: RdfStreamRow): RdfStreamRowValue =
+    if row.hasOptions then
+      row.getOptions
+    else if row.hasName then
+      row.getName
+    else if row.hasPrefix then
+      row.getPrefix
+    else if row.hasTriple then
+      row.getTriple
+    else if row.hasQuad then
+      row.getQuad
+    else if row.hasGraphStart then
+      row.getGraphStart
+    else if row.hasGraphEnd then
+      row.getGraphEnd
+    else if row.hasNamespace then
+      row.getNamespace
+    else if row.hasDatatype then
+      row.getDatatype
+    else null
diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/internal/EncoderLookupSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/EncoderLookupSpec.scala
new file mode 100644
index 000000000..9ad3844df
--- /dev/null
+++ b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/EncoderLookupSpec.scala
@@ -0,0 +1,136 @@
+package eu.neverblink.jelly.core.internal
+
+import org.scalatest.Inspectors
+import org.scalatest.matchers.should.Matchers
+import org.scalatest.wordspec.AnyWordSpec
+
+import scala.util.Random
+
+/**
+ * Unit tests for EncoderLookup. Each test checks the entry returned by getOrAddEntry
+ * (getId, setId, newEntry) and, where enabled, the per-slot value in `serials`.
+ */
+class EncoderLookupSpec extends AnyWordSpec, Matchers:
+  // Fixed seed so the randomized tests draw the same sequence on every run.
+  Random.setSeed(123)
+
+  "encoder lookup" should {
+    "add new entries up to capacity" in {
+      val lookup = EncoderLookup(4, true)
+      for i <- 1 to 4 do
+        val v = lookup.getOrAddEntry(s"v$i")
+        v.getId should be (i)
+        v.setId should be (0)
+        v.newEntry should be (true)
+        lookup.serials(v.getId) should be (1)
+    }
+
+    "retrieve entries" in {
+      val lookup = EncoderLookup(4, true)
+      for i <- 1 to 4 do
+        lookup.getOrAddEntry(s"v$i")
+      for i <- 1 to 4 do
+        val v = lookup.getOrAddEntry(s"v$i")
+        v.getId should be (i)
+        v.setId should be (i)
+        v.newEntry should be (false)
+        lookup.serials(v.getId) should be (1)
+    }
+
+    "retrieve entries many times, in random order" in {
+      val lookup = EncoderLookup(50, true)
+      for i <- 1 to 50 do
+        lookup.getOrAddEntry(s"v$i")
+      for _ <- 1 to 20 do
+        for i <- Random.shuffle(1 to 50) do
+          val v = lookup.getOrAddEntry(s"v$i")
+          v.getId should be (i)
+          v.setId should be (i)
+          v.newEntry should be (false)
+          lookup.serials(v.getId) should be (1)
+    }
+
+    "overwrite existing entries, from oldest to newest" in {
+      val lookup = EncoderLookup(4, true)
+      for i <- 1 to 4 do
+        lookup.getOrAddEntry(s"v$i")
+
+      // The table (capacity 4) is full, so the fifth key takes over ID 1
+      // and the serial of that slot becomes 2.
+      val v = lookup.getOrAddEntry("v5")
+      v.getId should be (1)
+      v.setId should be (1)
+      v.newEntry should be (true)
+      lookup.serials(v.getId) should be (2)
+
+      for i <- 6 to 8 do
+        val v = lookup.getOrAddEntry(s"v$i")
+        v.getId should be (i - 4)
+        v.setId should be (0)
+        v.newEntry should be (true)
+        lookup.serials(v.getId) should be (2)
+    }
+
+    "overwrite existing entries in order, many times" in {
+      val lookup = EncoderLookup(17, true)
+      for i <- 1 to 17 do
+        lookup.getOrAddEntry(s"v$i")
+
+      // Round k replaces all 17 entries in ID order; each slot's serial is then expected to be k.
+      for k <- 2 to 23 do
+        val v = lookup.getOrAddEntry(s"v1 $k")
+        v.getId should be (1)
+        v.setId should be (1)
+        v.newEntry should be (true)
+        lookup.serials(v.getId) should be (k)
+        for i <- 2 to 17 do
+          val v = lookup.getOrAddEntry(s"v$i $k")
+          v.getId should be (i)
+          v.setId should be (0)
+          v.newEntry should be (true)
+          lookup.serials(v.getId) should be (k)
+    }
+
+    "pass random stress test (1)" in {
+      val lookup = EncoderLookup(100, true)
+      val frequentSet = (1 to 10).map(i => s"v$i")
+      frequentSet.foreach(lookup.getOrAddEntry)
+
+      for i <- 1 to 50 do
+        for fIndex <- 1 to 10 do
+          val v = lookup.getOrAddEntry(frequentSet(fIndex - 1))
+          v.getId should be (fIndex)
+          v.setId 
should be (fIndex) + v.newEntry should be (false) + lookup.serials(v.getId) should be (1) + + for _ <- 1 to 80 do + val v = lookup.getOrAddEntry(s"r${Random.nextInt(200) + 1}") + v.getId should be > 10 + if v.setId != 0 then + v.setId should be > 10 + } + + "pass random stress test (2)" in { + val lookup = EncoderLookup(113, true) + for i <- 1 to 20 do + lookup.getOrAddEntry(s"v$i") + for _ <- 1 to 1000 do + val id = Random.nextInt(20) + 1 + val v = lookup.getOrAddEntry(s"v$id") + v.getId should be (id) + if v.setId != 0 then + v.setId should be (id) + v.newEntry should be (false) + else + v.newEntry should be (true) + lookup.serials(v.getId) should be (1) + } + + "pass random stress test (3)" in { + val lookup = EncoderLookup(1023, true) + for _ <- 1 to 100_000 do + val v = lookup.getOrAddEntry(s"v${Random.nextInt(10_000) + 1}") + v.getId should be > 0 + } + + "not use the serials table if not needed" in { + val lookup = EncoderLookup(16, false) + for _ <- 1 to 2000 do + val v = lookup.getOrAddEntry(s"v${Random.nextInt(1000) + 1}") + v.getId should be > 0 + lookup.serials should be (null) + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NameDecoderSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NameDecoderSpec.scala new file mode 100644 index 000000000..01c779db9 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NameDecoderSpec.scala @@ -0,0 +1,172 @@ +package eu.neverblink.jelly.core.internal + +import eu.neverblink.jelly.core.RdfProtoDeserializationError +import eu.neverblink.jelly.core.proto.v1.* +import eu.neverblink.jelly.core.helpers.RdfAdapter.* +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +/** + * Unit tests for NameDecoderImpl, exercised with a 16-entry name table and an 8-entry prefix table. + */ +class NameDecoderSpec extends AnyWordSpec, Matchers: + var smallOptions: RdfStreamOptions = RdfStreamOptions.newBuilder() + .setMaxNameTableSize(16) + .setMaxPrefixTableSize(8) + .build() + + // Builds a NameDecoderImpl from the prefix and name table sizes in `opt`; `identity` is passed as the last argument, and the tests compare decoded IRIs against plain strings. + def makeDecoder(opt: RdfStreamOptions) = + 
NameDecoderImpl(opt.getMaxPrefixTableSize(), opt.getMaxNameTableSize(), identity) + + "A NameDecoder" when { + "empty" should { + "throw NullPointerException when trying to retrieve a non-existent IRI" in { + val dec = makeDecoder(smallOptions) + intercept[NullPointerException] { + dec.decode(3, 5) + } + } + + "throw exception when trying to retrieve a non-existent IRI with no prefix" in { + val dec = makeDecoder(smallOptions) + val error = intercept[RdfProtoDeserializationError] { + dec.decode(0, 5) + } + error.getMessage should include ("No prefix, Name ID: 5") + } + + "throw exception when trying to retrieve a name with empty LUT" in { + val dec = makeDecoder(smallOptions) + val error = intercept[RdfProtoDeserializationError] { + dec.decode(0, 0) + } + error.getMessage should include ("No prefix, Name ID: 0") + } + + "return empty string for no prefix and empty name" in { + val dec = makeDecoder(smallOptions) + dec.updateNames(rdfNameEntry(0, "")) + dec.decode(0, 0) should be ("") + } + + "accept new prefixes with default IDs" in { + val dec = makeDecoder(smallOptions) + dec.updatePrefixes(rdfPrefixEntry(0, "https://test.org/")) + dec.updatePrefixes(rdfPrefixEntry(0, "https://test.org/2/")) + dec.updateNames(rdfNameEntry(0, "")) + dec.updateNames(rdfNameEntry(0, "")) + dec.decode(1, 0) should be("https://test.org/") + dec.decode(2, 0) should be("https://test.org/2/") + } + + "accept a new prefix with default ID after explicitly numbered prefix" in { + val dec = makeDecoder(smallOptions) + dec.updatePrefixes(rdfPrefixEntry(4, "https://test.org/")) + // This ID will resolve to 5 + dec.updatePrefixes(rdfPrefixEntry(0, "https://test.org/2/")) + dec.updateNames(rdfNameEntry(0, "")) + dec.updateNames(rdfNameEntry(0, "")) + dec.decode(4, 0) should be("https://test.org/") + dec.decode(5, 0) should be("https://test.org/2/") + } + + "accept a new prefix and return it (IRI with no name part)" in { + val dec = makeDecoder(smallOptions) + dec.updatePrefixes(rdfPrefixEntry(3, 
"https://test.org/")) + dec.updateNames(rdfNameEntry(0, "")) + dec.decode(3, 0) should be ("https://test.org/") + } + + "accept a new name and return it (IRI with no prefix)" in { + val dec = makeDecoder(smallOptions) + dec.updateNames(rdfNameEntry(5, "Cake")) + dec.decode(0, 5) should be ("Cake") + } + + "override an earlier name entry and decode the IRI (IRI with no prefix)" in { + val dec = makeDecoder(smallOptions) + dec.updateNames(rdfNameEntry(5, "Cake")) + dec.decode(0, 5) should be("Cake") + dec.updateNames(rdfNameEntry(5, "Pie")) + dec.decode(0, 5) should be("Pie") + } + + "accept a new name and prefix and return them" in { + val dec = makeDecoder(smallOptions) + // Test prefix & name on the edge of the lookup + dec.updatePrefixes(rdfPrefixEntry(8, "https://test.org/")) + dec.updateNames(rdfNameEntry(16, "Cake")) + dec.decode(8, 16) should be ("https://test.org/Cake") + } + + "override an earlier name entry and decode the IRI (with prefix)" in { + val dec = makeDecoder(smallOptions) + dec.updatePrefixes(rdfPrefixEntry(8, "https://test.org/")) + dec.updateNames(rdfNameEntry(16, "Cake")) + dec.decode(8, 16) should be("https://test.org/Cake") + dec.updateNames(rdfNameEntry(16, "Pie")) + dec.decode(8, 16) should be("https://test.org/Pie") + } + + "not accept a new prefix ID larger than table size" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updatePrefixes(rdfPrefixEntry(9, "https://test.org/")) + } + } + + "not accept a new prefix ID lower than 0 (-1)" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updatePrefixes(rdfPrefixEntry(-1, "https://test.org/")) + } + } + + "not accept a new prefix ID lower than 0 (-2)" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updatePrefixes(rdfPrefixEntry(-2, "https://test.org/")) + } + } + + "not retrieve a prefix ID larger than table size" in { + val dec = makeDecoder(smallOptions) + 
intercept[RdfProtoDeserializationError] { + dec.decode(9, 0) + } + } + + "not accept a new name ID larger than table size" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updateNames(rdfNameEntry(17, "Cake")) + } + } + + "not accept a default ID going beyond the table size" in { + val dec = makeDecoder(smallOptions) + dec.updateNames(rdfNameEntry(16, "Cake")) + intercept[RdfProtoDeserializationError] { + dec.updateNames(rdfNameEntry(0, "Cake 2")) + } + } + + "not accept a new name ID lower than 0 (-1)" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updateNames(rdfNameEntry(-1, "Cake")) + } + } + + "not accept a new name ID lower than 0 (-2)" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.updateNames(rdfNameEntry(-2, "Cake")) + } + } + + "not retrieve a name ID larger than table size" in { + val dec = makeDecoder(smallOptions) + intercept[RdfProtoDeserializationError] { + dec.decode(0, 17) + } + } + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NodeEncoderSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NodeEncoderSpec.scala new file mode 100644 index 000000000..1ac9b3d8b --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/NodeEncoderSpec.scala @@ -0,0 +1,452 @@ +package eu.neverblink.jelly.core.internal + +import eu.neverblink.jelly.core.{JellyOptions, RdfProtoSerializationError, RowBufferAppender} +import eu.neverblink.jelly.core.helpers.Mrl +import eu.neverblink.jelly.core.helpers.RdfAdapter.* +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.Inspectors +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +import scala.collection.mutable.ListBuffer +import scala.util.Random + +/** + * Unit tests for NodeEncoderImpl: encoding of datatype literals and IRIs, checking both the IDs on the returned terms and the lookup entry rows appended to the buffer. + */ +class NodeEncoderSpec extends AnyWordSpec, Inspectors, Matchers: + def smallOptions(prefixTableSize: Int): 
RdfStreamOptions = rdfStreamOptions( + maxNameTableSize = 4, + maxPrefixTableSize = prefixTableSize, + maxDatatypeTableSize = 8, + ) + + // Creates an encoder plus the buffer that records every name, prefix and datatype entry it appends. + // NOTE(review): the numeric constructor arguments look like table sizes (prefix, name = 4, datatype = 8) followed by three 16-entry cache sizes -- confirm against NodeEncoderImpl. + private def getEncoder(prefixTableSize: Int = 8): (NodeEncoderImpl[Mrl.Node], ListBuffer[RdfStreamRow]) = + val buffer = new ListBuffer[RdfStreamRow]() + val appender: RowBufferAppender = new RowBufferAppender { + def appendNameEntry(entry: RdfNameEntry): Unit = buffer += rdfStreamRow(entry) + def appendPrefixEntry(entry: RdfPrefixEntry): Unit = buffer += rdfStreamRow(entry) + def appendDatatypeEntry(entry: RdfDatatypeEntry): Unit = buffer += rdfStreamRow(entry) + } + (NodeEncoderImpl[Mrl.Node]( + prefixTableSize, 4, 8, + 16, 16, 16, + appender + ), buffer) + + "A NodeEncoder" when { + "encoding datatype literals" should { + "encode a datatype literal" in { + val (encoder, buffer) = getEncoder() + val node = encoder.makeDtLiteral(Mrl.DtLiteral("v1", Mrl.Datatype("dt1")), "v1", "dt1") + + node.lex should be ("v1") + node.datatype should be (1) + + buffer.size should be (1) + buffer.head.hasDatatype should be (true) + val dtEntry = buffer.head.getDatatype + dtEntry.getValue should be ("dt1") + dtEntry.getId should be (0) + } + + "encode multiple datatype literals and reuse existing datatypes" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 4 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i" + ) + node.lex should be (s"v$i") + node.datatype should be (i) + + // "dt3" datatype should be reused + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v1000", Mrl.Datatype(s"dt3")), + "v1000", "dt3", + ) + node.lex should be ("v1000") + node.datatype should be (3) + + // "v2"^^ should be reused + val node2 = encoder.makeDtLiteral( + Mrl.DtLiteral("v2", Mrl.Datatype("dt2")), + "v2", "dt2", + ) + node2.lex should be ("v2") + node2.datatype should be (2) + + buffer.size should be (4) + buffer.map(_.getDatatype) should contain only ( + rdfDatatypeEntry(0, "dt1"), + 
rdfDatatypeEntry(0, "dt2"), + rdfDatatypeEntry(0, "dt3"), + rdfDatatypeEntry(0, "dt4"), + ) + } + + "not evict datatype IRIs used recently" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 8 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be(s"v$i") + node.datatype should be(i) + + // use literal 1 again + val node = encoder.makeDtLiteral( + Mrl.DtLiteral("v1", Mrl.Datatype("dt1")), + "v1", "dt1", + ) + node.lex should be("v1") + node.datatype should be(1) + + // now add a new DT and see which DT is evicted + val node2 = encoder.makeDtLiteral( + Mrl.DtLiteral("v9", Mrl.Datatype("dt9")), + "v9", "dt9", + ) + node2.lex should be("v9") + node2.datatype should be(2) + } + + "encode datatype literals while evicting old datatypes" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 12 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + // first 4 datatypes should be evicted + node.lex should be (s"v$i") + node.datatype should be ((i - 1) % 8 + 1) + + for i <- 9 to 12 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be (i - 8) + + for i <- 5 to 8 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be (i) + + // 5–8 were used last, so they should be evicted last + for i <- 13 to 16 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be (i - 12) // 1–4 + + buffer.size should be (16) + val expectedIds = Array.from( + Iterable.fill(8)(0) ++ Seq(1) ++ Iterable.fill(3)(0) ++ Seq(1) ++ Iterable.fill(3)(0) + ) + for (r, i) <- buffer.zipWithIndex do + val dt = r.getDatatype + dt.getId should be 
(expectedIds(i)) + dt.getValue should be (s"dt${i + 1}") + } + + "reuse already encoded literals, evicting old ones" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 4; j <- 1 to 4 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$j")), + s"v$i", s"dt$j", + ) + node.lex should be (s"v$i") + node.datatype should be (j) + + for _ <- 1 to 10 do + for i <- Random.shuffle(1 to 4); j <- Random.shuffle(1 to 4) do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$j")), + s"v$i", s"dt$j", + ) + node.lex should be (s"v$i") + node.datatype should be (j) + + // Add more literals to evict the old ones + for j <- 101 to 104 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v100", Mrl.Datatype(s"dt${j - 100}")), + s"v100", s"dt${j - 100}", + ) + node.lex should be ("v100") + node.datatype should be (j - 100) + + // These entries should have been evicted + for j <- 1 to 4 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v1", Mrl.Datatype(s"dt$j")), + s"v1", s"dt$j", + ) + node.lex should be ("v1") + node.datatype should be (j) + } + + "invalidate cached datatype literals when their datatypes are evicted" in { + val (encoder, buffer) = getEncoder() + for i <- 1 to 4 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be (i) + + for i <- 5 to 12 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be ((i - 1) % 8 + 1) + + for i <- 1 to 4 do + val node = encoder.makeDtLiteral( + Mrl.DtLiteral(s"v$i", Mrl.Datatype(s"dt$i")), + s"v$i", s"dt$i", + ) + node.lex should be (s"v$i") + node.datatype should be (i + 4) + } + + "throw exception if datatype table size = 0" in { + val encoder = NodeEncoderImpl[Mrl.Node]( + 16, 16, 0, 16, 16, 16, null + ) + val e = 
intercept[RdfProtoSerializationError] { + encoder.makeDtLiteral( + Mrl.DtLiteral("v1", Mrl.Datatype("dt1")), + "v1", "dt1", + ) + } + e.getMessage should include ("Datatype literals cannot be encoded when the datatype table") + } + } + + "encoding IRIs" should { + "add a full IRI" in { + val (encoder, buffer) = getEncoder() + val iri = encoder.makeIri("https://test.org/Cake") + iri.nameId should be (0) + iri.prefixId should be (1) + + buffer.size should be (2) + buffer should contain (rdfStreamRow( + rdfPrefixEntry(id = 0, value = "https://test.org/") + )) + buffer should contain (rdfStreamRow( + rdfNameEntry(id = 0, value = "Cake") + )) + } + + "add a prefix-only IRI" in { + val (encoder, buffer) = getEncoder() + val iri = encoder.makeIri("https://test.org/test/") + iri.nameId should be (0) + iri.prefixId should be (1) + + // an empty name entry still has to be allocated + buffer.size should be (2) + buffer should contain (rdfStreamRow( + rdfPrefixEntry(id = 0, value = "https://test.org/test/") + )) + buffer should contain(rdfStreamRow( + rdfNameEntry(id = 0, value = "") + )) + } + + "add a name-only IRI" in { + val (encoder, buffer) = getEncoder() + val iri = encoder.makeIri("testTestTest") + iri.nameId should be (0) + iri.prefixId should be (1) + + // in the mode with the prefix table enabled, an empty prefix entry still has to be allocated + buffer.size should be (2) + buffer should contain (rdfStreamRow( + rdfPrefixEntry(id = 0, value = "") + )) + buffer should contain (rdfStreamRow( + rdfNameEntry(id = 0, value = "testTestTest") + )) + } + + "add a full IRI in no-prefix table mode" in { + val (encoder, buffer) = getEncoder(0) + val iri = encoder.makeIri("https://test.org/Cake") + iri.nameId should be (0) + iri.prefixId should be (0) + + // in the no prefix mode, there must be no prefix entries + buffer.size should be (1) + buffer should contain (rdfStreamRow( + rdfNameEntry(id = 0, value = "https://test.org/Cake") + )) + } + + "add IRIs while evicting old 
ones" in { + val (encoder, buffer) = getEncoder(3) + val data = Seq( + // IRI, expected prefix ID, expected name ID + ("https://test.org/Cake1", 1, 0), + ("https://test.org/Cake1", 0, 1), + ("https://test.org/Cake1", 0, 1), + ("https://test.org#Cake1", 2, 1), + ("https://test.org/test/Cake1", 3, 1), + ("https://test.org/Cake2", 1, 0), + ("https://test.org#Cake2", 2, 2), + ("https://test.org/other/Cake1", 3, 1), + ("https://test.org/other/Cake2", 0, 0), + ("https://test.org/other/Cake3", 0, 0), + ("https://test.org/other/Cake4", 0, 0), + ("https://test.org/other/Cake1", 0, 1), + ("https://test.org/other/Cake2", 0, 0), + ("https://test.org/other/Cake3", 0, 0), + ("https://test.org/other/Cake4", 0, 0), + ("https://test.org/other/Cake5", 0, 1), + ("https://test.org/other/Cake5", 0, 1), + ("https://test.org#Cake2", 2, 0), + ("https://test.org#Cake5", 0, 1), + // prefix "" evicts the previous number #1 + ("Cake2", 1, 0), + ) + + for (sIri, ePrefix, eName) <- data do + val iri = encoder.makeIri(sIri) + iri.prefixId should be (ePrefix) + iri.nameId should be (eName) + + val expectedBuffer = Seq( + // Prefix? 
(name otherwise), ID, value + (true, 0, "https://test.org/"), + (false, 0, "Cake1"), + (true, 0, "https://test.org#"), + (true, 0, "https://test.org/test/"), + (false, 0, "Cake2"), + (true, 3, "https://test.org/other/"), + (false, 0, "Cake3"), + (false, 0, "Cake4"), + (false, 1, "Cake5"), + (true, 1, ""), + ) + + buffer.size should be (expectedBuffer.size) + for ((isPrefix, eId, eVal), row) <- expectedBuffer.zip(buffer) do + if isPrefix then + row.hasPrefix should be (true) + val prefix = row.getPrefix + prefix.getId should be (eId) + prefix.getValue should be (eVal) + else + row.hasName should be (true) + val name = row.getName + name.getId should be (eId) + name.getValue should be (eVal) + } + + "add IRIs while evicting old ones (2: detecting invalidated prefix entries)" in { + val (encoder, buffer) = getEncoder(3) + val data = Seq( + // IRI, expected prefix ID, expected name ID + ("https://test.org/1/Cake1", 1, 0), + ("https://test.org/2/Cake1", 2, 1), + ("https://test.org/3/Cake1", 3, 1), + ("https://test.org/3/Cake2", 0, 0), + // Evict the /1/ prefix + ("https://test.org/4/Cake2", 1, 2), + // Try to get the first IRI + ("https://test.org/1/Cake1", 2, 1), + ) + + for (sIri, ePrefix, eName) <- data do + val iri = encoder.makeIri(sIri) + iri.prefixId should be(ePrefix) + iri.nameId should be(eName) + + val expectedBuffer = Seq( + // Prefix? 
(name otherwise), ID, value + (true, 0, "https://test.org/1/"), + (false, 0, "Cake1"), + (true, 0, "https://test.org/2/"), + (true, 0, "https://test.org/3/"), + (false, 0, "Cake2"), + (true, 1, "https://test.org/4/"), + (true, 0, "https://test.org/1/"), + ) + + buffer.size should be(expectedBuffer.size) + for ((isPrefix, eId, eVal), row) <- expectedBuffer.zip(buffer) do + if isPrefix then + row.hasPrefix should be (true) + val prefix = row.getPrefix + prefix.getId should be(eId) + prefix.getValue should be(eVal) + else + row.hasName should be (true) + val name = row.getName + name.getId should be(eId) + name.getValue should be(eVal) + } + + "not evict IRI prefixes used recently" in { + val (encoder, buffer) = getEncoder(3) + val data = Seq( + // IRI, expected prefix ID, expected name ID + ("https://test.org/1/Cake1", 1, 0), + ("https://test.org/2/Cake2", 2, 0), + ("https://test.org/3/Cake3", 3, 0), + ("https://test.org/3/Cake3", 0, 3), + ("https://test.org/2/Cake2", 2, 2), + ("https://test.org/1/Cake1", 1, 1), + // Evict something -- this must not be /1/ because it was used last + // this tests if .onAccess() is called correctly + ("https://test.org/4/Cake4", 3, 4), + ) + + for (sIri, ePrefix, eName) <- data do + val iri = encoder.makeIri(sIri) + iri.prefixId should be(ePrefix) + iri.nameId should be(eName) + } + + "add IRIs while evicting old ones, without a prefix table" in { + val (encoder, buffer) = getEncoder(0) + val data = Seq( + // IRI, expected name ID + ("https://test.org/Cake1", 0), + ("https://test.org/Cake1", 1), + ("https://test.org/Cake1", 1), + ("https://test.org#Cake1", 0), + ("https://test.org/test/Cake1", 0), + ("https://test.org/Cake2", 0), + ("https://test.org#Cake2", 1), + ("https://test.org/other/Cake1", 0), + ("https://test.org/other/Cake2", 0), + ("https://test.org/other/Cake3", 0), + ("https://test.org/other/Cake1", 2), + ("https://test.org/other/Cake2", 0), + ("https://test.org/other/Cake3", 0), + ("https://test.org/other/Cake4", 1), + 
("https://test.org/other/Cake5", 0), + ("https://test.org/other/Cake5", 2), + ("https://test.org/other/Cake3", 4), + ) + + for (sIri, eName) <- data do + val iri = encoder.makeIri(sIri) + iri.prefixId should be(0) + iri.nameId should be(eName) + } + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/internal/TranscoderLookupSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/TranscoderLookupSpec.scala new file mode 100644 index 000000000..414579d26 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/internal/TranscoderLookupSpec.scala @@ -0,0 +1,220 @@ +package eu.neverblink.jelly.core.internal + +import eu.neverblink.jelly.core.RdfProtoTranscodingError +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +/** + * Unit tests for the TranscoderLookup class. + */ +class TranscoderLookupSpec extends AnyWordSpec, Matchers: + + "TranscoderLookup" should { + "throw an exception when trying to set input lookup size greater than the output" in { + val tl = TranscoderLookup(false, 100) + val ex = intercept[RdfProtoTranscodingError] { + tl.newInputStream(120) + } + ex.getMessage should include ("Input lookup size cannot be greater than the output lookup size") + } + + "remap IDs" when { + "it's a prefix lookup" in { + val tl = TranscoderLookup(false, 120) + tl.newInputStream(100) + tl.addEntry(80, "s80").getId shouldBe 1 + tl.addEntry(81, "s81").getId shouldBe 2 + + tl.remap(80) shouldBe 1 + tl.remap(0) shouldBe 0 + tl.remap(0) shouldBe 0 + tl.remap(81) shouldBe 2 + tl.remap(80) shouldBe 1 + tl.remap(81) shouldBe 2 + tl.remap(0) shouldBe 0 + } + + "it's a name lookup" in { + val tl = TranscoderLookup(true, 100) + tl.newInputStream(100) + tl.addEntry(80, "s80").getId shouldBe 1 + tl.addEntry(81, "s81").getId shouldBe 2 + tl.addEntry(82, "s82").getId shouldBe 3 + tl.addEntry(83, "s83").getId shouldBe 4 + + tl.remap(80) shouldBe 0 + tl.remap(80) shouldBe 1 + tl.remap(80) shouldBe 1 + 
tl.remap(81) shouldBe 0 + tl.remap(82) shouldBe 0 + tl.remap(82) shouldBe 3 + tl.remap(83) shouldBe 0 + + // and with 0 in the input + tl.remap(80) shouldBe 1 + tl.remap(0) shouldBe 0 + tl.remap(0) shouldBe 0 + tl.remap(80) shouldBe 1 + } + } + + "remap IDs evicting old entries" when { + "it's a prefix lookup" in { + val tl = TranscoderLookup(false, 10) + tl.newInputStream(5) + for i <- 0 to 4 do + tl.addEntry(i + 1, s"s$i").getId shouldBe i + 1 + tl.remap(i + 1) shouldBe i + 1 + for i <- 5 to 50 do + // Later all ids will be remapped to 6–10 because the transcoder will evict the same entry as the input. + tl.addEntry((i % 5) + 1, s"s$i").getId shouldBe (i % 5) + 6 + tl.remap((i % 5) + 1) shouldBe (i % 5) + 6 + } + + "it's a name lookup" in { + val tl = TranscoderLookup(true, 10) + tl.newInputStream(5) + for i <- 0 to 50 do + val getId = tl.addEntry((i % 5) + 1, s"s$i").getId + if i < 5 then getId shouldBe i + 1 + else getId shouldBe (i % 5) + 6 + if (i % 5) != 0 || i < 10 then + tl.remap((i % 5) + 1) shouldBe 0 + else + tl.remap((i % 5) + 1) shouldBe (i % 5) + 6 + } + } + + "decode 0-encoding in lookup entries in the input stream" when { + "it's a prefix lookup" in { + val tl = TranscoderLookup(false, 10) + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + tl.remap(1) shouldBe 1 + + tl.addEntry(1, "s1_2") + tl.remap(1) shouldBe 4 + tl.remap(2) shouldBe 2 + tl.remap(3) shouldBe 3 + tl.remap(0) shouldBe 0 + + // Recover an entry + tl.addEntry(5, "s1_1") + tl.remap(5) shouldBe 1 + tl.remap(0) shouldBe 0 + } + + "it's a name lookup" in { + val tl = TranscoderLookup(true, 10) + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + tl.remap(1) shouldBe 0 + + tl.addEntry(1, "s1_2") + tl.remap(1) shouldBe 4 + tl.remap(0) shouldBe 2 + tl.remap(0) shouldBe 0 + + // Recover an entry + tl.addEntry(5, "s1_1") + tl.remap(5) shouldBe 1 + tl.remap(2) shouldBe 0 + } + } + + "handle multiple 
input streams" when { + "it's a prefix lookup" in { + val tl = TranscoderLookup(false, 10) + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + tl.remap(2) shouldBe 2 + + tl.newInputStream(5) + tl.addEntry(0, "s1_2") + tl.addEntry(0, "s2_2") + tl.addEntry(0, "s3_2") + tl.remap(1) shouldBe 4 + tl.remap(2) shouldBe 5 + tl.remap(3) shouldBe 6 + + tl.newInputStream(5) + tl.addEntry(0, "s1_3") + tl.addEntry(0, "s2_3") + tl.addEntry(0, "s3_3") + tl.remap(1) shouldBe 7 + tl.remap(2) shouldBe 8 + tl.remap(3) shouldBe 9 + + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_2") + tl.addEntry(0, "s3_3") + tl.remap(1) shouldBe 1 + tl.remap(2) shouldBe 5 + tl.remap(3) shouldBe 9 + } + + "it's a name lookup" in { + val tl = TranscoderLookup(true, 10) + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + tl.remap(2) shouldBe 2 + tl.remap(0) shouldBe 0 + + tl.newInputStream(5) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + tl.remap(0) shouldBe 1 + tl.remap(0) shouldBe 0 + tl.remap(0) shouldBe 0 + + tl.newInputStream(5) + tl.addEntry(0, "s1_2") + tl.addEntry(0, "s2_2") + tl.addEntry(0, "s3_2") + tl.remap(0) shouldBe 0 // last was 3, this is 4, so it's 0 + tl.remap(3) shouldBe 6 + tl.remap(1) shouldBe 4 + tl.remap(0) shouldBe 0 + tl.remap(0) shouldBe 0 + } + } + + "resize the internal remapping table" in { + val tl = TranscoderLookup(false, 100) + + for i <- 1 to 10 do + val size = i * 4 + tl.newInputStream(size) + for j <- 1 to size do + tl.addEntry(j, s"s$j").getId shouldBe j + tl.remap(j) + } + + "evict the corresponding element if the input stream is evicting something" in { + val tl = TranscoderLookup(false, 3) + tl.newInputStream(3) + tl.addEntry(0, "s1_1") + tl.addEntry(0, "s2_1") + tl.addEntry(0, "s3_1") + + tl.newInputStream(3) + tl.addEntry(0, "s1_1").newEntry should be (false) + + // Even though this entry was just used, we are 
evicting it because our input stream does that + val e = tl.addEntry(1, "something else") + e.newEntry should be (true) + e.setId should be (1) + e.getId should be (1) + } + } + diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/utils/IoUtilsSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/utils/IoUtilsSpec.scala new file mode 100644 index 000000000..f57fa4ef4 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/utils/IoUtilsSpec.scala @@ -0,0 +1,128 @@ +package eu.neverblink.jelly.core.utils + +import eu.neverblink.jelly.core.helpers.RdfAdapter.* +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} + +class IoUtilsSpec extends AnyWordSpec, Matchers: + private val frameLarge = rdfStreamFrame(Seq( + rdfStreamRow( + rdfNameEntry(1, "name name name name") + ) + )) + private val frameSize10 = rdfStreamFrame(Seq( + rdfStreamRow( + rdfNameEntry(0, "name") + ) + )) + private val frameOptionsSize10 = rdfStreamFrame(Seq( + rdfStreamRow( + rdfStreamOptions(streamName = "name12") + ) + )) + + "IoUtils" should { + "autodetectDelimiting" when { + "input stream is a non-delimited Jelly message (size >10)" in { + val bytes = frameLarge.toByteArray + bytes(0) shouldBe 0x0A + bytes(1) should not be 0x0A + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe false + response.newInput.readAllBytes() shouldBe bytes + } + + "input stream is a delimited Jelly message (size >10)" in { + val os = ByteArrayOutputStream() + frameLarge.writeDelimitedTo(os) + val bytes = os.toByteArray + bytes(0) should not be 0x0A + bytes(1) shouldBe 0x0A + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe true + response.newInput.readAllBytes() shouldBe bytes + } + + "input 
stream is a non-delimited Jelly message (size=10)" in { + val bytes = frameSize10.toByteArray + bytes.size shouldBe 10 + bytes(0) shouldBe 0x0A + bytes(1) should not be 0x0A + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe false + response.newInput.readAllBytes() shouldBe bytes + } + + "input stream is a delimited Jelly message (size=10)" in { + val os = ByteArrayOutputStream() + frameSize10.writeDelimitedTo(os) + val bytes = os.toByteArray + bytes.size shouldBe 11 + bytes(0) shouldBe 0x0A + bytes(1) shouldBe 0x0A + bytes(2) should not be 0x0A + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe true + response.newInput.readAllBytes() shouldBe bytes + } + + "input stream is a non-delimited Jelly message (options size =10)" in { + val os = ByteArrayOutputStream() + frameOptionsSize10.getRows(0).writeTo(os) + val bytes = os.toByteArray + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe false + response.newInput.readAllBytes() shouldBe bytes + } + + "input stream is a delimited Jelly message (options size =10)" in { + val os = ByteArrayOutputStream() + frameOptionsSize10.writeDelimitedTo(os) + val bytes = os.toByteArray + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe true + response.newInput.readAllBytes() shouldBe bytes + } + + "input stream is empty" in { + val in = new ByteArrayInputStream(Array.emptyByteArray) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe false + response.newInput.readAllBytes() shouldBe Array.emptyByteArray + } + + "input stream has only 2 bytes" in { + // some messed-up data + val in = new ByteArrayInputStream(Array[Byte](0x12, 0x34)) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited 
shouldBe false + response.newInput.readAllBytes() shouldBe Array[Byte](0x12, 0x34) + } + } + + "writeFrameAsDelimited" in { + val os = ByteArrayOutputStream() + IoUtils.writeFrameAsDelimited(frameLarge.toByteArray, os) + val bytes = os.toByteArray + + val in = new ByteArrayInputStream(bytes) + val response = IoUtils.autodetectDelimiting(in) + response.isDelimited shouldBe true + RdfStreamFrame.parseDelimitedFrom(response.newInput) shouldBe frameLarge + } + } diff --git a/core-java/src/test/scala/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtilsSpec.scala b/core-java/src/test/scala/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtilsSpec.scala new file mode 100644 index 000000000..daf49f5d0 --- /dev/null +++ b/core-java/src/test/scala/eu/neverblink/jelly/core/utils/LogicalStreamTypeUtilsSpec.scala @@ -0,0 +1,138 @@ +package eu.neverblink.jelly.core.utils + +import eu.neverblink.jelly.core.helpers.Assertions.* +import eu.neverblink.jelly.core.helpers.MockConverterFactory +import eu.neverblink.jelly.core.helpers.Mrl.* +import eu.neverblink.jelly.core.proto.v1.* +import org.scalatest.matchers.should.Matchers +import org.scalatest.wordspec.AnyWordSpec + +import scala.language.postfixOps + +class LogicalStreamTypeUtilsSpec extends AnyWordSpec, Matchers: + private val validStreamTypes = LogicalStreamType.values + .filter(_ != LogicalStreamType.UNRECOGNIZED) + .filter(_.getNumber > 0) + + given MockConverterFactory.type = MockConverterFactory + + "toBaseType" should { + for streamType <- validStreamTypes do + s"return base type for $streamType" in { + val baseValue = LogicalStreamTypeUtils.toBaseType(streamType) + baseValue.getNumber should be > 0 + baseValue.getNumber should be < 10 + + streamType match + case LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES => LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_TRIPLES + case LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_QUADS => LogicalStreamType.LOGICAL_STREAM_TYPE_FLAT_QUADS + case 
LogicalStreamType.LOGICAL_STREAM_TYPE_GRAPHS => LogicalStreamType.LOGICAL_STREAM_TYPE_GRAPHS + case LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS => LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS + case LogicalStreamType.LOGICAL_STREAM_TYPE_SUBJECT_GRAPHS => LogicalStreamType.LOGICAL_STREAM_TYPE_GRAPHS + case LogicalStreamType.LOGICAL_STREAM_TYPE_NAMED_GRAPHS => LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS + case LogicalStreamType.LOGICAL_STREAM_TYPE_TIMESTAMPED_NAMED_GRAPHS => LogicalStreamType.LOGICAL_STREAM_TYPE_DATASETS + case _ => fail(s"Unrecognized stream type: $streamType") + } + } + + "isEqualOrSubtypeOf" should { + for streamType <- validStreamTypes do + val baseValue = LogicalStreamTypeUtils.toBaseType(streamType) + + s"return true for $streamType and itself" in { + LogicalStreamTypeUtils.isEqualOrSubtypeOf(streamType, streamType) shouldBe true + } + + s"return true for $streamType and its base type" in { + LogicalStreamTypeUtils.isEqualOrSubtypeOf(streamType, baseValue) shouldBe true + } + + if baseValue != streamType then + s"return false for ${baseValue} and $streamType" in { + LogicalStreamTypeUtils.isEqualOrSubtypeOf(baseValue, streamType) shouldBe false + } + + s"return false for $streamType and an undefined type" in { + LogicalStreamTypeUtils.isEqualOrSubtypeOf(streamType, LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED) shouldBe false + } + + s"return false for an undefined type and $streamType" in { + LogicalStreamTypeUtils.isEqualOrSubtypeOf(LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED, streamType) shouldBe false + } + } + + "getRdfStaxType" should { + for streamType <- validStreamTypes do + s"return RDF STaX type for $streamType" in { + val t = LogicalStreamTypeUtils.getRdfStaxType(streamType) + t should not be None + t should startWith ("https://w3id.org/stax/ontology#") + } + + s"return a type that can be parsed by LogicalStreamTypeFactory for $streamType" in { + val t = LogicalStreamTypeUtils.getRdfStaxType(streamType) + val 
newType = LogicalStreamTypeUtils.fromOntologyIri(t) + newType should be (streamType) + } + + "not return RDF STaX type for UNSPECIFIED" in { + LogicalStreamTypeUtils.getRdfStaxType(LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED) should be (null) + } + } + + "getRdfStaxAnnotation" should { + val subjectNodes = Seq( + Iri("https://example.org/stream"), + BlankNode("stream"), + null, + ) + + for + streamType <- validStreamTypes + subjectNode <- subjectNodes + do + s"return RDF STaX annotation for $streamType and $subjectNode" in { + val decoder = MockConverterFactory.decoderConverter + val a = LogicalStreamTypeUtils.getRdfStaxAnnotation( + decoder, + { (s, p, o) => Triple(s, p, o) }, + streamType, + subjectNode + ) + a.size should be (3) + + val a0Triple = a.get(0) + + a0Triple.s should be (subjectNode) + a0Triple.p should be (Iri("https://w3id.org/stax/ontology#hasStreamTypeUsage")) + + val a2Triple = a.get(2) + + a2Triple.o should be (Iri(LogicalStreamTypeUtils.getRdfStaxType(streamType))) + } + + for subjectNode <- subjectNodes do + s"throw exception for RDF STaX annotation for UNSPECIFIED and $subjectNode" in { + val error = intercept[IllegalArgumentException] { + val decoder = MockConverterFactory.decoderConverter + LogicalStreamTypeUtils.getRdfStaxAnnotation( + decoder, + { (s, p, o) => Triple(s, p, o) }, + LogicalStreamType.LOGICAL_STREAM_TYPE_UNSPECIFIED, + subjectNode + ) + } + error.getMessage should include ("Unsupported logical stream type") + error.getMessage should include ("UNSPECIFIED") + } + } + + "LogicalStreamTypeFactory.fromOntologyIri" should { + "return None for a non-STaX IRI" in { + LogicalStreamTypeUtils.fromOntologyIri("https://example.org/stream") should be (null) + } + + "return None for an invalid STaX IRI" in { + LogicalStreamTypeUtils.fromOntologyIri("https://w3id.org/stax/ontology#doesNotExist") should be (null) + } + } diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/internal/EncoderLookup.java 
b/core/src/main/java/eu/ostrzyciel/jelly/core/internal/EncoderLookup.java index c095bf4ee..2d0488dc0 100644 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/internal/EncoderLookup.java +++ b/core/src/main/java/eu/ostrzyciel/jelly/core/internal/EncoderLookup.java @@ -149,7 +149,7 @@ private final void addEntryEvicting(String key, int id) { * @return The entry. */ public LookupEntry getOrAddEntry(String key) { - var value = map.get(key); + final var value = map.get(key); if (value != null) { // The entry is already in the table, just update the access order onAccess(value.getId); @@ -183,7 +183,7 @@ public LookupEntry getOrAddEntry(String key) { * @return The entry. */ public LookupEntry getOrAddEntryTranscoder(String key, int evictHint) { - var value = map.get(key); + final var value = map.get(key); if (value != null) { onAccess(value.getId); return value; diff --git a/core/src/main/java/eu/ostrzyciel/jelly/core/internal/NodeEncoderImpl.java b/core/src/main/java/eu/ostrzyciel/jelly/core/internal/NodeEncoderImpl.java index 870d8e803..f57c8e4ef 100644 --- a/core/src/main/java/eu/ostrzyciel/jelly/core/internal/NodeEncoderImpl.java +++ b/core/src/main/java/eu/ostrzyciel/jelly/core/internal/NodeEncoderImpl.java @@ -118,7 +118,7 @@ public NodeEncoderImpl( public UniversalTerm makeIri(String iri) { if (maxPrefixTableSize == 0) { // Fast path for no prefixes - var nameEntry = nameLookup.getOrAddEntry(iri); + final var nameEntry = nameLookup.getOrAddEntry(iri); if (nameEntry.newEntry) { bufferAppender.appendNameEntry(new RdfNameEntry(nameEntry.setId, iri)); } @@ -133,7 +133,7 @@ public UniversalTerm makeIri(String iri) { } // Slow path, with splitting out the prefix - var cachedNode = iriNodeCache.computeIfAbsent(iri, k -> new DependentNode()); + final var cachedNode = iriNodeCache.computeIfAbsent(iri, k -> new DependentNode()); // Check if the value is still valid if ( cachedNode.encoded != null && @@ -162,8 +162,8 @@ public UniversalTerm makeIri(String iri) { postfix = 
iri.substring(i + 1); } - var prefixEntry = prefixLookup.getOrAddEntry(prefix); - var nameEntry = nameLookup.getOrAddEntry(postfix); + final var prefixEntry = prefixLookup.getOrAddEntry(prefix); + final var nameEntry = nameLookup.getOrAddEntry(postfix); if (prefixEntry.newEntry) { bufferAppender.appendPrefixEntry(new RdfPrefixEntry(prefixEntry.setId, prefix)); } @@ -211,7 +211,7 @@ public UniversalTerm makeDtLiteral(TNode key, String lex, String datatypeName) { "to a positive value." ); } - var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode()); + final var cachedNode = dtLiteralNodeCache.computeIfAbsent(key, k -> new DependentNode()); // Check if the value is still valid if ( cachedNode.encoded != null && cachedNode.lookupSerial1 == datatypeLookup.serials[cachedNode.lookupPointer1] @@ -221,7 +221,7 @@ public UniversalTerm makeDtLiteral(TNode key, String lex, String datatypeName) { } // The node is not encoded, but we may already have the datatype encoded - var dtEntry = datatypeLookup.getOrAddEntry(datatypeName); + final var dtEntry = datatypeLookup.getOrAddEntry(datatypeName); if (dtEntry.newEntry) { bufferAppender.appendDatatypeEntry(new RdfDatatypeEntry(dtEntry.setId, datatypeName)); } diff --git a/project/plugins.sbt b/project/plugins.sbt index 2171d5ce1..65cd972d9 100644 --- a/project/plugins.sbt +++ b/project/plugins.sbt @@ -2,6 +2,8 @@ addSbtPlugin("com.thesamet" % "sbt-protoc" % "1.0.7") addSbtPlugin("org.apache.pekko" % "pekko-grpc-sbt-plugin" % "1.1.1") addSbtPlugin("com.github.sbt" % "sbt-ci-release" % "1.9.3") addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.3.1") +addSbtPlugin("com.github.sbt" % "sbt-protobuf" % "0.8.1") + addDependencyTreePlugin lazy val scalapbV = "0.11.17" diff --git a/rdf-protos-java/.gitignore b/rdf-protos-java/.gitignore new file mode 100644 index 000000000..9c23f1042 --- /dev/null +++ b/rdf-protos-java/.gitignore @@ -0,0 +1,2 @@ +# Protobuf files +src/main/protobuf/ diff --git 
a/rdf-protos-java/src/main/.gitkeep b/rdf-protos-java/src/main/.gitkeep new file mode 100644 index 000000000..e69de29bb