diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala index 69da8d7..d3147b1 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJelly.scala @@ -5,10 +5,13 @@ import eu.neverblink.jelly.cli.* import eu.neverblink.jelly.cli.command.rdf.util.* import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat.* import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat.Jena.* -import eu.ostrzyciel.jelly.convert.jena.riot.JellyLanguage +import eu.neverblink.jelly.cli.util.args.IndexRange +import eu.ostrzyciel.jelly.convert.jena.JenaConverterFactory import eu.ostrzyciel.jelly.core.proto.v1.RdfStreamFrame +import org.apache.jena.graph.Triple +import org.apache.jena.riot.Lang import org.apache.jena.riot.system.StreamRDFWriter -import org.apache.jena.riot.{Lang, RDFParser} +import org.apache.jena.sparql.core.Quad import java.io.{InputStream, OutputStream} @@ -36,6 +39,11 @@ case class RdfFromJellyOptions( "If not explicitly specified, but output file supplied, the format is inferred from the file name. " + RdfFromJellyPrint.helpMsg, ) @ExtraName("out-format") outputFormat: Option[String] = None, + @HelpMessage( + "Frame indices to include in the output. If not specified, all frames are included. " + + IndexRange.helpText, + ) + takeFrames: String = "", ) extends HasJellyCommandOptions object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writeable]: @@ -49,7 +57,11 @@ object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writ val defaultAction: (InputStream, OutputStream) => Unit = jellyToLang(RdfFormat.NQuads.jenaLang, _, _) + private def takeFrames: IndexRange = IndexRange(getOptions.takeFrames, "--take-frames") + override def doRun(options: RdfFromJellyOptions, remainingArgs: RemainingArgs): Unit = + // Parse options now to make sure they are valid + takeFrames val (inputStream, outputStream) = this.getIoStreamsFromOptions(remainingArgs.remaining.headOption, options.outputFile) parseFormatArgs(inputStream, outputStream, options.outputFormat, options.outputFile) @@ -75,8 +87,34 @@ object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writ inputStream: InputStream, outputStream: OutputStream, ): Unit = - val nQuadWriter = StreamRDFWriter.getWriterStream(outputStream, jenaLang) - RDFParser.source(inputStream).lang(JellyLanguage.JELLY).parse(nQuadWriter) + val writer = StreamRDFWriter.getWriterStream(outputStream, jenaLang) + // Whether the output is active at this moment + var outputEnabled = false + val decoder = JenaConverterFactory.anyStatementDecoder( + // Only pass on the namespaces to the writer if the output is enabled + namespaceHandler = (String, Node) => { + if outputEnabled then writer.prefix(String, Node.getURI) + }, + ) + val inputFrames = takeFrames.end match + case Some(end) => JellyUtil.iterateRdfStream(inputStream).take(end) + case None => JellyUtil.iterateRdfStream(inputStream) + val startFrom = takeFrames.start.getOrElse(0) + for (frame, i) <- inputFrames.zipWithIndex do + // If we are not yet in the output range, still fully parse the frame and update the decoder + // state. We need this to decode the later frames correctly. + if i < startFrom then for row <- frame.rows do decoder.ingestRowFlat(row) + else + // TODO: write frame index as a comment here + // https://github.com/Jelly-RDF/cli/issues/4 + outputEnabled = true + // We are in the output range, so we can start writing the output + for row <- frame.rows do + decoder.ingestRowFlat(row) match + case null => () + case t: Triple => writer.triple(t) + case q: Quad => writer.quad(q) + writer.finish() /** This method reads the Jelly file, rewrites it to Jelly text and writes it to some output * stream @@ -96,9 +134,10 @@ object RdfFromJelly extends RdfSerDesCommand[RdfFromJellyOptions, RdfFormat.Writ outputStream.write(frame.getBytes) try { - JellyUtil.iterateRdfStream(inputStream).zipWithIndex.foreach { - case (maybeFrame, frameIndex) => - writeFrameToOutput(maybeFrame, frameIndex) + val it = JellyUtil.iterateRdfStream(inputStream) + .zipWithIndex + takeFrames.slice(it).foreach { case (maybeFrame, frameIndex) => + writeFrameToOutput(maybeFrame, frameIndex) } } finally { outputStream.flush() diff --git a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala index a6003bd..e3dd6f3 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/command/rdf/RdfValidate.scala @@ -48,8 +48,7 @@ case class RdfValidateOptions( compareOrdered: Boolean = false, @HelpMessage( "Frame indices to compare. If not specified, all frames are compared. " + - "The indices are 0-based and can be specified as a Rust-style range: " + - "'..3', '3..', '1..5', '4..=6'", + IndexRange.helpText, ) compareFrameIndices: String = "", @HelpMessage( diff --git a/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala b/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala index a841ab2..3da6b16 100644 --- a/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala +++ b/src/main/scala/eu/neverblink/jelly/cli/util/args/IndexRange.scala @@ -4,14 +4,21 @@ import eu.neverblink.jelly.cli.InvalidArgument import scala.collection.IterableOnceOps +/** Represents a range of indices, similar to Rust-style ranges. + * @param start + * start index (inclusive) + * @param end + * end index (exclusive) + */ final case class IndexRange( start: Option[Int], end: Option[Int], ): def slice[T, C <: IterableOnceOps[T, ?, C]](it: C): C = val startIndex = start.getOrElse(0) - val endIndex = end.getOrElse(it.size) - it.slice(startIndex, endIndex) + this.end match + case None => it.drop(startIndex) + case Some(endIndex) => it.slice(startIndex, endIndex) /** Parser for Rust-style index ranges. */ @@ -46,3 +53,6 @@ object IndexRange: "'3..' (from inclusive), or '1..3' (range up to exclusive), or '1..=3' (inclusive)", ), ) + + val helpText: String = "The indices are 0-based and can be specified as a Rust-style range: " + + "'..3', '3..', '1..5', '4..=6'" diff --git a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala index 1d3fcce..55a263f 100644 --- a/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala +++ b/src/test/scala/eu/neverblink/jelly/cli/command/rdf/RdfFromJellySpec.scala @@ -4,10 +4,13 @@ import com.google.protobuf.InvalidProtocolBufferException import eu.neverblink.jelly.cli.* import eu.neverblink.jelly.cli.command.helpers.* import eu.neverblink.jelly.cli.command.rdf.util.RdfFormat +import eu.ostrzyciel.jelly.core.proto.v1.{PhysicalStreamType, RdfStreamFrame} +import eu.ostrzyciel.jelly.core.{JellyOptions, ProtoTranscoder} import org.apache.jena.riot.RDFLanguages import org.scalatest.matchers.should.Matchers import org.scalatest.wordspec.AnyWordSpec +import java.io.{ByteArrayInputStream, ByteArrayOutputStream} import java.nio.file.attribute.PosixFilePermissions import java.nio.file.{Files, Paths} import scala.io.Source @@ -17,6 +20,21 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: protected val testCardinality: Int = 33 + // Make a test input stream with 10 frames... all are the same, but it doesn't matter + private val input10Frames: Array[Byte] = { + val j1 = DataGenHelper.generateJellyBytes(testCardinality) + val f1 = RdfStreamFrame.parseDelimitedFrom(ByteArrayInputStream(j1)).get + val os = ByteArrayOutputStream() + // Need to use the transcoder to make sure the lookup IDs are correct + val transcoder = ProtoTranscoder.fastMergingTranscoderUnsafe( + outputOptions = JellyOptions.bigGeneralized.withPhysicalType( + PhysicalStreamType.TRIPLES, + ), + ) + for _ <- 0 until 10 do transcoder.ingestFrame(f1).writeDelimitedTo(os) + os.toByteArray + } + "rdf from-jelly command" should { "handle conversion of Jelly to NTriples" when { "a file to output stream" in withFullJellyFile { j => @@ -39,6 +57,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: val sortedQuads = nQuadString.split("\n").map(_.trim).sorted sortedOut should contain theSameElementsAs sortedQuads } + "a file to file" in withFullJellyFile { j => withEmptyJenaFile { q => val nQuadString = DataGenHelper.generateJenaString(testCardinality) @@ -54,6 +73,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: out.length should be(0) } } + "a file to file when defaulting to nQuads" in withFullJellyFile { j => withEmptyRandomFile { q => val nQuadString = DataGenHelper.generateJenaString(testCardinality) @@ -69,6 +89,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: out.length should be(0) } } + "an input stream to file" in withEmptyJenaFile { q => val input = DataGenHelper.generateJellyInputStream(testCardinality) RdfFromJelly.setStdIn(input) @@ -82,7 +103,35 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: sortedOut should contain theSameElementsAs sortedQuads out.length should be(0) } + + "input stream of 10 frames to output stream, --take-frames=''" in { + RdfFromJelly.setStdIn(ByteArrayInputStream(input10Frames)) + val (out, err) = RdfFromJelly.runTestCommand( + List("rdf", "from-jelly", "--out-format", "nt", "--take-frames", ""), + ) + val outSize = out.split("\n").length + outSize should be(10 * testCardinality) + } + + "input stream of 10 frames to output stream, --take-frames=7" in { + RdfFromJelly.setStdIn(ByteArrayInputStream(input10Frames)) + val (out, err) = RdfFromJelly.runTestCommand( + List("rdf", "from-jelly", "--out-format", "nt", "--take-frames", "7"), + ) + val outSize = out.split("\n").length + outSize should be(testCardinality) + } + + "input stream of 10 frames to output stream, --take-frames=3..=5" in { + RdfFromJelly.setStdIn(ByteArrayInputStream(input10Frames)) + val (out, err) = RdfFromJelly.runTestCommand( + List("rdf", "from-jelly", "--out-format", "nt", "--take-frames", "3..=5"), + ) + val outSize = out.split("\n").length + outSize should be(3 * testCardinality) + } } + "handle conversion of Jelly binary to text" when { "a file to output stream" in withFullJellyFile { j => val (out, err) = @@ -113,6 +162,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: "rows".r.findAllIn(out).length should be(70) "http://example.org/predicate/".r.findAllIn(out).length should be(1) } + "a file to file when inferred type" in withFullJellyFile { j => withEmptyJellyTextFile { t => val (out, err) = @@ -147,9 +197,27 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: "rows".r.findAllIn(inTxt).length should be(70) "http://example.org/predicate/".r.findAllIn(inTxt).length should be(1) } + } + "input stream (10 frames) to output stream --take-frames=3..=5" in withFullJellyFile { j => + RdfFromJelly.setStdIn(ByteArrayInputStream(input10Frames)) + val (out, err) = RdfFromJelly.runTestCommand( + List( + "rdf", + "from-jelly", + "--out-format=jelly-text", + "--take-frames=3..=5", + ), + ) + + out should not include "# Frame 0" + out should include("# Frame 3") + out should include("# Frame 4") + out should include("# Frame 5") + "rows".r.findAllIn(out).length should be(3 * testCardinality) } } + "throw proper exception" when { "input file is not found" in { val nonExist = "non-existing-file" @@ -161,6 +229,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: RdfFromJelly.getErrString should include(msg) exception.code should be(1) } + "input file is not accessible" in withFullJellyFile { j => val permissions = PosixFilePermissions.fromString("---------") Files.setPosixFilePermissions( @@ -176,6 +245,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: RdfFromJelly.getErrString should include(msg) exception.code should be(1) } + "output file cannot be created" in withFullJellyFile { j => withEmptyJenaFile { q => Paths.get(q).toFile.setWritable(false) @@ -190,10 +260,9 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: Paths.get(q).toFile.setWritable(true) RdfFromJelly.getErrString should include(msg) exception.code should be(1) - } - } + "deserializing error occurs" in withFullJellyFile { j => withEmptyJenaFile { q => RdfFromJelly.runTestCommand( @@ -212,6 +281,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: exception.code should be(1) } } + "parsing error occurs with debug set" in withFullJellyFile { j => withEmptyJenaFile { q => RdfFromJelly.runTestCommand( @@ -230,6 +300,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: exception.code should be(1) } } + "invalid output format supplied" in withFullJellyFile { j => withEmptyJenaFile { q => val exception = @@ -243,6 +314,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: exception.code should be(1) } } + "invalid but known output format supplied" in withFullJellyFile { j => withEmptyJellyFile { q => val exception = @@ -267,6 +339,7 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: exception.code should be(1) } } + "readable but not writable format supplied" in withFullJellyFile { j => withEmptyJenaFile( testCode = { q => @@ -294,5 +367,16 @@ class RdfFromJellySpec extends AnyWordSpec with Matchers with TestFixtureHelper: jenaLang = RDFLanguages.RDFXML, ) } + + "invalid --take-frames argument provided" in { + val e = intercept[ExitException] { + RdfFromJelly.runTestCommand( + List("rdf", "from-jelly", "--out-format", "nt", "--take-frames", "invalid"), + ) + } + val cause = e.getCause.asInstanceOf[InvalidArgument] + cause.argument should be("--take-frames") + cause.argumentValue should be("invalid") + } } }