From da14ea5c326c44a10439439f05cef6a9ac507836 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 10:48:11 +0000 Subject: [PATCH 1/2] feat: add Expected DSL and addToEvaluationSet for SDK evaluation submission Let developers attach an expected structured output so a published run is auto-stored as an evaluation set on the platform. - Add Expected(evaluationSet, output) wrapper in core - start(..., expected = Expected(...)) threads the expected output to the gateway via the new RunDto.evaluation field (RunEvaluationDto) - Capture the created run id from the 201 RunCreatedDto body and expose it as AgentRun.platformRunId (nullable; tolerates empty 201 bodies from old gateways) - Add deferred Agent.addToEvaluationSet(runId, evaluationSet, expected) posting to /gateway/runs/{runId}/evaluation, with EvaluationSubmitResult - Regenerate gateway wirespec types (RunEvaluationDto, RunCreatedDto, AddToEvaluationSet endpoint) --- .../aigentic/core/agent/AgentExecutor.kt | 56 +++- .../flock/aigentic/core/agent/Expected.kt | 6 + .../flock/aigentic/core/agent/Run.kt | 2 + .../flock/aigentic/core/platform/Platform.kt | 34 ++- .../platform/client/PlatformClient.kt | 67 ++++- .../aigentic/platform/mapper/RequestMapper.kt | 10 + .../client/AigenticPlatformClientTest.kt | 7 +- .../client/EvaluationSubmissionTest.kt | 263 ++++++++++++++++++ src/platform/wirespec/gateway.ws | 22 +- 9 files changed, 443 insertions(+), 24 deletions(-) create mode 100644 src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Expected.kt create mode 100644 src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt index 6b540d223..7a4aa108e 100644 --- a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt @@ -33,7 +33,9 @@ import community.flock.aigentic.core.message.ToolCall import community.flock.aigentic.core.message.asJson import community.flock.aigentic.core.message.mapToTextMessages import community.flock.aigentic.core.model.ModelResponse +import community.flock.aigentic.core.platform.EvaluationSubmitResult import community.flock.aigentic.core.platform.RunSentResult +import community.flock.aigentic.core.platform.addToEvaluationSet import community.flock.aigentic.core.platform.getRuns import community.flock.aigentic.core.platform.sendRun import community.flock.aigentic.core.tool.Parameter @@ -51,6 +53,7 @@ suspend inline fun Agent.start(vararg a suspend inline fun Agent.start( input: I? = null, vararg attachments: Attachment, + expected: Expected? = null, ): AgentRun = coroutineScope { val agent = this@start @@ -59,13 +62,13 @@ suspend inline fun Agent.start( val logging = async { state.getStatus().map { it.text }.collect(::println) } try { val run = executeAction(Initialize(state, agent, input, attachments.toList())).toRun() - publishRun(agent, run, state) - run + val platformRunId = publishRun(agent, run, state, expected) + run.copy(platformRunId = platformRunId) } catch (e: AigenticException) { state.events.emit(AgentStatus.Fatal(e.message)) val run = (state to Outcome.Fatal(e.message)).toRun() - publishRun(agent, run, state) - run + val platformRunId = publishRun(agent, run, state, expected) + run.copy(platformRunId = platformRunId) } finally { delay(10) // Allow some time for the logging to finish logging.cancelAndJoin() @@ -77,22 +80,45 @@ internal suspend inline fun publishRun( agent: Agent, run: AgentRun, state: State, -) { - if (agent.platform != null) { - runCatching { - agent.platform.sendRun(run, agent) - }.onSuccess { result -> + expected: Expected?, +): RunId? { + if (agent.platform == null) return null + return runCatching { + agent.platform.sendRun(run, agent, expected) + }.fold( + onSuccess = { result -> when (result) { - RunSentResult.Success -> state.events.emit(AgentStatus.PublishedRunSuccess) - RunSentResult.Unauthorized -> state.events.emit(AgentStatus.PublishedRunUnauthorized) - is RunSentResult.Error -> state.events.emit(AgentStatus.PublishedRunError(result.message)) + is RunSentResult.Success -> { + state.events.emit(AgentStatus.PublishedRunSuccess) + result.runId + } + + RunSentResult.Unauthorized -> { + state.events.emit(AgentStatus.PublishedRunUnauthorized) + null + } + + is RunSentResult.Error -> { + state.events.emit(AgentStatus.PublishedRunError(result.message)) + null + } } - }.onFailure { exception -> + }, + onFailure = { exception -> state.events.emit(AgentStatus.PublishedRunError(exception.message ?: "Unknown error")) - } - } + null + }, + ) } +suspend inline fun Agent.addToEvaluationSet( + runId: RunId, + evaluationSet: String, + expected: O, +): EvaluationSubmitResult = + platform?.addToEvaluationSet(runId, evaluationSet, expected) + ?: aigenticException("Platform must be configured to add a run to an evaluation set") + suspend inline fun executeAction(action: Action): Pair> { var currentAction = action while (true) { diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Expected.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Expected.kt new file mode 100644 index 000000000..98869c1b4 --- /dev/null +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Expected.kt @@ -0,0 +1,6 @@ +package community.flock.aigentic.core.agent + +data class Expected( + val evaluationSet: String, + val output: O, +) diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Run.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Run.kt index 16788cc73..daa356156 100644 --- a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Run.kt +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/Run.kt @@ -25,6 +25,7 @@ data class AgentRun( override val modelRequests: List, val systemPromptMessage: Message.SystemPrompt, val exampleRunIds: List = emptyList(), + val platformRunId: RunId? = null, ) : Run() data class WorkflowRun( @@ -120,5 +121,6 @@ internal inline fun AgentRun.decode(): AgentRun { modelRequests = modelRequests, exampleRunIds = exampleRunIds, systemPromptMessage = systemPromptMessage, + platformRunId = platformRunId, ) } diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt index a7fb0f183..4672f6503 100644 --- a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt @@ -2,6 +2,7 @@ package community.flock.aigentic.core.platform import community.flock.aigentic.core.agent.Agent import community.flock.aigentic.core.agent.AgentRun +import community.flock.aigentic.core.agent.Expected import community.flock.aigentic.core.agent.RunId import community.flock.aigentic.core.agent.RunTag import community.flock.aigentic.core.agent.decode @@ -30,8 +31,16 @@ interface PlatformClient { run: AgentRun, agent: Agent, outputSerializer: KSerializer, + expected: Expected?, ): RunSentResult + suspend fun addToEvaluationSet( + runId: RunId, + evaluationSet: String, + expected: O, + outputSerializer: KSerializer, + ): EvaluationSubmitResult + suspend fun getRuns(tags: List): List>> } @@ -44,7 +53,14 @@ interface Platform { suspend inline fun Platform.sendRun( run: AgentRun, agent: Agent, -): RunSentResult = client.sendRun(run, agent, serializer()) + expected: Expected? = null, +): RunSentResult = client.sendRun(run, agent, serializer(), expected) + +suspend inline fun Platform.addToEvaluationSet( + runId: RunId, + evaluationSet: String, + expected: O, +): EvaluationSubmitResult = client.addToEvaluationSet(runId, evaluationSet, expected, serializer()) suspend inline fun Platform.getRuns(tags: List): List>> = client @@ -54,7 +70,9 @@ suspend inline fun Platform.getRuns(tags: List): List< } sealed interface RunSentResult { - data object Success : RunSentResult + data class Success( + val runId: RunId?, + ) : RunSentResult data object Unauthorized : RunSentResult @@ -62,3 +80,15 @@ sealed interface RunSentResult { val message: String, ) : RunSentResult } + +sealed interface EvaluationSubmitResult { + data object Success : EvaluationSubmitResult + + data object Unauthorized : EvaluationSubmitResult + + data object NotFound : EvaluationSubmitResult + + data class Error( + val message: String, + ) : EvaluationSubmitResult +} diff --git a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt index 08b503865..0011e250a 100644 --- a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt +++ b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt @@ -2,15 +2,20 @@ package community.flock.aigentic.platform.client import community.flock.aigentic.core.agent.Agent import community.flock.aigentic.core.agent.AgentRun +import community.flock.aigentic.core.agent.Expected import community.flock.aigentic.core.agent.RunId import community.flock.aigentic.core.agent.RunTag import community.flock.aigentic.core.exception.aigenticException import community.flock.aigentic.core.platform.Authentication +import community.flock.aigentic.core.platform.EvaluationSubmitResult import community.flock.aigentic.core.platform.PlatformApiUrl import community.flock.aigentic.core.platform.PlatformClient import community.flock.aigentic.core.platform.RunSentResult +import community.flock.aigentic.gateway.wirespec.endpoint.AddToEvaluationSet import community.flock.aigentic.gateway.wirespec.endpoint.Gateway import community.flock.aigentic.gateway.wirespec.endpoint.GetRuns +import community.flock.aigentic.gateway.wirespec.model.RunCreatedDto +import community.flock.aigentic.gateway.wirespec.model.RunEvaluationDto import community.flock.aigentic.platform.mapper.toDto import community.flock.aigentic.platform.mapper.toRun import community.flock.wirespec.kotlin.Wirespec @@ -45,7 +50,8 @@ import kotlin.reflect.KType interface PlatformEndpoints : Gateway.Handler, - GetRuns.Handler + GetRuns.Handler, + AddToEvaluationSet.Handler const val defaultPlatformApiUrl = "https://aigentic-backend-kib53ypjwq-ez.a.run.app/" @@ -58,12 +64,17 @@ class AigenticPlatformClient( run: AgentRun, agent: Agent, outputSerializer: KSerializer, + expected: Expected?, ): RunSentResult { - val runDto = run.toDto(agent, outputSerializer) + val runDto = run.toDto(agent, outputSerializer, expected) val request = Gateway.Request(body = runDto) return when (val response = endpoints.gateway(request)) { is Gateway.Response201 -> { - RunSentResult.Success + RunSentResult.Success( + response.body.runId + .takeIf { it.isNotBlank() } + ?.let(::RunId), + ) } is Gateway.Response401 -> { @@ -82,6 +93,44 @@ class AigenticPlatformClient( } } + override suspend fun addToEvaluationSet( + runId: RunId, + evaluationSet: String, + expected: O, + outputSerializer: KSerializer, + ): EvaluationSubmitResult { + val request = + AddToEvaluationSet.Request( + runId = runId.value, + body = + RunEvaluationDto( + evaluationSet = evaluationSet, + expectedResponse = Json.encodeToString(outputSerializer, expected), + ), + ) + return when (val response = endpoints.addToEvaluationSet(request)) { + is AddToEvaluationSet.Response200 -> { + EvaluationSubmitResult.Success + } + + is AddToEvaluationSet.Response401 -> { + EvaluationSubmitResult.Unauthorized + } + + is AddToEvaluationSet.Response404 -> { + EvaluationSubmitResult.NotFound + } + + is AddToEvaluationSet.Response400 -> { + EvaluationSubmitResult.Error(response.body.message) + } + + is AddToEvaluationSet.Response500 -> { + EvaluationSubmitResult.Error("${response.body.name} - ${response.body.description}") + } + } + } + override suspend fun getRuns(tags: List): List>> = when (val response = endpoints.getRuns(GetRuns.Request(tags = tags.joinToString(",") { it.value }))) { is GetRuns.Response200 -> response.body @@ -166,6 +215,11 @@ class AigenticPlatformEndpoints( val edge = Gateway.Handler.client(serialization) val rawRequest = edge.to(request) val rawResponse = executeRequest(rawRequest) + // Backward compatibility: old gateways respond 201 with an empty body (no RunCreatedDto). + // Surface it as a Response201 with an empty runId so the run id stays null instead of throwing. + if (rawResponse.statusCode == 201 && rawResponse.body?.isEmpty() != false) { + return Gateway.Response201(RunCreatedDto(runId = "")) + } return edge.from(rawResponse) } @@ -176,6 +230,13 @@ class AigenticPlatformEndpoints( return edge.from(rawResponse) } + override suspend fun addToEvaluationSet(request: AddToEvaluationSet.Request): AddToEvaluationSet.Response<*> { + val edge = AddToEvaluationSet.Handler.client(serialization) + val rawRequest = edge.to(request) + val rawResponse = executeRequest(rawRequest) + return edge.from(rawResponse) + } + private suspend fun executeRequest(rawRequest: Wirespec.RawRequest): Wirespec.RawResponse { val response = httpClient.request { diff --git a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/mapper/RequestMapper.kt b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/mapper/RequestMapper.kt index 2063b38f6..a2d20fb96 100644 --- a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/mapper/RequestMapper.kt +++ b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/mapper/RequestMapper.kt @@ -2,6 +2,7 @@ package community.flock.aigentic.platform.mapper import community.flock.aigentic.core.agent.Agent import community.flock.aigentic.core.agent.AgentRun +import community.flock.aigentic.core.agent.Expected import community.flock.aigentic.core.agent.state.ModelRequestInfo import community.flock.aigentic.core.agent.tool.Outcome import community.flock.aigentic.core.message.Message @@ -34,6 +35,7 @@ import community.flock.aigentic.gateway.wirespec.model.PrimitiveValueNumberDto import community.flock.aigentic.gateway.wirespec.model.PrimitiveValueStringDto import community.flock.aigentic.gateway.wirespec.model.PrimitiveValueTypeDto import community.flock.aigentic.gateway.wirespec.model.RunDto +import community.flock.aigentic.gateway.wirespec.model.RunEvaluationDto import community.flock.aigentic.gateway.wirespec.model.SenderDto import community.flock.aigentic.gateway.wirespec.model.StructuredOutputMessageDto import community.flock.aigentic.gateway.wirespec.model.StuckResultDto @@ -65,6 +67,7 @@ private fun Parameter.toJsonSchemaString(): String = fun AgentRun.toDto( agent: Agent, outputSerializer: KSerializer, + expected: Expected? = null, ): RunDto = RunDto( startedAt = startedAt.toString(), @@ -99,6 +102,13 @@ fun AgentRun.toDto( messages = messages.mapNotNull { it.toDto() }, modelRequests = modelRequests.map { it.toDto() }, result = outcome.toDto(outputSerializer), + evaluation = + expected?.let { + RunEvaluationDto( + evaluationSet = it.evaluationSet, + expectedResponse = Json.encodeToString(outputSerializer, it.output), + ) + }, ) private fun Parameter.toDto(): ParameterDto = diff --git a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt index f2bb0f1de..52307b542 100644 --- a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt +++ b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt @@ -1,10 +1,12 @@ package community.flock.aigentic.platform.client +import community.flock.aigentic.core.agent.RunId import community.flock.aigentic.core.platform.Authentication import community.flock.aigentic.core.platform.PlatformApiUrl import community.flock.aigentic.core.platform.RunSentResult import community.flock.aigentic.gateway.wirespec.endpoint.Gateway import community.flock.aigentic.gateway.wirespec.model.GatewayClientErrorDto +import community.flock.aigentic.gateway.wirespec.model.RunCreatedDto import community.flock.aigentic.gateway.wirespec.model.ServerErrorDto import community.flock.aigentic.platform.util.createAgent import community.flock.aigentic.platform.util.createAgentRun @@ -20,7 +22,8 @@ class AigenticPlatformClientTest : withData( nameFn = { "Should map ${it.wirespecResponse} to ${it.runSentResult}" }, - TestCase(Gateway.Response201(body = Unit), RunSentResult.Success), + TestCase(Gateway.Response201(body = RunCreatedDto("run-123")), RunSentResult.Success(RunId("run-123"))), + TestCase(Gateway.Response201(body = RunCreatedDto("")), RunSentResult.Success(null)), TestCase(Gateway.Response401(body = Unit), RunSentResult.Unauthorized), TestCase( Gateway.Response400(body = GatewayClientErrorDto("invalid request")), @@ -50,7 +53,7 @@ class AigenticPlatformClientTest : platformEndpoints, ) - val result = client.sendRun(run, agent, serializer()) + val result = client.sendRun(run, agent, serializer(), null) result shouldBe it.runSentResult } diff --git a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt new file mode 100644 index 000000000..d5dfdf8e4 --- /dev/null +++ b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt @@ -0,0 +1,263 @@ +package community.flock.aigentic.platform.client + +import community.flock.aigentic.core.agent.Expected +import community.flock.aigentic.core.agent.RunId +import community.flock.aigentic.core.agent.addToEvaluationSet +import community.flock.aigentic.core.agent.start +import community.flock.aigentic.core.agent.tool.FINISHED_TASK_TOOL_NAME +import community.flock.aigentic.core.annotations.AigenticParameter +import community.flock.aigentic.core.dsl.agent +import community.flock.aigentic.core.message.Message +import community.flock.aigentic.core.message.ToolCall +import community.flock.aigentic.core.message.ToolCallId +import community.flock.aigentic.core.model.GenerationSettings +import community.flock.aigentic.core.model.Model +import community.flock.aigentic.core.model.ModelIdentifier +import community.flock.aigentic.core.model.ModelResponse +import community.flock.aigentic.core.model.Usage +import community.flock.aigentic.core.platform.Authentication +import community.flock.aigentic.core.platform.EvaluationSubmitResult +import community.flock.aigentic.core.platform.Platform +import community.flock.aigentic.core.platform.PlatformApiUrl +import community.flock.aigentic.core.tool.Parameter +import community.flock.aigentic.core.tool.ToolDescription +import community.flock.aigentic.gateway.wirespec.model.RunDto +import community.flock.aigentic.gateway.wirespec.model.RunEvaluationDto +import community.flock.aigentic.platform.AigenticPlatform +import community.flock.aigentic.platform.mapper.toDto +import community.flock.aigentic.platform.util.createAgent +import community.flock.aigentic.platform.util.createAgentRun +import io.kotest.core.spec.style.DescribeSpec +import io.kotest.matchers.nulls.shouldNotBeNull +import io.kotest.matchers.shouldBe +import io.ktor.client.engine.mock.MockEngine +import io.ktor.client.engine.mock.respond +import io.ktor.client.engine.mock.toByteArray +import io.ktor.client.request.HttpRequestData +import io.ktor.http.HttpHeaders +import io.ktor.http.HttpMethod +import io.ktor.http.HttpStatusCode +import io.ktor.http.headersOf +import io.ktor.utils.io.ByteReadChannel +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonObject +import kotlinx.serialization.json.JsonPrimitive +import kotlinx.serialization.json.buildJsonObject +import kotlinx.serialization.serializer + +@AigenticParameter +private data class InvoiceFields( + val invoiceNumber: String, + val total: String, +) + +private val lenientJson = Json { ignoreUnknownKeys = true } + +private val finishedInvoice = InvoiceFields("INV-001", "1250.00") + +private fun finishedTaskToolCall(): ToolCall = + ToolCall( + ToolCallId("1"), + FINISHED_TASK_TOOL_NAME, + Json.encodeToString( + JsonObject.serializer(), + buildJsonObject { + put("description", JsonPrimitive("Finished the task")) + put( + "InvoiceFields", + buildJsonObject { + put("invoiceNumber", JsonPrimitive(finishedInvoice.invoiceNumber)) + put("total", JsonPrimitive(finishedInvoice.total)) + }, + ) + }, + ), + ) + +private val finishedInvoiceModel: Model = + object : Model { + override val modelIdentifier: ModelIdentifier = + object : ModelIdentifier { + override val stringValue = "test-model" + } + override val generationSettings = GenerationSettings.DEFAULT + + override suspend fun sendRequest( + messages: List, + tools: List, + structuredOutputParameter: Parameter?, + ): ModelResponse = + ModelResponse( + message = Message.ToolCalls(listOf(finishedTaskToolCall())), + usage = Usage(inputTokenCount = 1, outputTokenCount = 1, thinkingOutputTokenCount = 0), + ) + } + +private fun structuredAgent(platform: Platform) = + agent { + platform(platform) + model(finishedInvoiceModel) + task("Extract the invoice fields") {} + } + +private fun mockPlatform(engine: MockEngine): Platform { + val auth = Authentication.BasicAuth("user", "pass") + val url = PlatformApiUrl("") + return AigenticPlatform( + authentication = auth, + apiUrl = url, + client = + AigenticPlatformClient( + basicAuth = auth, + apiUrl = url, + endpoints = AigenticPlatformEndpoints(auth, url, engine), + ), + ) +} + +private suspend fun HttpRequestData.bodyText(): String = body.toByteArray().decodeToString() + +class EvaluationSubmissionTest : + DescribeSpec({ + + describe("RequestMapper evaluation field") { + + it("builds RunEvaluationDto with evaluationSet and serialized output when expected is present") { + val agent = createAgent() + val run = createAgentRun() + + val dto = + run.toDto( + agent, + serializer(), + Expected(evaluationSet = "golden-set", output = "the-expected-output"), + ) + + val evaluation = dto.evaluation.shouldNotBeNull() + evaluation.evaluationSet shouldBe "golden-set" + evaluation.expectedResponse shouldBe Json.encodeToString(serializer(), "the-expected-output") + } + + it("leaves evaluation null when expected is absent") { + val agent = createAgent() + val run = createAgentRun() + + val dto = run.toDto(agent, serializer(), null) + + dto.evaluation shouldBe null + } + } + + describe("start with expected") { + + it("puts the serialized expected output and evaluationSet in the POST body") { + var capturedBody: String? = null + val engine = + MockEngine { request -> + when (request.method to request.url.encodedPath) { + (HttpMethod.Post to "/gateway/runs") -> { + capturedBody = request.bodyText() + respond( + content = ByteReadChannel("""{"runId":"run-1"}"""), + status = HttpStatusCode.Created, + headers = headersOf(HttpHeaders.ContentType, "application/json"), + ) + } + + else -> { + error("Unexpected endpoint called! ${request.url.encodedPath}") + } + } + } + + val agent = structuredAgent(mockPlatform(engine)) + + agent.start( + expected = Expected(evaluationSet = "invoice-golden-set", output = finishedInvoice), + ) + + val body = capturedBody.shouldNotBeNull() + val runDto = lenientJson.decodeFromString(RunDto.serializer(), body) + val evaluation = runDto.evaluation.shouldNotBeNull() + evaluation.evaluationSet shouldBe "invoice-golden-set" + evaluation.expectedResponse shouldBe Json.encodeToString(serializer(), finishedInvoice) + } + + it("populates run.platformRunId from the 201 RunCreatedDto body") { + val engine = + MockEngine { request -> + when (request.method to request.url.encodedPath) { + (HttpMethod.Post to "/gateway/runs") -> { + respond( + content = ByteReadChannel("""{"runId":"run-42"}"""), + status = HttpStatusCode.Created, + headers = headersOf(HttpHeaders.ContentType, "application/json"), + ) + } + + else -> { + error("Unexpected endpoint called! ${request.url.encodedPath}") + } + } + } + + val agent = structuredAgent(mockPlatform(engine)) + + val run = agent.start() + + run.platformRunId shouldBe RunId("run-42") + } + + it("leaves run.platformRunId null when the 201 body is empty (old gateways)") { + val engine = + MockEngine { request -> + when (request.method to request.url.encodedPath) { + (HttpMethod.Post to "/gateway/runs") -> { + respond(content = ByteReadChannel.Empty, status = HttpStatusCode.Created) + } + + else -> { + error("Unexpected endpoint called! ${request.url.encodedPath}") + } + } + } + + val agent = structuredAgent(mockPlatform(engine)) + + val run = agent.start() + + run.platformRunId shouldBe null + } + } + + describe("addToEvaluationSet") { + + it("POSTs the serialized expected output to /gateway/runs/{runId}/evaluation") { + var capturedPath: String? = null + var capturedBody: String? = null + val engine = + MockEngine { request -> + capturedPath = request.url.encodedPath + capturedBody = request.bodyText() + respond(content = ByteReadChannel.Empty, status = HttpStatusCode.OK) + } + + val agent = structuredAgent(mockPlatform(engine)) + + val result = + agent.addToEvaluationSet( + runId = RunId("run-99"), + evaluationSet = "golden", + expected = InvoiceFields("INV-009", "9.99"), + ) + + result shouldBe EvaluationSubmitResult.Success + capturedPath shouldBe "/gateway/runs/run-99/evaluation" + val body = capturedBody.shouldNotBeNull() + val evaluationDto = lenientJson.decodeFromString(RunEvaluationDto.serializer(), body) + evaluationDto.evaluationSet shouldBe "golden" + evaluationDto.expectedResponse shouldBe + Json.encodeToString(serializer(), InvoiceFields("INV-009", "9.99")) + } + } + }) diff --git a/src/platform/wirespec/gateway.ws b/src/platform/wirespec/gateway.ws index aa7518752..014dd6d3c 100644 --- a/src/platform/wirespec/gateway.ws +++ b/src/platform/wirespec/gateway.ws @@ -4,7 +4,8 @@ type RunDto { config: ConfigDto, result: ResultDto, messages: MessageDto[], - modelRequests: ModelRequestInfoDto[] + modelRequests: ModelRequestInfoDto[], + evaluation: RunEvaluationDto? } type ModelRequestInfoDto { @@ -213,8 +214,17 @@ type ServerErrorDto { description: String } +type RunEvaluationDto { + evaluationSet: String, + expectedResponse: String +} + +type RunCreatedDto { + runId: String +} + endpoint Gateway POST RunDto /gateway/runs -> { - 201 -> Unit + 201 -> RunCreatedDto 401 -> Unit 400 -> GatewayClientErrorDto 500 -> ServerErrorDto @@ -236,3 +246,11 @@ endpoint GetRuns GET /gateway/runs ? { tags: String? } -> { 401 -> Unit 500 -> ServerErrorDto } + +endpoint AddToEvaluationSet POST RunEvaluationDto /gateway/runs/{runId: String}/evaluation -> { + 200 -> Unit + 400 -> GatewayClientErrorDto + 401 -> Unit + 404 -> Unit + 500 -> ServerErrorDto +} From 361069f37c0b09de6e4ae02d75cdbef2f60a7aa8 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 5 Jun 2026 12:14:47 +0000 Subject: [PATCH 2/2] refactor: refine SDK evaluation submission API - make RunSentResult.Success.runId non-null; map an id-less 201 to an Error - accept a plain String runId in Agent.addToEvaluationSet - rename the deferred gateway endpoint to POST /gateway/runs/{runId}/annotations --- .../aigentic/core/agent/AgentExecutor.kt | 4 +-- .../flock/aigentic/core/platform/Platform.kt | 2 +- .../platform/client/PlatformClient.kt | 35 +++++++++---------- .../client/AigenticPlatformClientTest.kt | 5 ++- .../client/EvaluationSubmissionTest.kt | 6 ++-- src/platform/wirespec/gateway.ws | 2 +- 6 files changed, 28 insertions(+), 26 deletions(-) diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt index 7a4aa108e..1db7a87a3 100644 --- a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/agent/AgentExecutor.kt @@ -112,11 +112,11 @@ internal suspend inline fun publishRun( } suspend inline fun Agent.addToEvaluationSet( - runId: RunId, + runId: String, evaluationSet: String, expected: O, ): EvaluationSubmitResult = - platform?.addToEvaluationSet(runId, evaluationSet, expected) + platform?.addToEvaluationSet(RunId(runId), evaluationSet, expected) ?: aigenticException("Platform must be configured to add a run to an evaluation set") suspend inline fun executeAction(action: Action): Pair> { diff --git a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt index 4672f6503..8d78ec9ad 100644 --- a/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt +++ b/src/core/src/commonMain/kotlin/community/flock/aigentic/core/platform/Platform.kt @@ -71,7 +71,7 @@ suspend inline fun Platform.getRuns(tags: List): List< sealed interface RunSentResult { data class Success( - val runId: RunId?, + val runId: RunId, ) : RunSentResult data object Unauthorized : RunSentResult diff --git a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt index 0011e250a..d4cdd2e4c 100644 --- a/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt +++ b/src/platform/src/commonMain/kotlin/community/flock/aigentic/platform/client/PlatformClient.kt @@ -11,7 +11,7 @@ import community.flock.aigentic.core.platform.EvaluationSubmitResult import community.flock.aigentic.core.platform.PlatformApiUrl import community.flock.aigentic.core.platform.PlatformClient import community.flock.aigentic.core.platform.RunSentResult -import community.flock.aigentic.gateway.wirespec.endpoint.AddToEvaluationSet +import community.flock.aigentic.gateway.wirespec.endpoint.AddRunAnnotations import community.flock.aigentic.gateway.wirespec.endpoint.Gateway import community.flock.aigentic.gateway.wirespec.endpoint.GetRuns import community.flock.aigentic.gateway.wirespec.model.RunCreatedDto @@ -51,7 +51,7 @@ import kotlin.reflect.KType interface PlatformEndpoints : Gateway.Handler, GetRuns.Handler, - AddToEvaluationSet.Handler + AddRunAnnotations.Handler const val defaultPlatformApiUrl = "https://aigentic-backend-kib53ypjwq-ez.a.run.app/" @@ -70,11 +70,10 @@ class AigenticPlatformClient( val request = Gateway.Request(body = runDto) return when (val response = endpoints.gateway(request)) { is Gateway.Response201 -> { - RunSentResult.Success( - response.body.runId - .takeIf { it.isNotBlank() } - ?.let(::RunId), - ) + response.body.runId + .takeIf { it.isNotBlank() } + ?.let { RunSentResult.Success(RunId(it)) } + ?: RunSentResult.Error("Gateway accepted the run but returned no run id") } is Gateway.Response401 -> { @@ -100,7 +99,7 @@ class AigenticPlatformClient( outputSerializer: KSerializer, ): EvaluationSubmitResult { val request = - AddToEvaluationSet.Request( + AddRunAnnotations.Request( runId = runId.value, body = RunEvaluationDto( @@ -108,24 +107,24 @@ class AigenticPlatformClient( expectedResponse = Json.encodeToString(outputSerializer, expected), ), ) - return when (val response = endpoints.addToEvaluationSet(request)) { - is AddToEvaluationSet.Response200 -> { + return when (val response = endpoints.addRunAnnotations(request)) { + is AddRunAnnotations.Response200 -> { EvaluationSubmitResult.Success } - is AddToEvaluationSet.Response401 -> { + is AddRunAnnotations.Response401 -> { EvaluationSubmitResult.Unauthorized } - is AddToEvaluationSet.Response404 -> { + is AddRunAnnotations.Response404 -> { EvaluationSubmitResult.NotFound } - is AddToEvaluationSet.Response400 -> { + is AddRunAnnotations.Response400 -> { EvaluationSubmitResult.Error(response.body.message) } - is AddToEvaluationSet.Response500 -> { + is AddRunAnnotations.Response500 -> { EvaluationSubmitResult.Error("${response.body.name} - ${response.body.description}") } } @@ -215,8 +214,8 @@ class AigenticPlatformEndpoints( val edge = Gateway.Handler.client(serialization) val rawRequest = edge.to(request) val rawResponse = executeRequest(rawRequest) - // Backward compatibility: old gateways respond 201 with an empty body (no RunCreatedDto). - // Surface it as a Response201 with an empty runId so the run id stays null instead of throwing. + // Backward compatibility: older gateways answer 201 with an empty body (the previous `201 -> Unit`). + // Surface it as a 201 with a blank runId so it doesn't throw; sendRun maps the blank id to an Error. if (rawResponse.statusCode == 201 && rawResponse.body?.isEmpty() != false) { return Gateway.Response201(RunCreatedDto(runId = "")) } @@ -230,8 +229,8 @@ class AigenticPlatformEndpoints( return edge.from(rawResponse) } - override suspend fun addToEvaluationSet(request: AddToEvaluationSet.Request): AddToEvaluationSet.Response<*> { - val edge = AddToEvaluationSet.Handler.client(serialization) + override suspend fun addRunAnnotations(request: AddRunAnnotations.Request): AddRunAnnotations.Response<*> { + val edge = AddRunAnnotations.Handler.client(serialization) val rawRequest = edge.to(request) val rawResponse = executeRequest(rawRequest) return edge.from(rawResponse) diff --git a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt index 52307b542..b07520f60 100644 --- a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt +++ b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/AigenticPlatformClientTest.kt @@ -23,7 +23,10 @@ class AigenticPlatformClientTest : withData( nameFn = { "Should map ${it.wirespecResponse} to ${it.runSentResult}" }, TestCase(Gateway.Response201(body = RunCreatedDto("run-123")), RunSentResult.Success(RunId("run-123"))), - TestCase(Gateway.Response201(body = RunCreatedDto("")), RunSentResult.Success(null)), + TestCase( + Gateway.Response201(body = RunCreatedDto("")), + RunSentResult.Error("Gateway accepted the run but returned no run id"), + ), TestCase(Gateway.Response401(body = Unit), RunSentResult.Unauthorized), TestCase( Gateway.Response400(body = GatewayClientErrorDto("invalid request")), diff --git a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt index d5dfdf8e4..54d1f55a0 100644 --- a/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt +++ b/src/platform/src/jvmTest/kotlin/community/flock/aigentic/platform/client/EvaluationSubmissionTest.kt @@ -232,7 +232,7 @@ class EvaluationSubmissionTest : describe("addToEvaluationSet") { - it("POSTs the serialized expected output to /gateway/runs/{runId}/evaluation") { + it("POSTs the serialized expected output to /gateway/runs/{runId}/annotations") { var capturedPath: String? = null var capturedBody: String? = null val engine = @@ -246,13 +246,13 @@ class EvaluationSubmissionTest : val result = agent.addToEvaluationSet( - runId = RunId("run-99"), + runId = "run-99", evaluationSet = "golden", expected = InvoiceFields("INV-009", "9.99"), ) result shouldBe EvaluationSubmitResult.Success - capturedPath shouldBe "/gateway/runs/run-99/evaluation" + capturedPath shouldBe "/gateway/runs/run-99/annotations" val body = capturedBody.shouldNotBeNull() val evaluationDto = lenientJson.decodeFromString(RunEvaluationDto.serializer(), body) evaluationDto.evaluationSet shouldBe "golden" diff --git a/src/platform/wirespec/gateway.ws b/src/platform/wirespec/gateway.ws index 014dd6d3c..ced279b7f 100644 --- a/src/platform/wirespec/gateway.ws +++ b/src/platform/wirespec/gateway.ws @@ -247,7 +247,7 @@ endpoint GetRuns GET /gateway/runs ? { tags: String? } -> { 500 -> ServerErrorDto } -endpoint AddToEvaluationSet POST RunEvaluationDto /gateway/runs/{runId: String}/evaluation -> { +endpoint AddRunAnnotations POST RunEvaluationDto /gateway/runs/{runId: String}/annotations -> { 200 -> Unit 400 -> GatewayClientErrorDto 401 -> Unit