diff --git a/Appraisals b/Appraisals index f0232d34..272581d7 100644 --- a/Appraisals +++ b/Appraisals @@ -99,3 +99,12 @@ appraise "rails" do gem "activesupport", "~> 8.0" gem "railties", "~> 8.0" end + +# Rails engine testing for the eval server engine +appraise "rails-server" do + gem "actionpack", "~> 8.0" + gem "railties", "~> 8.0" + gem "activesupport", "~> 8.0" + gem "rack", "~> 3.0" + gem "rack-test", "~> 2.1" +end diff --git a/README.md b/README.md index f41cd421..07de9947 100644 --- a/README.md +++ b/README.md @@ -392,7 +392,11 @@ See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example. ### Dev Server -Run evaluations from the Braintrust web UI against code in your own application. Define evaluators, pass them to the dev server, and start serving: +Run evaluations from the Braintrust web UI against code in your own application. + +#### Run as a Rack app + +Define evaluators, pass them to the dev server, and start serving: ```ruby # eval_server.ru @@ -418,10 +422,21 @@ run Braintrust::Server::Rack.app( ) ``` +Add your Rack server to your Gemfile: + +```ruby +gem "rack" +gem "puma" # recommended +``` + +Then start the server: + ```bash bundle exec rackup eval_server.ru -p 8300 -o 0.0.0.0 ``` +See example: [server/eval.ru](./examples/server/eval.ru) + **Custom evaluators** Evaluators can also be defined as subclasses: @@ -438,6 +453,51 @@ class FoodClassifier < Braintrust::Eval::Evaluator end ``` +#### Run as a Rails engine + +Use the Rails engine when your evaluators live inside an existing Rails app and you want to mount the Braintrust eval server into that application. + +Define each evaluator in its own file, for example under `app/evaluators/`: + +```ruby +# app/evaluators/food_classifier.rb +class FoodClassifier < Braintrust::Eval::Evaluator + def task + ->(input:) { classify(input) } + end + + def scorers + [Braintrust::Scorer.new("exact_match") { |expected:, output:| output == expected ? 
1.0 : 0.0 }] + end +end +``` + +Then generate the Braintrust initializer: + +```bash +bin/rails generate braintrust:server +``` + +```ruby +# config/routes.rb +Rails.application.routes.draw do + mount Braintrust::Contrib::Rails::Server::Engine, at: "/braintrust" +end +``` + +The generator writes `config/initializers/braintrust_server.rb`, where you can review or customize the slug-to-evaluator mapping it discovers from `app/evaluators/**/*.rb` and `evaluators/**/*.rb`. + +See example: [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb) + +**Developing locally** + +If you want to skip authentication on incoming eval requests while developing locally: + +- **For Rack**: Pass `auth: :none` to `Braintrust::Server::Rack.app(...)` +- **For Rails**: Set `config.auth = :none` in `config/initializers/braintrust_server.rb` + +*NOTE: Setting `:none` disables authentication on incoming requests into your server; executing evals requires a `BRAINTRUST_API_KEY` to fetch resources.* + **Supported web servers** The dev server requires the `rack` gem and a Rack-compatible web server. @@ -449,14 +509,7 @@ The dev server requires the `rack` gem and a Rack-compatible web server. | [Passenger](https://www.phusionpassenger.com/) | 6.x | | | [WEBrick](https://github.com/ruby/webrick) | Not supported | Does not support server-sent events. | -Add your chosen server to your Gemfile: - -```ruby -gem "rack" -gem "puma" # recommended -``` - -See example: [server/eval.ru](./examples/server/eval.ru) +See examples: [server/eval.ru](./examples/server/eval.ru), [contrib/rails/eval.rb](./examples/contrib/rails/eval.rb) ## Documentation diff --git a/examples/README.md b/examples/README.md index 87d5d04d..6c1f6f24 100644 --- a/examples/README.md +++ b/examples/README.md @@ -33,6 +33,7 @@ BRAINTRUST_DEBUG=true ruby examples/login/login_basic.rb ### Dev Server Examples - **`server/eval.ru`**: Set up a dev server for remote evals — define evaluators (subclass or inline) and serve them via a Rack app.
Start with: `bundle exec appraisal server rackup examples/server/eval.ru -p 8300 -o 0.0.0.0` +- **`contrib/rails/eval.rb`**: Mount the dev server as a Rails engine, define evaluator classes under `app/evaluators/`, and generate `config/initializers/braintrust_server.rb` with `bin/rails generate braintrust:server` ## Coming Soon diff --git a/examples/contrib/rails/eval.rb b/examples/contrib/rails/eval.rb new file mode 100644 index 00000000..b6884c09 --- /dev/null +++ b/examples/contrib/rails/eval.rb @@ -0,0 +1,57 @@ +# frozen_string_literal: true + +# Braintrust Rails Engine — mount example +# +# This file shows one conventional setup for the Braintrust eval server in Rails: +# 1. Define evaluator classes under app/evaluators/ +# 2. Generate the initializer with: +# bin/rails generate braintrust:server +# 3. Mount the engine in config/routes.rb +# +# Requirements: +# gem 'actionpack', '~> 8.0' +# gem 'railties', '~> 8.0' +# gem 'activesupport', '~> 8.0' + +# --------------------------------------------------------------------------- +# app/evaluators/my_classifier.rb +# --------------------------------------------------------------------------- + +class MyClassifier < Braintrust::Eval::Evaluator + def task + ->(input:) { classify(input) } + end + + def scorers + [Braintrust::Scorer.new("accuracy") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }] + end +end + +# --------------------------------------------------------------------------- +# config/initializers/braintrust_server.rb +# --------------------------------------------------------------------------- + +# Generated by: bin/rails generate braintrust:server +# +# require "braintrust/contrib/rails/server" +# +# Braintrust::Contrib::Rails::Server::Engine.configure do |config| +# config.evaluators = { +# "my-classifier" => MyClassifier.new +# } +# +# # Default is :clerk_token. Use :none only for local development without +# # incoming request authentication. 
Outgoing Braintrust API calls still need +# # normal Braintrust credentials. +# config.auth = :clerk_token +# end + +# --------------------------------------------------------------------------- +# config/routes.rb +# --------------------------------------------------------------------------- + +# Rails.application.routes.draw do +# mount Braintrust::Contrib::Rails::Server::Engine, at: "/braintrust" +# end + +puts "Braintrust Rails Engine example — see comments for usage" diff --git a/gemfiles/rails_server.gemfile b/gemfiles/rails_server.gemfile new file mode 100644 index 00000000..0ab0584a --- /dev/null +++ b/gemfiles/rails_server.gemfile @@ -0,0 +1,14 @@ +# This file was generated by Appraisal + +source "https://rubygems.org" + +gem "minitest-reporters", "~> 1.6" +gem "minitest-stub-const", "~> 0.6" +gem "climate_control", "~> 1.2" +gem "actionpack", "~> 8.0" +gem "railties", "~> 8.0" +gem "activesupport", "~> 8.0" +gem "rack", "~> 3.0" +gem "rack-test", "~> 2.1" + +gemspec path: "../" diff --git a/lib/braintrust/contrib/rails/server.rb b/lib/braintrust/contrib/rails/server.rb new file mode 100644 index 00000000..558852c5 --- /dev/null +++ b/lib/braintrust/contrib/rails/server.rb @@ -0,0 +1,20 @@ +# frozen_string_literal: true + +begin + require "action_controller" + require "rails/engine" +rescue LoadError + raise LoadError, + "Rails (actionpack + railties) is required for the Braintrust Rails server engine. " \ + "Add `gem 'rails'` or `gem 'actionpack'` and `gem 'railties'` to your Gemfile." 
+end + +require "json" +require_relative "../../eval" +require_relative "../../server/sse" +require_relative "../../server/auth/no_auth" +require_relative "../../server/auth/clerk_token" +require_relative "../../server/middleware/cors" +require_relative "../../server/services/list_service" +require_relative "../../server/services/eval_service" +require_relative "server/engine" diff --git a/lib/braintrust/contrib/rails/server/application_controller.rb b/lib/braintrust/contrib/rails/server/application_controller.rb new file mode 100644 index 00000000..74da4411 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/application_controller.rb @@ -0,0 +1,34 @@ +# frozen_string_literal: true + +module Braintrust + module Contrib + module Rails + module Server + class ApplicationController < ActionController::API + before_action :authenticate! + + private + + def authenticate! + auth_result = Engine.auth_strategy.authenticate(request.env) + unless auth_result + render json: {"error" => "Unauthorized"}, status: :unauthorized + return + end + + request.env["braintrust.auth"] = auth_result + @braintrust_auth = auth_result + end + + def parse_json_body + body = request.body.read + return nil if body.nil? || body.empty? + JSON.parse(body) + rescue JSON::ParserError + nil + end + end + end + end + end +end diff --git a/lib/braintrust/contrib/rails/server/engine.rb b/lib/braintrust/contrib/rails/server/engine.rb new file mode 100644 index 00000000..28905503 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/engine.rb @@ -0,0 +1,72 @@ +# frozen_string_literal: true + +module Braintrust + module Contrib + module Rails + module Server + class Engine < ::Rails::Engine + isolate_namespace Braintrust::Contrib::Rails::Server + + config.evaluators = {} + config.auth = :clerk_token + + # Register the engine's routes file so Rails loads it during initialization. 
+ paths["config/routes.rb"] << File.expand_path("routes.rb", __dir__) + + initializer "braintrust.server.cors" do |app| + app.middleware.use Braintrust::Server::Middleware::Cors + end + + # Class-level helpers that read from engine config. + + def self.evaluators + config.evaluators + end + + def self.auth_strategy + resolve_auth(config.auth) + end + + def self.list_service + Braintrust::Server::Services::List.new(-> { config.evaluators }) + end + + # Long-lived so the state cache persists across requests. + def self.eval_service + @eval_service ||= Braintrust::Server::Services::Eval.new(-> { config.evaluators }) + end + + # Support the explicit `|config|` style used by this integration while + # still delegating zero-arity DSL blocks to Rails' native implementation. + def self.configure(&block) + return super if block&.arity == 0 + yield config if block + end + + def self.resolve_auth(auth) + case auth + when :none + Braintrust::Server::Auth::NoAuth.new + when :clerk_token + Braintrust::Server::Auth::ClerkToken.new + when Symbol, String + raise ArgumentError, "Unknown auth strategy #{auth.inspect}. Expected :none, :clerk_token, or an auth object." 
+ else + auth + end + end + private_class_method :resolve_auth + + generators do + require "braintrust/contrib/rails/server/generator" + end + end + end + end + end +end + +require_relative "application_controller" +require_relative "health_controller" +require_relative "list_controller" +require_relative "eval_controller" diff --git a/lib/braintrust/contrib/rails/server/eval_controller.rb b/lib/braintrust/contrib/rails/server/eval_controller.rb new file mode 100644 index 00000000..ccd4c380 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/eval_controller.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +module Braintrust + module Contrib + module Rails + module Server + class EvalController < ApplicationController + include ActionController::Live + + def create + body = parse_json_body + unless body + render json: {"error" => "Invalid JSON body"}, status: :bad_request + return + end + + result = Engine.eval_service.validate(body) + if result[:error] + render json: {"error" => result[:error]}, status: result[:status] + return + end + + response.headers["Content-Type"] = "text/event-stream" + response.headers["Cache-Control"] = "no-cache" + response.headers["Connection"] = "keep-alive" + + sse = Braintrust::Server::SSEWriter.new { |chunk| response.stream.write(chunk) } + Engine.eval_service.stream(result, auth: @braintrust_auth, sse: sse) + ensure + response.stream.close + end + end + end + end + end +end diff --git a/lib/braintrust/contrib/rails/server/generator.rb b/lib/braintrust/contrib/rails/server/generator.rb new file mode 100644 index 00000000..37801eb3 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/generator.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require "rails/generators" + +module Braintrust + module Contrib + module Rails + module Server + module Generators + class ServerGenerator < ::Rails::Generators::Base + namespace "braintrust:server" + source_root File.expand_path("templates", __dir__) + + def create_initializer + 
@evaluators = discovered_evaluators + template "initializer.rb.tt", "config/initializers/braintrust_server.rb" + end + + private + + def discovered_evaluators + evaluator_roots.flat_map do |root| + Dir[File.join(destination_root, root, "**/*.rb")].sort.map do |file| + relative_path = file.delete_prefix("#{File.join(destination_root, root)}/").sub(/\.rb\z/, "") + { + class_name: relative_path.split("/").map(&:camelize).join("::"), + slug: relative_path.tr("/", "-").tr("_", "-") + } + end + end + end + + def evaluator_roots + %w[app/evaluators evaluators].select do |root| + Dir.exist?(File.join(destination_root, root)) + end + end + end + end + end + end + end +end diff --git a/lib/braintrust/contrib/rails/server/health_controller.rb b/lib/braintrust/contrib/rails/server/health_controller.rb new file mode 100644 index 00000000..361f8433 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/health_controller.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +module Braintrust + module Contrib + module Rails + module Server + class HealthController < ApplicationController + def show + render json: {"status" => "ok"} + end + end + end + end + end +end diff --git a/lib/braintrust/contrib/rails/server/list_controller.rb b/lib/braintrust/contrib/rails/server/list_controller.rb new file mode 100644 index 00000000..1b661151 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/list_controller.rb @@ -0,0 +1,16 @@ +# frozen_string_literal: true + +module Braintrust + module Contrib + module Rails + module Server + class ListController < ApplicationController + def show + result = Engine.list_service.call + render json: result + end + end + end + end + end +end diff --git a/lib/braintrust/contrib/rails/server/routes.rb b/lib/braintrust/contrib/rails/server/routes.rb new file mode 100644 index 00000000..5e3043cd --- /dev/null +++ b/lib/braintrust/contrib/rails/server/routes.rb @@ -0,0 +1,8 @@ +# frozen_string_literal: true + 
+Braintrust::Contrib::Rails::Server::Engine.routes.draw do + get "/", to: "health#show" + get "/list", to: "list#show" + post "/list", to: "list#show" + post "/eval", to: "eval#create" +end diff --git a/lib/braintrust/contrib/rails/server/templates/initializer.rb.tt b/lib/braintrust/contrib/rails/server/templates/initializer.rb.tt new file mode 100644 index 00000000..afe292e5 --- /dev/null +++ b/lib/braintrust/contrib/rails/server/templates/initializer.rb.tt @@ -0,0 +1,19 @@ +require "braintrust/contrib/rails/server" + +Braintrust::Contrib::Rails::Server::Engine.configure do |config| + config.evaluators = { +<% if @evaluators.empty? -%> + # Add evaluator instances here, for example: + # "food-classifier" => FoodClassifier.new +<% else -%> +<% @evaluators.each_with_index do |evaluator, index| -%> + "<%= evaluator[:slug] %>" => <%= evaluator[:class_name] %>.new<%= "," unless index == @evaluators.length - 1 %> +<% end -%> +<% end -%> + } + + # Default is :clerk_token. Use :none only when developing locally without + # incoming request authentication; outgoing Braintrust API calls still need + # a valid Braintrust API key. 
+ config.auth = :clerk_token +end diff --git a/lib/braintrust/server/handlers/eval.rb b/lib/braintrust/server/handlers/eval.rb index 4283afcf..ec866808 100644 --- a/lib/braintrust/server/handlers/eval.rb +++ b/lib/braintrust/server/handlers/eval.rb @@ -10,38 +10,15 @@ module Handlers class Eval def initialize(evaluators) @evaluators = evaluators + @service = Services::Eval.new(evaluators) end def call(env) body = parse_body(env) return error_response(400, "Invalid JSON body") unless body - name = body["name"] - return error_response(400, "Missing required field: name") unless name - - evaluator = @evaluators[name] - return error_response(404, "Evaluator '#{name}' not found") unless evaluator - - data = body["data"] - return error_response(400, "Missing required field: data") unless data - - # Validate exactly one data source - data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) } - return error_response(400, "Exactly one data source required") if data_sources != 1 - - experiment_name = body["experiment_name"] - - # Resolve data source - cases, dataset = resolve_data_source(data) - - # Resolve remote scorers from request - remote_scorer_ids = resolve_remote_scorers(body["scores"]) - - # Resolve parent span context - parent = resolve_parent(body["parent"]) - - # Build state from auth context (if present) - state = build_state(env) + result = @service.validate(body) + return error_response(result[:status], result[:error]) if result[:error] # The protocol-rack adapter (used by Falcon and any server built on # protocol-http) buffers `each`-based bodies through an Enumerable path. @@ -50,64 +27,7 @@ def call(env) body_class = env.key?("protocol.http.request") ? SSEStreamBody : SSEBody sse_body = body_class.new do |sse| - # Only pass project/experiment params when state is available - run_opts = { - on_progress: ->(progress_data) { - # Build remote eval protocol events from generic progress data. 
- # Runner provides: id, data/error, scores (optional), origin (optional). - # Protocol requires: id, object_type, origin, name, format, output_type, event, data. - base = { - "object_type" => "task", - "name" => name, - "format" => "code", - "output_type" => "completion" - } - base["id"] = progress_data["id"] if progress_data["id"] - base["origin"] = progress_data["origin"] if progress_data["origin"] - - if progress_data.key?("error") - sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"]))) - else - sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"])))) - end - - # Signal per-cell completion so the UI exits "Streaming..." state - # and updates the progress bar immediately. - sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => ""))) - }, - quiet: true - } - run_opts[:parent] = parent if parent - run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids - run_opts[:dataset] = dataset if dataset - - if state - run_opts[:state] = state - run_opts[:experiment] = experiment_name if experiment_name - run_opts[:project_id] = body["project_id"] if body["project_id"] - end - - result = evaluator.run(cases, **run_opts) - - # Flush buffered OTLP spans before sending completion events. - # The BatchSpanProcessor exports every ~5s; fast evals can finish - # before a single export fires, causing the UI to see no results. 
- Braintrust::Trace.flush_spans - - # Build summary from result scores - averaged_scores = {} - result.scorer_stats.each do |scorer_name, stats| - averaged_scores[scorer_name] = stats.score_mean - end - - sse.event("summary", JSON.dump({ - "scores" => averaged_scores, - "experiment_name" => experiment_name, - "experiment_id" => result.experiment_id, - "project_id" => result.project_id - })) - - sse.event("done", "") + @service.stream(result, auth: env["braintrust.auth"], sse: sse) end [200, {"content-type" => "text/event-stream", "cache-control" => "no-cache", "connection" => "keep-alive"}, sse_body] @@ -115,90 +35,6 @@ def call(env) private - # Resolve data source from the data field. - # Returns [cases, dataset] where exactly one is non-nil. - def resolve_data_source(data) - if data.key?("data") - cases = data["data"].map do |d| - {input: d["input"], expected: d["expected"]} - end - [cases, nil] - elsif data.key?("dataset_id") - [nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])] - elsif data.key?("dataset_name") - dataset_opts = {name: data["dataset_name"]} - dataset_opts[:project] = data["project_name"] if data["project_name"] - [nil, dataset_opts] - else - [nil, nil] - end - end - - # Map request scores array to Scorer::ID structs. - # The UI sends function_id as a nested object: {"function_id": "uuid"}. - def resolve_remote_scorers(scores) - return nil if scores.nil? || scores.empty? - scores.map do |s| - func_id = s["function_id"] - func_id = func_id["function_id"] if func_id.is_a?(Hash) - Braintrust::Scorer::ID.new( - function_id: func_id, - version: s["version"] - ) - end - end - - # Map request parent to symbol-keyed Hash. - # Hardcode playground_id to match Java SDK behavior. - # Also extracts generation from propagated_event for span_attributes. 
- def resolve_parent(parent) - return nil unless parent.is_a?(Hash) - object_id = parent["object_id"] - return nil unless object_id - - generation = parent.dig("propagated_event", "span_attributes", "generation") - - result = {object_type: "playground_id", object_id: object_id} - result[:generation] = generation if generation - result - end - - # Build State from auth context set by Auth middleware. - # Returns nil when no auth context is present (e.g. NoAuth strategy). - # Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name]. - def build_state(env) - auth = env["braintrust.auth"] - return nil unless auth.is_a?(Hash) - - cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]] - - @state_mutex ||= Mutex.new - @state_cache ||= {} - - @state_mutex.synchronize do - cached = @state_cache[cache_key] - return cached if cached - - state = Braintrust::State.new( - api_key: auth["api_key"], - org_id: auth["org_id"], - org_name: auth["org_name"], - app_url: auth["app_url"], - api_url: auth["api_url"], - enable_tracing: false - ) - - # Evict oldest entry if cache is full - if @state_cache.size >= 64 - oldest_key = @state_cache.keys.first - @state_cache.delete(oldest_key) - end - - @state_cache[cache_key] = state - state - end - end - def parse_body(env) body = env["rack.input"]&.read return nil if body.nil? || body.empty? 
@@ -211,6 +47,10 @@ def error_response(status, message) [status, {"content-type" => "application/json"}, [JSON.dump({"error" => message})]] end + + def build_state(env) + @service.build_state(env["braintrust.auth"]) + end end end end diff --git a/lib/braintrust/server/handlers/list.rb b/lib/braintrust/server/handlers/list.rb index 0a908116..2e58d893 100644 --- a/lib/braintrust/server/handlers/list.rb +++ b/lib/braintrust/server/handlers/list.rb @@ -23,50 +23,12 @@ module Handlers class List def initialize(evaluators) @evaluators = evaluators + @service = Services::List.new(evaluators) end def call(_env) - result = {} - @evaluators.each do |name, evaluator| - scores = (evaluator.scorers || []).each_with_index.map do |scorer, i| - scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}" - {"name" => scorer_name} - end - entry = {"scores" => scores} - params = serialize_parameters(evaluator.parameters) - entry["parameters"] = params if params - result[name] = entry - end - - [200, {"content-type" => "application/json"}, - [JSON.dump(result)]] - end - - private - - # Convert user-defined parameters to the dev server protocol format. - # Wraps in a staticParameters container with "data" typed entries. - def serialize_parameters(parameters) - return nil unless parameters && !parameters.empty? 
- - schema = {} - parameters.each do |name, spec| - spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash) - if spec.is_a?(Hash) - schema[name.to_s] = { - "type" => "data", - "schema" => {"type" => spec["type"] || "string"}, - "default" => spec["default"], - "description" => spec["description"] - } - end - end - - { - "type" => "braintrust.staticParameters", - "schema" => schema, - "source" => nil - } + result = @service.call + [200, {"content-type" => "application/json"}, [JSON.dump(result)]] end end end diff --git a/lib/braintrust/server/rack.rb b/lib/braintrust/server/rack.rb index d397d75d..154d8443 100644 --- a/lib/braintrust/server/rack.rb +++ b/lib/braintrust/server/rack.rb @@ -15,6 +15,8 @@ require_relative "auth/clerk_token" require_relative "middleware/cors" require_relative "middleware/auth" +require_relative "services/list_service" +require_relative "services/eval_service" require_relative "handlers/health" require_relative "handlers/list" require_relative "handlers/eval" diff --git a/lib/braintrust/server/services/eval_service.rb b/lib/braintrust/server/services/eval_service.rb new file mode 100644 index 00000000..406d7c80 --- /dev/null +++ b/lib/braintrust/server/services/eval_service.rb @@ -0,0 +1,214 @@ +# frozen_string_literal: true + +require "json" + +module Braintrust + module Server + module Services + # Framework-agnostic service for running evaluations and streaming SSE results. + # Must be long-lived (not per-request) to preserve the @state_cache across requests. + class Eval + def initialize(evaluators) + @evaluators = evaluators + @state_mutex = Mutex.new + @state_cache = {} + end + + # Validates request body. 
Returns: + # {error: String, status: Integer} on failure + # {evaluator:, name:, cases:, dataset:, ...} on success + def validate(body) + name = body["name"] + return {error: "Missing required field: name", status: 400} unless name + + evaluator = current_evaluators[name] + return {error: "Evaluator '#{name}' not found", status: 404} unless evaluator + + data = body["data"] + return {error: "Missing required field: data", status: 400} unless data + + data_sources = ["data", "dataset_name", "dataset_id"].count { |k| data.key?(k) } + return {error: "Exactly one data source required", status: 400} if data_sources != 1 + + cases, dataset = resolve_data_source(data) + + { + evaluator: evaluator, + name: name, + cases: cases, + dataset: dataset, + experiment_name: body["experiment_name"], + remote_scorer_ids: resolve_remote_scorers(body["scores"]), + parent: resolve_parent(body["parent"]), + project_id: body["project_id"] + } + end + + # Runs the validated eval and streams SSE events via the sse writer. + # +validated+ is the hash returned by #validate. + # +auth+ is the auth context hash (or nil/true for no-auth). + # +sse+ is an SSEWriter instance. + def stream(validated, auth:, sse:) + name = validated[:name] + evaluator = validated[:evaluator] + cases = validated[:cases] + dataset = validated[:dataset] + experiment_name = validated[:experiment_name] + remote_scorer_ids = validated[:remote_scorer_ids] + parent = validated[:parent] + project_id = validated[:project_id] + + state = build_state(auth) + + # Only pass project/experiment params when state is available + run_opts = { + on_progress: ->(progress_data) { + # Build remote eval protocol events from generic progress data. + # Runner provides: id, data/error, scores (optional), origin (optional). + # Protocol requires: id, object_type, origin, name, format, output_type, event, data. 
+ base = { + "object_type" => "task", + "name" => name, + "format" => "code", + "output_type" => "completion" + } + base["id"] = progress_data["id"] if progress_data["id"] + base["origin"] = progress_data["origin"] if progress_data["origin"] + + if progress_data.key?("error") + sse.event("progress", JSON.dump(base.merge("event" => "error", "data" => progress_data["error"]))) + else + sse.event("progress", JSON.dump(base.merge("event" => "json_delta", "data" => JSON.dump(progress_data["data"])))) + end + + # Signal per-cell completion so the UI exits "Streaming..." state + # and updates the progress bar immediately. + sse.event("progress", JSON.dump(base.merge("event" => "done", "data" => ""))) + }, + quiet: true + } + run_opts[:parent] = parent if parent + run_opts[:scorers] = remote_scorer_ids if remote_scorer_ids + run_opts[:dataset] = dataset if dataset + + if state + run_opts[:state] = state + run_opts[:experiment] = experiment_name if experiment_name + run_opts[:project_id] = project_id if project_id + end + + result = evaluator.run(cases, **run_opts) + + # Flush buffered OTLP spans before sending completion events. + # The BatchSpanProcessor exports every ~5s; fast evals can finish + # before a single export fires, causing the UI to see no results. + Braintrust::Trace.flush_spans + + # Build summary from result scores + averaged_scores = {} + result.scorer_stats.each do |scorer_name, stats| + averaged_scores[scorer_name] = stats.score_mean + end + + sse.event("summary", JSON.dump({ + "scores" => averaged_scores, + "experiment_name" => experiment_name, + "experiment_id" => result.experiment_id, + "project_id" => result.project_id + })) + + sse.event("done", "") + end + + # Build State from auth context hash. + # Returns nil when auth is not a Hash (e.g. NoAuth returns true). + # Uses an LRU-style cache (max 64 entries) keyed by [api_key, app_url, org_name]. 
+ def build_state(auth) + return nil unless auth.is_a?(Hash) + + cache_key = [auth["api_key"], auth["app_url"], auth["org_name"]] + + @state_mutex ||= Mutex.new + @state_cache ||= {} + + @state_mutex.synchronize do + cached = @state_cache[cache_key] + return cached if cached + + state = Braintrust::State.new( + api_key: auth["api_key"], + org_id: auth["org_id"], + org_name: auth["org_name"], + app_url: auth["app_url"], + api_url: auth["api_url"], + enable_tracing: false + ) + + if @state_cache.size >= 64 + oldest_key = @state_cache.keys.first + @state_cache.delete(oldest_key) + end + + @state_cache[cache_key] = state + state + end + end + + private + + def current_evaluators + return @evaluators.call if @evaluators.respond_to?(:call) + @evaluators + end + + # Resolve data source from the data field. + # Returns [cases, dataset] where exactly one is non-nil. + def resolve_data_source(data) + if data.key?("data") + cases = data["data"].map do |d| + {input: d["input"], expected: d["expected"]} + end + [cases, nil] + elsif data.key?("dataset_id") + [nil, Braintrust::Dataset::ID.new(id: data["dataset_id"])] + elsif data.key?("dataset_name") + dataset_opts = {name: data["dataset_name"]} + dataset_opts[:project] = data["project_name"] if data["project_name"] + [nil, dataset_opts] + else + [nil, nil] + end + end + + # Map request scores array to Scorer::ID structs. + # The UI sends function_id as a nested object: {"function_id": "uuid"}. + def resolve_remote_scorers(scores) + return nil if scores.nil? || scores.empty? + scores.map do |s| + func_id = s["function_id"] + func_id = func_id["function_id"] if func_id.is_a?(Hash) + Braintrust::Scorer::ID.new( + function_id: func_id, + version: s["version"] + ) + end + end + + # Map request parent to symbol-keyed Hash. + # Hardcode playground_id to match Java SDK behavior. + # Also extracts generation from propagated_event for span_attributes. 
+ def resolve_parent(parent) + return nil unless parent.is_a?(Hash) + object_id = parent["object_id"] + return nil unless object_id + + generation = parent.dig("propagated_event", "span_attributes", "generation") + + result = {object_type: "playground_id", object_id: object_id} + result[:generation] = generation if generation + result + end + end + end + end +end diff --git a/lib/braintrust/server/services/list_service.rb b/lib/braintrust/server/services/list_service.rb new file mode 100644 index 00000000..06bd7add --- /dev/null +++ b/lib/braintrust/server/services/list_service.rb @@ -0,0 +1,64 @@ +# frozen_string_literal: true + +require "json" + +module Braintrust + module Server + module Services + # Framework-agnostic service for listing evaluators. + # Returns a plain Hash (not a Rack triplet) suitable for JSON.dump. + class List + def initialize(evaluators) + @evaluators = evaluators + end + + def call + result = {} + current_evaluators.each do |name, evaluator| + scores = (evaluator.scorers || []).each_with_index.map do |scorer, i| + scorer_name = scorer.respond_to?(:name) ? scorer.name : "score_#{i}" + {"name" => scorer_name} + end + entry = {"scores" => scores} + params = serialize_parameters(evaluator.parameters) + entry["parameters"] = params if params + result[name] = entry + end + result + end + + private + + def current_evaluators + return @evaluators.call if @evaluators.respond_to?(:call) + @evaluators + end + + # Convert user-defined parameters to the dev server protocol format. + # Wraps in a staticParameters container with "data" typed entries. + def serialize_parameters(parameters) + return nil unless parameters && !parameters.empty? 
+ + schema = {} + parameters.each do |name, spec| + spec = spec.transform_keys(&:to_s) if spec.is_a?(Hash) + if spec.is_a?(Hash) + schema[name.to_s] = { + "type" => "data", + "schema" => {"type" => spec["type"] || "string"}, + "default" => spec["default"], + "description" => spec["description"] + } + end + end + + { + "type" => "braintrust.staticParameters", + "schema" => schema, + "source" => nil + } + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/rails_server_helper.rb b/test/braintrust/contrib/rails/rails_server_helper.rb new file mode 100644 index 00000000..951ea513 --- /dev/null +++ b/test/braintrust/contrib/rails/rails_server_helper.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +# Try to load Rails engine dependencies. +RAILS_SERVER_AVAILABLE = begin + require "rack/test" + require "action_controller" + require "action_dispatch" + require "rails" + require "braintrust/contrib/rails/server" + true +rescue LoadError + false +end + +if RAILS_SERVER_AVAILABLE + # Create a minimal Rails application for engine integration tests. + # Guard against being required multiple times. + unless defined?(BraintrustRailsTestApp) + class BraintrustRailsTestApp < Rails::Application + config.eager_load = false + config.secret_key_base = "braintrust-rails-test-secret-key-abc123456789" + config.logger = ::Logger.new(nil) + config.log_level = :fatal + + # Allow any host in tests (Rack::Test uses "example.org" by default) + config.hosts.clear + + routes.draw do + mount Braintrust::Contrib::Rails::Server::Engine, at: "/" + end + + initialize! + end + end +end + +module Braintrust + module Contrib + module Rails + module ServerHelper + def skip_unless_rails_server! + skip "Rails not available (run with: bundle exec appraisal rails-server rake test)" unless RAILS_SERVER_AVAILABLE + end + + # The engine itself as a Rack app — use for controller integration tests. + # Faster and more direct than routing through a full Rails application. 
+ def rails_engine_app + Braintrust::Contrib::Rails::Server::Engine + end + + # The full test Rails application (mounts the engine at /). + # Use only when you need to verify middleware stack or mounted routing. + def rails_app + BraintrustRailsTestApp + end + + def reset_engine!(evaluators: {}, auth: :none) + engine = Braintrust::Contrib::Rails::Server::Engine + engine.config.evaluators = evaluators + engine.config.auth = auth + # Clear the long-lived eval service so cached state does not leak across tests. + engine.instance_variable_set(:@eval_service, nil) + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/server/engine_test.rb b/test/braintrust/contrib/rails/server/engine_test.rb new file mode 100644 index 00000000..2456606d --- /dev/null +++ b/test/braintrust/contrib/rails/server/engine_test.rb @@ -0,0 +1,135 @@ +# frozen_string_literal: true + +require "test_helper" +require_relative "../rails_server_helper" + +module Braintrust + module Contrib + module Rails + module Server + class EngineTest < Minitest::Test + include Braintrust::Contrib::Rails::ServerHelper + + def setup + skip_unless_rails_server! + reset_engine! 
+ end + + def test_evaluators_returns_config_value + evaluator = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + Engine.config.evaluators = {"my-eval" => evaluator} + assert_same evaluator, Engine.evaluators["my-eval"] + end + + def test_auth_strategy_returns_no_auth_for_none + Engine.config.auth = :none + assert_instance_of Braintrust::Server::Auth::NoAuth, Engine.auth_strategy + end + + def test_auth_strategy_returns_clerk_token_by_default + Engine.config.auth = :clerk_token + assert_instance_of Braintrust::Server::Auth::ClerkToken, Engine.auth_strategy + end + + def test_auth_strategy_accepts_custom_object + custom = Braintrust::Server::Auth::NoAuth.new + Engine.config.auth = custom + assert_same custom, Engine.auth_strategy + end + + def test_auth_strategy_raises_for_unknown_symbol + Engine.config.auth = :jwt + assert_raises(ArgumentError) { Engine.auth_strategy } + end + + def test_auth_strategy_raises_for_unknown_string + Engine.config.auth = "jwt" + assert_raises(ArgumentError) { Engine.auth_strategy } + end + + def test_auth_strategy_reflects_config_changes_without_manual_reset + Engine.config.auth = :none + assert_instance_of Braintrust::Server::Auth::NoAuth, Engine.auth_strategy + + Engine.config.auth = :clerk_token + assert_instance_of Braintrust::Server::Auth::ClerkToken, Engine.auth_strategy + end + + def test_list_service_uses_latest_evaluators_without_manual_reset + first = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + second = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + + Engine.config.evaluators = {"first" => first} + assert_equal ["first"], Engine.list_service.call.keys + + Engine.config.evaluators = {"second" => second} + assert_equal ["second"], Engine.list_service.call.keys + end + + def test_eval_service_uses_latest_evaluators_without_manual_reset + first = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + second = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + payload = 
{"data" => {"data" => [{"input" => "hello"}]}} + + Engine.config.evaluators = {"first" => first} + service = Engine.eval_service + assert_same first, service.validate(payload.merge("name" => "first"))[:evaluator] + + Engine.config.evaluators = {"second" => second} + assert_same second, service.validate(payload.merge("name" => "second"))[:evaluator] + end + + def test_eval_service_returns_eval_instance + assert_instance_of Braintrust::Server::Services::Eval, Engine.eval_service + end + + def test_list_service_returns_list_instance + assert_instance_of Braintrust::Server::Services::List, Engine.list_service + end + + def test_eval_service_is_memoized + svc1 = Engine.eval_service + svc2 = Engine.eval_service + assert_same svc1, svc2 + end + + def test_configure_yields_config_without_resetting_eval_service + svc_before = Engine.eval_service + evaluator = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + payload = {"name" => "configured-eval", "data" => {"data" => [{"input" => "hello"}]}} + + Engine.configure do |config| + config.evaluators = {"configured-eval" => evaluator} + config.auth = :none + end + + assert_same evaluator, Engine.evaluators["configured-eval"] + assert_instance_of Braintrust::Server::Auth::NoAuth, Engine.auth_strategy + assert_same svc_before, Engine.eval_service + assert_same evaluator, Engine.eval_service.validate(payload)[:evaluator] + end + + def test_cors_middleware_is_in_middleware_stack + stack = BraintrustRailsTestApp.middleware + middleware_classes = stack.map { |m| + begin + m.klass + rescue + m + end + } + assert middleware_classes.any? { |klass| + klass == Braintrust::Server::Middleware::Cors + }, "CORS middleware should be in the stack" + end + + def test_engine_has_expected_routes + routes = Engine.routes.routes.map { |r| "#{r.verb} #{r.path.spec}" } + assert routes.any? { |r| r.include?("/list") }, "Should have /list route" + assert routes.any? 
{ |r| r.include?("/eval") }, "Should have /eval route" + end + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/server/eval_controller_test.rb b/test/braintrust/contrib/rails/server/eval_controller_test.rb new file mode 100644 index 00000000..8eaaa54e --- /dev/null +++ b/test/braintrust/contrib/rails/server/eval_controller_test.rb @@ -0,0 +1,172 @@ +# frozen_string_literal: true + +require "test_helper" +require_relative "../rails_server_helper" +require "json" + +module Braintrust + module Contrib + module Rails + module Server + class EvalControllerTest < Minitest::Test + include Braintrust::Contrib::Rails::ServerHelper + include ::Rack::Test::Methods if defined?(::Rack::Test::Methods) + + def setup + skip_unless_rails_server! + @evaluators = {} + @rig = setup_otel_test_rig + reset_engine!(evaluators: @evaluators, auth: :none) + end + + def app + rails_engine_app + end + + def test_streams_sse_events_for_inline_data + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + reset_engine!(evaluators: @evaluators, auth: :none) + + post_json "/eval", { + name: "upcase-eval", + data: { + data: [ + {input: "hello", expected: "HELLO"}, + {input: "world", expected: "WORLD"} + ] + }, + experiment_name: "test-experiment", + project_id: "proj-123" + } + + assert_equal 200, last_response.status + assert_match "text/event-stream", last_response.content_type + + events = parse_sse_events(last_response.body) + progress_events = events.select { |e| e[:event] == "progress" } + assert_equal 4, progress_events.length # 2 per case + + summary_events = events.select { |e| e[:event] == "summary" } + assert_equal 1, summary_events.length + + assert_equal "done", events.last[:event] + end + + def test_progress_events_contain_output + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + reset_engine!(evaluators: @evaluators, auth: :none) + + post_json "/eval", { + name: "upcase-eval", + data: {data: 
[{input: "hello", expected: "HELLO"}]}, + experiment_name: "test-experiment", + project_id: "proj-123" + } + + events = parse_sse_events(last_response.body) + progress = events.find { |e| e[:event] == "progress" } + data = JSON.parse(progress[:data]) + + assert_equal "HELLO", JSON.parse(data["data"]) + end + + def test_summary_event_contains_scores + scorer = Braintrust::Eval.scorer("exact") { |_i, e, o| (o == e) ? 1.0 : 0.0 } + @evaluators["scored-eval"] = test_evaluator( + task: ->(input) { input.to_s.upcase }, + scorers: [scorer] + ) + reset_engine!(evaluators: @evaluators, auth: :none) + + post_json "/eval", { + name: "scored-eval", + data: {data: [{input: "hello", expected: "HELLO"}]}, + experiment_name: "test-experiment", + project_id: "proj-123" + } + + events = parse_sse_events(last_response.body) + summary = events.find { |e| e[:event] == "summary" } + data = JSON.parse(summary[:data]) + + assert data.key?("scores") + assert data.key?("experiment_name") + end + + def test_error_still_emits_progress_and_done + @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "task exploded" }) + reset_engine!(evaluators: @evaluators, auth: :none) + + post_json "/eval", { + name: "failing-eval", + data: {data: [{input: "hello"}]}, + experiment_name: "test-experiment", + project_id: "proj-123" + } + + events = parse_sse_events(last_response.body) + assert events.any? 
{ |e| e[:event] == "progress" || e[:event] == "error" } + assert_equal "done", events.last[:event] + end + + def test_404_for_unknown_evaluator + post_json "/eval", { + name: "nonexistent", + data: {data: [{input: "hello"}]}, + experiment_name: "test-experiment", + project_id: "proj-123" + } + + assert_equal 404, last_response.status + body = JSON.parse(last_response.body) + assert_match(/not found/i, body["error"]) + end + + def test_400_for_missing_name + post_json "/eval", { + data: {data: [{input: "hello"}]} + } + + assert_equal 400, last_response.status + end + + def test_400_for_missing_data + @evaluators["test-eval"] = test_evaluator(task: ->(input) { input }) + reset_engine!(evaluators: @evaluators, auth: :none) + + post_json "/eval", {name: "test-eval"} + + assert_equal 400, last_response.status + end + + def test_400_for_invalid_json_body + post "/eval", "not-json", {"CONTENT_TYPE" => "application/json"} + + assert_equal 400, last_response.status + end + + def test_returns_401_when_auth_fails + reset_engine!(evaluators: @evaluators, auth: :clerk_token) + + post_json "/eval", { + name: "test-eval", + data: {data: [{input: "hello"}]} + } + + assert_equal 401, last_response.status + end + + private + + def test_evaluator(**kwargs) + Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) + end + + def post_json(path, body) + post path, JSON.generate(body), {"CONTENT_TYPE" => "application/json"} + end + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/server/generator_test.rb b/test/braintrust/contrib/rails/server/generator_test.rb new file mode 100644 index 00000000..bedd2598 --- /dev/null +++ b/test/braintrust/contrib/rails/server/generator_test.rb @@ -0,0 +1,56 @@ +# frozen_string_literal: true + +require "test_helper" +require_relative "../rails_server_helper" + +if RAILS_SERVER_AVAILABLE + require "rails/generators/test_case" + require "braintrust/contrib/rails/server/generator" + + module 
Braintrust + module Contrib + module Rails + module Server + class GeneratorTest < ::Rails::Generators::TestCase + tests ::Braintrust::Contrib::Rails::Server::Generators::ServerGenerator + destination File.expand_path("../../../../tmp/server_generator", __dir__) + setup :prepare_destination + + def test_generates_initializer_from_app_evaluators + FileUtils.mkdir_p(File.join(destination_root, "app/evaluators")) + File.write( + File.join(destination_root, "app/evaluators/food_classifier.rb"), + <<~RUBY + class FoodClassifier < Braintrust::Eval::Evaluator + end + RUBY + ) + + run_generator + + assert_file "config/initializers/braintrust_server.rb" do |contents| + assert_includes contents, "require \"braintrust/contrib/rails/server\"" + assert_includes contents, "Braintrust::Contrib::Rails::Server::Engine.configure" + assert_includes contents, "\"food-classifier\" => FoodClassifier.new" + end + end + end + end + end + end + end +else + module Braintrust + module Contrib + module Rails + module Server + class GeneratorTest < Minitest::Test + def test_skips_without_rails + skip "Rails not available (run with: bundle exec appraisal rails-server rake test)" + end + end + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/server/health_controller_test.rb b/test/braintrust/contrib/rails/server/health_controller_test.rb new file mode 100644 index 00000000..192bbfc5 --- /dev/null +++ b/test/braintrust/contrib/rails/server/health_controller_test.rb @@ -0,0 +1,43 @@ +# frozen_string_literal: true + +require "test_helper" +require_relative "../rails_server_helper" +require "json" + +module Braintrust + module Contrib + module Rails + module Server + class HealthControllerTest < Minitest::Test + include Braintrust::Contrib::Rails::ServerHelper + include ::Rack::Test::Methods if defined?(::Rack::Test::Methods) + + def setup + skip_unless_rails_server! 
+ reset_engine!(auth: :none) + end + + def app + rails_engine_app + end + + def test_get_root_returns_200 + get "/" + assert_equal 200, last_response.status + end + + def test_get_root_returns_json_content_type + get "/" + assert_match "application/json", last_response.content_type + end + + def test_get_root_returns_status_ok + get "/" + body = JSON.parse(last_response.body) + assert_equal "ok", body["status"] + end + end + end + end + end +end diff --git a/test/braintrust/contrib/rails/server/list_controller_test.rb b/test/braintrust/contrib/rails/server/list_controller_test.rb new file mode 100644 index 00000000..15f3fa0f --- /dev/null +++ b/test/braintrust/contrib/rails/server/list_controller_test.rb @@ -0,0 +1,103 @@ +# frozen_string_literal: true + +require "test_helper" +require_relative "../rails_server_helper" +require "json" + +module Braintrust + module Contrib + module Rails + module Server + class ListControllerTest < Minitest::Test + include Braintrust::Contrib::Rails::ServerHelper + include ::Rack::Test::Methods if defined?(::Rack::Test::Methods) + + def setup + skip_unless_rails_server! 
+ @evaluators = {} + reset_engine!(evaluators: @evaluators, auth: :none) + end + + def app + rails_engine_app + end + + def test_get_list_returns_200 + get "/list" + assert_equal 200, last_response.status + end + + def test_post_list_returns_200 + post "/list" + assert_equal 200, last_response.status + end + + def test_returns_empty_hash_when_no_evaluators + get "/list" + body = JSON.parse(last_response.body) + assert_equal({}, body) + end + + def test_returns_evaluators_keyed_by_name + @evaluators["food-classifier"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + @evaluators["text-summarizer"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + reset_engine!(evaluators: @evaluators, auth: :none) + + get "/list" + body = JSON.parse(last_response.body) + assert body.key?("food-classifier") + assert body.key?("text-summarizer") + end + + def test_includes_scorer_names + @evaluators["scored-eval"] = Braintrust::Eval::Evaluator.new( + task: ->(input) { input }, + scorers: [ + Braintrust::Eval.scorer("exact_match") { |_i, e, o| (o == e) ? 
1.0 : 0.0 }, + Braintrust::Eval.scorer("length_check") { |_i, _e, _o| 1.0 } + ] + ) + reset_engine!(evaluators: @evaluators, auth: :none) + + get "/list" + body = JSON.parse(last_response.body) + score_names = body["scored-eval"]["scores"].map { |s| s["name"] } + assert_equal ["exact_match", "length_check"], score_names + end + + def test_includes_parameters_in_static_container + @evaluators["param-eval"] = Braintrust::Eval::Evaluator.new( + task: ->(input) { input }, + parameters: {"temperature" => {type: "number", default: 0.7, description: "LLM temperature"}} + ) + reset_engine!(evaluators: @evaluators, auth: :none) + + get "/list" + body = JSON.parse(last_response.body) + params = body["param-eval"]["parameters"] + assert_equal "braintrust.staticParameters", params["type"] + assert_equal 0.7, params["schema"]["temperature"]["default"] + end + + def test_omits_parameters_when_none_defined + @evaluators["no-params"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + reset_engine!(evaluators: @evaluators, auth: :none) + + get "/list" + body = JSON.parse(last_response.body) + refute body["no-params"].key?("parameters") + end + + def test_returns_401_when_auth_fails + # Use clerk_token auth — no auth header means failure + reset_engine!(evaluators: @evaluators, auth: :clerk_token) + + # WebMock blocks real HTTP, so clerk token validation will fail + get "/list" + assert_equal 401, last_response.status + end + end + end + end + end +end diff --git a/test/braintrust/server/handlers/eval_test.rb b/test/braintrust/server/handlers/eval_test.rb index 45473ea3..dce8a868 100644 --- a/test/braintrust/server/handlers/eval_test.rb +++ b/test/braintrust/server/handlers/eval_test.rb @@ -406,7 +406,7 @@ def test_build_state_evicts_oldest_when_cache_full end # First entry (key-0) should have been evicted - cache = h.instance_variable_get(:@state_cache) + cache = h.instance_variable_get(:@service).instance_variable_get(:@state_cache) assert_equal 64, cache.size, "Cache 
should not exceed 64 entries" refute cache.key?(["key-0", "https://app.example.com", "org-0"]), diff --git a/test/braintrust/server/services/eval_service_test.rb b/test/braintrust/server/services/eval_service_test.rb new file mode 100644 index 00000000..38235b35 --- /dev/null +++ b/test/braintrust/server/services/eval_service_test.rb @@ -0,0 +1,249 @@ +# frozen_string_literal: true + +require "test_helper" +require "json" + +# Unit tests for Services::Eval — runs without any framework (no appraisal needed). +module Braintrust + module Server + module Services + class EvalTest < Minitest::Test + def setup + skip_unless_server! + @evaluators = {} + @rig = setup_otel_test_rig + end + + def service + Eval.new(@evaluators) + end + + # --- validate --- + + def test_validate_returns_error_for_missing_name + result = service.validate({}) + assert_equal 400, result[:status] + assert_match(/name/, result[:error]) + end + + def test_validate_returns_error_for_unknown_evaluator + result = service.validate({"name" => "nonexistent", "data" => {"data" => []}}) + assert_equal 404, result[:status] + assert_match(/not found/i, result[:error]) + end + + def test_validate_returns_error_for_missing_data + @evaluators["test-eval"] = test_evaluator(task: ->(input) { input }) + result = service.validate({"name" => "test-eval"}) + assert_equal 400, result[:status] + assert_match(/data/, result[:error]) + end + + def test_validate_returns_error_for_multiple_data_sources + @evaluators["test-eval"] = test_evaluator(task: ->(input) { input }) + result = service.validate({ + "name" => "test-eval", + "data" => {"data" => [{"input" => "x"}], "dataset_name" => "ds"} + }) + assert_equal 400, result[:status] + end + + def test_validate_returns_valid_hash_on_success + @evaluators["my-eval"] = test_evaluator(task: ->(input) { input }) + result = service.validate({ + "name" => "my-eval", + "data" => {"data" => [{"input" => "hello", "expected" => "hello"}]}, + "experiment_name" => "exp-1", + 
"project_id" => "proj-1" + }) + + refute result.key?(:error) + assert_equal "my-eval", result[:name] + assert_equal @evaluators["my-eval"], result[:evaluator] + assert_equal [{input: "hello", expected: "hello"}], result[:cases] + assert_equal "exp-1", result[:experiment_name] + assert_equal "proj-1", result[:project_id] + end + + def test_validate_accepts_dataset_id + @evaluators["test-eval"] = test_evaluator(task: ->(input) { input }) + result = service.validate({ + "name" => "test-eval", + "data" => {"dataset_id" => "ds-123"} + }) + + refute result.key?(:error) + assert_nil result[:cases] + assert_instance_of Braintrust::Dataset::ID, result[:dataset] + end + + def test_validate_accepts_dataset_name + @evaluators["test-eval"] = test_evaluator(task: ->(input) { input }) + result = service.validate({ + "name" => "test-eval", + "data" => {"dataset_name" => "my-dataset", "project_name" => "my-project"} + }) + + refute result.key?(:error) + assert_nil result[:cases] + assert_equal({name: "my-dataset", project: "my-project"}, result[:dataset]) + end + + # --- stream --- + + def test_stream_emits_progress_and_done_events + @evaluators["upcase-eval"] = test_evaluator(task: ->(input) { input.to_s.upcase }) + s = service + validated = s.validate({ + "name" => "upcase-eval", + "data" => {"data" => [{"input" => "hello"}, {"input" => "world"}]}, + "experiment_name" => "exp" + }) + + events = collect_streamed_events(s, validated) + + progress = events.select { |e| e[:event] == "progress" } + assert_equal 4, progress.length # 2 per case: json_delta + done + assert_equal "done", events.last[:event] + end + + def test_stream_emits_summary_with_scores + scorer = Braintrust::Eval.scorer("exact") { |_i, e, o| (o == e) ? 
1.0 : 0.0 } + @evaluators["scored-eval"] = test_evaluator( + task: ->(input) { input.to_s.upcase }, + scorers: [scorer] + ) + s = service + validated = s.validate({ + "name" => "scored-eval", + "data" => {"data" => [{"input" => "hello", "expected" => "HELLO"}]}, + "experiment_name" => "my-exp" + }) + + events = collect_streamed_events(s, validated) + summary = events.find { |e| e[:event] == "summary" } + data = JSON.parse(summary[:data]) + + assert data.key?("scores") + assert_equal "my-exp", data["experiment_name"] + end + + def test_stream_emits_error_progress_on_task_failure + @evaluators["failing-eval"] = test_evaluator(task: ->(_input) { raise "boom" }) + s = service + validated = s.validate({ + "name" => "failing-eval", + "data" => {"data" => [{"input" => "x"}]}, + "experiment_name" => "exp" + }) + + events = collect_streamed_events(s, validated) + progress = events.find { |e| e[:event] == "progress" } + data = JSON.parse(progress[:data]) + + assert_equal "error", data["event"] + assert_match(/boom/, data["data"]) + assert_equal "done", events.last[:event] + end + + def test_stream_does_not_pass_state_when_auth_is_not_hash + received_opts = nil + spy = test_evaluator( + task: ->(input) { input }, + scorers: [Braintrust::Eval.scorer("s") { |_i, _e, _o| 1.0 }] + ) + spy.define_singleton_method(:run) do |cases, **opts| + received_opts = opts + Braintrust::Eval::Result.new( + experiment_id: nil, experiment_name: nil, + project_id: nil, project_name: nil, + permalink: nil, scores: {}, errors: [], duration: 0.01 + ) + end + + @evaluators["spy-eval"] = spy + s = service + validated = s.validate({ + "name" => "spy-eval", + "data" => {"data" => [{"input" => "x"}]}, + "experiment_name" => "exp" + }) + + collect_streamed_events(s, validated, auth: true) # NoAuth returns true + + assert_nil received_opts[:state] + end + + # --- build_state --- + + def test_build_state_returns_nil_for_non_hash_auth + assert_nil service.build_state(nil) + assert_nil 
service.build_state(true) + assert_nil service.build_state("string") + end + + def test_build_state_caches_by_auth_key + s = service + auth = { + "api_key" => "key-1", + "org_id" => "org-1", + "org_name" => "org", + "app_url" => "https://app.example.com", + "api_url" => "https://api.example.com" + } + + state1 = s.build_state(auth) + state2 = s.build_state(auth) + + assert_same state1, state2 + end + + def test_build_state_returns_different_state_for_different_keys + s = service + auth_a = {"api_key" => "key-a", "org_id" => "org-a", "org_name" => "org-a", + "app_url" => "https://a.example.com", "api_url" => "https://a.example.com"} + auth_b = {"api_key" => "key-b", "org_id" => "org-b", "org_name" => "org-b", + "app_url" => "https://b.example.com", "api_url" => "https://b.example.com"} + + state_a = s.build_state(auth_a) + state_b = s.build_state(auth_b) + + refute_same state_a, state_b + end + + def test_build_state_evicts_oldest_when_cache_full + s = service + + 65.times do |i| + auth = { + "api_key" => "key-#{i}", + "org_id" => "org-#{i}", + "org_name" => "org-#{i}", + "app_url" => "https://app.example.com", + "api_url" => "https://api.example.com" + } + s.build_state(auth) + end + + cache = s.instance_variable_get(:@state_cache) + assert_equal 64, cache.size + refute cache.key?(["key-0", "https://app.example.com", "org-0"]), + "Oldest entry should have been evicted" + end + + private + + def test_evaluator(**kwargs) + Test::Support::EvalHelper::TestEvaluator.new(tracer_provider: @rig.tracer_provider, **kwargs) + end + + def collect_streamed_events(svc, validated, auth: nil) + chunks = [] + sse = Braintrust::Server::SSEWriter.new { |chunk| chunks << chunk } + svc.stream(validated, auth: auth, sse: sse) + parse_sse_events(chunks.join) + end + end + end + end +end diff --git a/test/braintrust/server/services/list_service_test.rb b/test/braintrust/server/services/list_service_test.rb new file mode 100644 index 00000000..ad6de82a --- /dev/null +++ 
b/test/braintrust/server/services/list_service_test.rb @@ -0,0 +1,87 @@ +# frozen_string_literal: true + +require "test_helper" +require "json" + +# Unit tests for Services::List — runs without any framework (no appraisal needed). +module Braintrust + module Server + module Services + class ListTest < Minitest::Test + def setup + skip_unless_server! + @evaluators = {} + end + + def service + List.new(@evaluators) + end + + def test_returns_empty_hash_when_no_evaluators + result = service.call + assert_equal({}, result) + end + + def test_returns_evaluators_keyed_by_name + @evaluators["eval-a"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + @evaluators["eval-b"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + + result = service.call + assert result.key?("eval-a") + assert result.key?("eval-b") + end + + def test_includes_scorer_names + @evaluators["scored"] = Braintrust::Eval::Evaluator.new( + task: ->(input) { input }, + scorers: [ + Braintrust::Eval.scorer("accuracy") { |_i, _e, _o| 1.0 }, + Braintrust::Eval.scorer("relevance") { |_i, _e, _o| 0.5 } + ] + ) + + result = service.call + score_names = result["scored"]["scores"].map { |s| s["name"] } + assert_equal ["accuracy", "relevance"], score_names + end + + def test_empty_scores_when_no_scorers + @evaluators["no-scores"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + + result = service.call + assert_equal [], result["no-scores"]["scores"] + end + + def test_includes_parameters_in_static_container + @evaluators["param-eval"] = Braintrust::Eval::Evaluator.new( + task: ->(input) { input }, + parameters: {"temperature" => {type: "number", default: 0.7, description: "LLM temperature"}} + ) + + result = service.call + params = result["param-eval"]["parameters"] + assert_equal "braintrust.staticParameters", params["type"] + assert_nil params["source"] + assert_equal 0.7, params["schema"]["temperature"]["default"] + assert_equal "number", 
params["schema"]["temperature"]["schema"]["type"] + end + + def test_omits_parameters_when_none_defined + @evaluators["no-params"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + + result = service.call + refute result["no-params"].key?("parameters") + end + + def test_result_is_json_serializable + @evaluators["my-eval"] = Braintrust::Eval::Evaluator.new(task: ->(input) { input }) + + result = service.call + json = JSON.dump(result) + parsed = JSON.parse(json) + assert parsed.key?("my-eval") + end + end + end + end +end