diff --git a/README.md b/README.md
index 2a4b9861..d5ca0271 100644
--- a/README.md
+++ b/README.md
@@ -315,6 +315,23 @@ Braintrust::Eval.run(
 )
 ```
 
+#### Scorer metadata
+
+Scorers can return a Hash with `:score` and `:metadata` keys to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering:
+
+```ruby
+Braintrust::Scorer.new("translation") do |expected:, output:|
+  common_words = output.downcase.split & expected.downcase.split
+  overlap = common_words.size.to_f / expected.split.size
+  {
+    score: overlap,
+    metadata: {word_overlap: common_words.size, missing_words: expected.downcase.split - output.downcase.split}
+  }
+end
+```
+
+See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.
+
 #### Trace scoring
 
 Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
diff --git a/examples/eval/scorer_metadata.rb b/examples/eval/scorer_metadata.rb
new file mode 100644
index 00000000..4a8ff76a
--- /dev/null
+++ b/examples/eval/scorer_metadata.rb
@@ -0,0 +1,77 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "braintrust"
+require "opentelemetry/sdk"
+
+# Example: Scorer Metadata
+#
+# Scorers can return a Hash with :score and :metadata keys to attach
+# structured context alongside the numeric score. The metadata is
+# logged on the scorer's span and visible in the Braintrust UI for
+# debugging and filtering.
+#
+# Usage:
+#   bundle exec ruby examples/eval/scorer_metadata.rb
+
+Braintrust.init
+
+EXPECTED_TOOLS = {
+  "What's the weather?" => {name: "get_weather", args: ["location"]},
+  "Book a flight to Paris" => {name: "book_flight", args: ["destination", "date"]},
+  "Send an email to Bob" => {name: "send_email", args: ["recipient", "subject", "body"]}
+}
+
+# Simulated tool-calling model
+def pick_tool(input)
+  case input
+  when /weather/i then {name: "get_weather", args: ["location"]}
+  when /flight/i then {name: "book_flight", args: ["destination"]} # missing "date"
+  when /email/i then {name: "wrong_tool", args: []}
+  else {name: "unknown", args: []}
+  end
+end
+
+# Scorer that returns structured metadata explaining *why* a score was given
+tool_accuracy = Braintrust::Scorer.new("tool_accuracy") { |expected:, output:|
+  expected_name = expected[:name]
+  actual_name = output[:name]
+  expected_args = expected[:args]
+  actual_args = output[:args]
+
+  if actual_name != expected_name
+    {
+      score: 0.0,
+      metadata: {
+        failure_type: "wrong_tool",
+        reason: "Expected tool '#{expected_name}' but got '#{actual_name}'"
+      }
+    }
+  else
+    missing_args = expected_args - actual_args
+    if missing_args.empty?
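+      # Full credit: correct tool and all expected arguments present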
+      {score: 1.0, metadata: {failure_type: nil, reason: "Correct tool and arguments"}}
+    else
+      {
+        score: 0.5,
+        metadata: {
+          failure_type: "missing_arguments",
+          reason: "Correct tool '#{expected_name}' but missing args: #{missing_args.join(", ")}",
+          missing_args: missing_args
+        }
+      }
+    end
+  end
+}
+
+Braintrust::Eval.run(
+  project: "ruby-sdk-examples",
+  experiment: "scorer-metadata-example",
+  cases: EXPECTED_TOOLS.map { |input, expected| {input: input, expected: expected} },
+  task: ->(input:) { pick_tool(input) },
+  scorers: [tool_accuracy]
+)
+
+OpenTelemetry.tracer_provider.shutdown
diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb
index 496e5e6b..16ed7874 100644
--- a/lib/braintrust/eval/runner.rb
+++ b/lib/braintrust/eval/runner.rb
@@ -197,15 +197,23 @@ def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
         set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
         set_json_attr(score_span, "braintrust.input_json", scorer_input)
 
-        score_value = scorer.call(**scorer_kwargs)
-        scores[scorer.name] = score_value
+        raw_result = scorer.call(**scorer_kwargs)
+        normalized = normalize_score_result(raw_result, scorer.name)
 
-        scorer_scores = {scorer.name => score_value}
+        score_name = normalized[:name]
+        scores[score_name] = normalized[:score]
+
+        scorer_scores = {score_name => normalized[:score]}
         set_json_attr(score_span, "braintrust.output_json", scorer_scores)
         set_json_attr(score_span, "braintrust.scores", scorer_scores)
 
+        # Set scorer metadata on its span
+        if normalized[:metadata].is_a?(Hash)
+          set_json_attr(score_span, "braintrust.metadata", normalized[:metadata])
+        end
+
         # Collect raw score for summary (thread-safe)
-        collect_score(scorer.name, score_value)
+        collect_score(score_name, normalized[:score])
       rescue => e
         record_span_error(score_span, e, "ScorerError")
         raise
@@ -304,6 +312,20 @@ def collect_score(name, value)
          (@scores[name] ||= []) << value
        end
      end
+
+      # Normalize a scorer return value into its component parts.
+      # Scorers may return a raw Numeric or a Hash with :score, :metadata, and :name keys.
+      # @param result [Object] Raw scorer return value
+      # @param default_name [String] Scorer name to use if not overridden
+      # @return [Hash] Normalized hash with :score, :metadata, and :name keys
+      def normalize_score_result(result, default_name)
+        if result.is_a?(Hash)
+          result[:name] ||= default_name
+          result
+        else
+          {score: result, metadata: nil, name: default_name}
+        end
+      end
     end
   end
 end
diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb
index 29308a2a..2ae0be8a 100644
--- a/test/braintrust/eval/runner_test.rb
+++ b/test/braintrust/eval/runner_test.rb
@@ -1300,6 +1300,313 @@ def test_trace_works_with_parallelism
     end
   end
 
+  # ============================================
+  # Runner#run tests - structured scorer returns
+  # ============================================
+
+  def test_scorer_hash_return_extracts_numeric_score
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("structured") { |output:|
+      {score: 0.75, metadata: {reason: "partial match"}}
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
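+    # The Hash return is normalized: the numeric :score is extracted and
+    # collected under the scorer's name for the summary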
+ assert_equal({"structured" => [0.75]}, result.scores) + end + + def test_scorer_hash_return_name_override + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("original") { |output:| + {score: 0.9, name: "overridden"} + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? + assert_equal({"overridden" => [0.9]}, result.scores) + assert_nil result.scores["original"] + end + + def test_scorer_hash_return_metadata_on_span + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("meta_scorer") { |output:| + {score: 1.0, metadata: {failure_type: "none", confidence: 0.99}} + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + Braintrust::Eval::Runner.new(context).run + + score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) + assert_equal({"failure_type" => "none", "confidence" => 0.99}, metadata) + end + + def test_scorer_hash_without_score_key + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("no_score_key") { |output:| + {metadata: {reason: "test"}} + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? + # nil score is not Numeric, so not collected for stats + assert_equal({}, result.scores) + end + + def test_scorer_hash_with_nil_score + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("nil_score") { |output:| + {score: nil, metadata: {reason: "could not score"}} + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? 
+    assert_equal({}, result.scores)
+
+    # Metadata should still be logged even with nil score
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+    metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
+    assert_equal({"reason" => "could not score"}, metadata)
+  end
+
+  def test_multiple_scorers_mixed_return_types
+    rig = setup_otel_test_rig
+
+    scorer1 = Braintrust::Scorer.new("numeric") { 0.8 }
+    scorer2 = Braintrust::Scorer.new("structured") { {score: 0.6, metadata: {detail: "partial"}} }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer1, scorer2],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"numeric" => [0.8], "structured" => [0.6]}, result.scores)
+
+    # Only the structured scorer's span has metadata
+    score_spans = rig.exporter.finished_spans.select { |s| s.name == "score" }
+    structured_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("structured") }
+    numeric_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("numeric") }
+
+    metadata = JSON.parse(structured_span.attributes["braintrust.metadata"])
+    assert_equal({"detail" => "partial"}, metadata)
+    assert_nil numeric_span.attributes["braintrust.metadata"]
+  end
+
+  def test_scorer_hash_return_scores_on_span_are_extracted
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("structured") { {score: 0.75, metadata: {x: 1}} }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+    scores = JSON.parse(score_span.attributes["braintrust.scores"])
+    # Should be the numeric value, not the hash
+    assert_equal 0.75, scores["structured"]
+  end
+
+  def test_on_progress_receives_extracted_score_from_hash
+    progress_calls = []
+    scorer = Braintrust::Scorer.new("structured") { {score: 0.5, metadata: {x: 1}} }
+    runner = build_simple_runner(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      on_progress: ->(data) { progress_calls << data }
+    )
+
+    runner.run
+
+    assert_equal({"structured" => 0.5}, progress_calls.first["scores"])
+  end
+
+  def test_scorer_no_metadata_attr_when_all_numeric
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [
+        Braintrust::Scorer.new("a") { 0.5 },
+        Braintrust::Scorer.new("b") { 1.0 }
+      ],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    # No metadata attribute should be set on any span when no scorer returns metadata
+    score_spans = rig.exporter.finished_spans.select { |s| s.name == "score" }
+    score_spans.each do |span|
+      assert_nil span.attributes["braintrust.metadata"]
+    end
+  end
+
+  def test_scorer_hash_with_non_hash_metadata_ignored
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("str_meta") { |output:|
+      {score: 0.5, metadata: "not a hash"}
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"str_meta" => [0.5]}, result.scores)
+
+    # Non-hash metadata should not be logged
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+    assert_nil score_span.attributes["braintrust.metadata"]
+  end
+
+  def test_scorer_hash_return_multiple_cases
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("quality") { |output:|
+      {score: output.length.to_f / 10, metadata: {length: output.length}}
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hi"}, {input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"quality" => [0.2, 0.5]}, result.scores)
+  end
+
+  def test_scorer_empty_hash_return
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("empty") { |output:| {} }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    # Empty hash has no :score key, so score is nil and not collected
+    assert_equal({}, result.scores)
+  end
+
   private
 
   def build_simple_runner(task:, cases:, scorers: [], on_progress: nil)