Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,23 @@ Braintrust::Eval.run(
)
```

#### Scorer metadata

Scorers can return a Hash with `:score` and `:metadata` keys to attach structured context to the score. The metadata is logged on the scorer's span and is visible in the Braintrust UI for debugging and filtering:

```ruby
Braintrust::Scorer.new("translation") do |expected:, output:|
common_words = output.downcase.split & expected.downcase.split
overlap = common_words.size.to_f / expected.split.size
{
score: overlap,
metadata: {word_overlap: common_words.size, missing_words: expected.downcase.split - output.downcase.split}
}
end
```

See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.

#### Trace scoring

Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
Expand Down
76 changes: 76 additions & 0 deletions examples/eval/scorer_metadata.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"

# Example: Scorer Metadata
#
# Scorers can return a Hash with :score and :metadata keys to attach
# structured context alongside the numeric score. The metadata is
# logged on the scorer's span and visible in the Braintrust UI for
# debugging and filtering.
#
# Usage:
# bundle exec ruby examples/eval/scorer_metadata.rb

Braintrust.init

# Maps each user request to the tool call the model is expected to make.
# Frozen so this example fixture cannot be mutated accidentally.
EXPECTED_TOOLS = {
  "What's the weather?" => {name: "get_weather", args: ["location"]},
  "Book a flight to Paris" => {name: "book_flight", args: ["destination", "date"]},
  "Send an email to Bob" => {name: "send_email", args: ["recipient", "subject", "body"]}
}.freeze

# Simulated tool-calling model: maps a natural-language request to a
# tool-call Hash of the form {name:, args:} via simple keyword matching.
def pick_tool(input)
  if input.match?(/weather/i)
    {name: "get_weather", args: ["location"]}
  elsif input.match?(/flight/i)
    # Deliberately omits "date" so the scorer can demonstrate a partial score.
    {name: "book_flight", args: ["destination"]}
  elsif input.match?(/email/i)
    {name: "wrong_tool", args: []}
  else
    {name: "unknown", args: []}
  end
end

# Scorer that returns structured metadata explaining *why* a score was given.
# Returns a Hash with :score plus :metadata describing the failure mode.
# Uses do...end for the multi-line block per Ruby convention (and matching
# the README's scorer example), rather than braces.
tool_accuracy = Braintrust::Scorer.new("tool_accuracy") do |expected:, output:|
  expected_name = expected[:name]
  actual_name = output[:name]

  if actual_name != expected_name
    # Wrong tool entirely: zero credit, record which tool was expected.
    {
      score: 0.0,
      metadata: {
        failure_type: "wrong_tool",
        reason: "Expected tool '#{expected_name}' but got '#{actual_name}'"
      }
    }
  else
    missing_args = expected[:args] - output[:args]
    if missing_args.empty?
      {score: 1.0, metadata: {failure_type: nil, reason: "Correct tool and arguments"}}
    else
      # Right tool but incomplete arguments: partial credit.
      {
        score: 0.5,
        metadata: {
          failure_type: "missing_arguments",
          reason: "Correct tool '#{expected_name}' but missing args: #{missing_args.join(", ")}",
          missing_args: missing_args
        }
      }
    end
  end
end

# Build one eval case per entry in EXPECTED_TOOLS, then run the experiment.
eval_cases = EXPECTED_TOOLS.map do |input, expected|
  {input: input, expected: expected}
end

Braintrust::Eval.run(
  project: "ruby-sdk-examples",
  experiment: "scorer-metadata-example",
  cases: eval_cases,
  task: ->(input:) { pick_tool(input) },
  scorers: [tool_accuracy]
)

# Flush any buffered spans before the process exits.
OpenTelemetry.tracer_provider.shutdown
30 changes: 26 additions & 4 deletions lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,23 @@ def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
set_json_attr(score_span, "braintrust.input_json", scorer_input)

score_value = scorer.call(**scorer_kwargs)
scores[scorer.name] = score_value
raw_result = scorer.call(**scorer_kwargs)
normalized = normalize_score_result(raw_result, scorer.name)

scorer_scores = {scorer.name => score_value}
score_name = normalized[:name]
scores[score_name] = normalized[:score]

scorer_scores = {score_name => normalized[:score]}
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
set_json_attr(score_span, "braintrust.scores", scorer_scores)

# Set scorer metadata on its span
if normalized[:metadata].is_a?(Hash)
set_json_attr(score_span, "braintrust.metadata", normalized[:metadata])
end

# Collect raw score for summary (thread-safe)
collect_score(scorer.name, score_value)
collect_score(score_name, normalized[:score])
rescue => e
record_span_error(score_span, e, "ScorerError")
raise
Expand Down Expand Up @@ -304,6 +312,20 @@ def collect_score(name, value)
(@scores[name] ||= []) << value
end
end

# Normalize a scorer return value into its component parts.
# Scorers may return a raw Numeric score, or a Hash with a :score key plus
# optional :metadata and :name keys.
#
# @param result [Object] Raw scorer return value (Numeric or Hash)
# @param default_name [String] Scorer name to use if not overridden via :name
# @return [Hash] Normalized hash with :score, :metadata, :name keys
def normalize_score_result(result, default_name)
  if result.is_a?(Hash)
    # Build a fresh hash rather than mutating the scorer's return value in
    # place (the previous `result[:name] ||= default_name` modified the
    # caller's object). This also guarantees :score and :metadata keys are
    # always present in the normalized result.
    {
      score: result[:score],
      metadata: result[:metadata],
      name: result[:name] || default_name
    }
  else
    # Bare numeric score: no metadata, keep the scorer's own name.
    {score: result, metadata: nil, name: default_name}
  end
end
end
end
end
Loading
Loading