Merged
29 changes: 28 additions & 1 deletion README.md
@@ -259,6 +259,8 @@ Braintrust::Eval.run(
)
```

See [eval.rb](./examples/eval.rb) for a full example.

### Datasets

Use test cases from a Braintrust dataset:
@@ -287,6 +289,8 @@ Braintrust::Eval.run(
)
```

See [dataset.rb](./examples/eval/dataset.rb) for a full example.

### Scorers

Use scoring functions defined in Braintrust:
@@ -315,6 +319,8 @@ Braintrust::Eval.run(
)
```

See [remote_functions.rb](./examples/eval/remote_functions.rb) for a full example.

#### Scorer metadata

Scorers can return a Hash with `:score` and `:metadata` to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering:
@@ -332,6 +338,27 @@ end
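The collapsed snippet above boils down to returning a Hash from the scorer body. A minimal stand-alone sketch in plain Ruby (the names here are illustrative; the repo's scorer_metadata.rb example may differ):

```ruby
# Sketch of a metadata-returning scorer body. The lambda stands in for the
# block passed to Braintrust::Scorer.new; names are hypothetical.
exact_match = lambda do |output:, expected:|
  normalized = output.to_s.strip.downcase
  {
    score: normalized == expected.to_s.strip.downcase ? 1.0 : 0.0,
    metadata: {normalized_output: normalized}
  }
end

exact_match.call(output: " Hello ", expected: "hello")
# => {score: 1.0, metadata: {normalized_output: "hello"}}
```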

See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.

#### Multiple scores from one scorer

When several scores can be computed together (e.g. in one LLM call), a scorer can return an `Array` of score hashes instead of a single value. Each metric appears as a separate score column in the Braintrust UI:

```ruby
Braintrust::Scorer.new("summary_quality") do |output:, expected:|
words = output.downcase.split
key_terms = expected[:key_terms]
covered = key_terms.count { |t| words.include?(t) }

[
{name: "coverage", score: covered.to_f / key_terms.size, metadata: {missing: key_terms - words}},
{name: "conciseness", score: words.size <= expected[:max_words] ? 1.0 : 0.0}
]
end
```

`name` and `score` are required; `metadata` is optional.

See [multi_score.rb](./examples/eval/multi_score.rb) for a full example.

#### Trace scoring

Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
@@ -361,7 +388,7 @@ Braintrust::Eval.run(
)
```
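In miniature, with the trace modeled as a plain array of span hashes (an assumption for illustration — the SDK's real span objects carry OpenTelemetry data and differ in shape), a trace-aware check looks like:

```ruby
# Hypothetical span shapes; field names here are assumptions
trace = [
  {name: "llm.call", attributes: {"model" => "gpt-4o-mini"}},
  {name: "tool.search", attributes: {"query" => "largest ocean"}}
]

# Declaring trace: gives the scorer every span the task produced
used_tool = lambda do |output:, trace:, **|
  trace.any? { |span| span[:name].start_with?("tool.") } ? 1.0 : 0.0
end

used_tool.call(output: "the Pacific", trace: trace)  # => 1.0
```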

See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example.

### Dev Server

132 changes: 132 additions & 0 deletions examples/eval/multi_score.rb
@@ -0,0 +1,132 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"

# Example: Multi-Score Scorers
#
# A scorer can return an Array of score hashes to emit multiple named metrics
# from a single scorer call. Each hash must have a :name and :score key; an
# optional :metadata key attaches structured context to that metric.
#
# This is useful when several dimensions of quality (e.g. correctness,
# completeness, format) can be computed together — sharing one inference call
# or one pass over the output — rather than running separate scorers.
#
# Two patterns are shown:
#
# 1. Block-based (Braintrust::Scorer.new):
# Pass a block that returns an Array. Good for concise, one-off scorers.
#
# 2. Class-based (include Braintrust::Scorer):
# Define a class with a #call method. Good for reusable scorers that
# share helper logic across multiple metrics.
#
# Usage:
# bundle exec ruby examples/eval/multi_score.rb

Braintrust.init

# ---------------------------------------------------------------------------
# Task: summarise a list of facts
# ---------------------------------------------------------------------------
FACTS = {
"The sky is blue and clouds are white." => {
key_terms: %w[sky blue clouds white],
max_words: 10
},
"Ruby was created by Matz in 1995." => {
key_terms: %w[ruby matz 1995],
max_words: 8
},
"The Pacific Ocean is the largest ocean on Earth." => {
key_terms: %w[pacific largest ocean earth],
max_words: 10
}
}

# Simulated summariser (replace with a real LLM call in production)
def summarise(text)
# Naive: keep only the first 8 words and lowercase (a real summariser
# would respect each case's max_words)
text.split.first(8).join(" ").downcase
end

# ---------------------------------------------------------------------------
# Pattern 1: block-based multi-score scorer
#
# Returns three metrics in one pass:
# - coverage: fraction of key terms present in the summary
# - conciseness: 1.0 if under the word limit, else 0.0
# - lowercase: 1.0 if the summary is fully lowercased
# ---------------------------------------------------------------------------
summary_quality = Braintrust::Scorer.new("summary_quality") do |output:, expected:|
words = output.to_s.downcase.split
key_terms = expected[:key_terms]
max_words = expected[:max_words]

covered = key_terms.count { |t| words.include?(t) }
coverage_score = key_terms.empty? ? 1.0 : covered.to_f / key_terms.size

[
{
name: "coverage",
score: coverage_score,
metadata: {covered: covered, total: key_terms.size, missing: key_terms - words}
},
{
name: "conciseness",
score: (words.size <= max_words) ? 1.0 : 0.0,
metadata: {word_count: words.size, limit: max_words}
},
{
name: "lowercase",
score: (output.to_s == output.to_s.downcase) ? 1.0 : 0.0
}
]
end

# ---------------------------------------------------------------------------
# Pattern 2: class-based multi-score scorer
#
# Include Braintrust::Scorer and define #call. The class name is used as the
# scorer name by default; override #name to customise it.
#
# Returns two metrics:
# - ends_with_period: checks punctuation
# - no_first_person: checks for avoided first-person pronouns
# ---------------------------------------------------------------------------
class StyleChecker
include Braintrust::Scorer

FIRST_PERSON = %w[i me my myself we us our].freeze

def call(output:, **)
text = output.to_s
words = text.downcase.split(/\W+/)
fp_words = words & FIRST_PERSON

[
{
name: "ends_with_period",
score: text.strip.end_with?(".") ? 1.0 : 0.0
},
{
name: "no_first_person",
score: fp_words.empty? ? 1.0 : 0.0,
metadata: {found: fp_words}
}
]
end
end

Braintrust::Eval.run(
project: "ruby-sdk-examples",
experiment: "multi-score-example",
cases: FACTS.map { |text, expected| {input: text, expected: expected} },
task: ->(input:) { summarise(input) },
scorers: [summary_quality, StyleChecker.new]
)

OpenTelemetry.tracer_provider.shutdown
61 changes: 18 additions & 43 deletions lib/braintrust/eval/runner.rb
@@ -111,9 +111,8 @@ def run_eval_case(case_context, errors)
case_context.trace = build_trace(eval_span)

# Run scorers
case_scores = nil
begin
case_scores = run_scorers(case_context)
run_scorers(case_context)
rescue => e
# Error already recorded on score span, set eval span status
eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
@@ -123,7 +122,7 @@ def run_eval_case(case_context, errors)
# Set output after task completes
set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})

report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
report_progress(eval_span, case_context, data: case_context.output)
end
ensure
eval_span&.finish
@@ -157,7 +156,6 @@ def run_task(case_context)
# Run scorers with OpenTelemetry tracing.
# Creates one span per scorer, each a direct child of the current (eval) span.
# @param case_context [CaseContext] The per-case context (output must be populated)
# @return [Hash] Scores hash { scorer_name => score_value }
def run_scorers(case_context)
scorer_kwargs = {
input: case_context.input,
@@ -173,47 +171,41 @@
metadata: case_context.metadata || {}
}

scores = {}
scorer_error = nil
eval_context.scorers.each do |scorer|
run_scorer(scorer, scorer_kwargs, scorer_input, scores)
collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
rescue => e
scorer_error ||= e
end

raise scorer_error if scorer_error

scores
end

# Run a single scorer inside its own span.
# @param scorer [Scorer] The scorer to run
# @param scorer_kwargs [Hash] Keyword arguments for the scorer
# @param scorer_input [Hash] Input to log on the span
# @param scores [Hash] Accumulator for score results
def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
# @return [Array<Hash>] Raw score results from the scorer
def run_scorer(scorer, scorer_kwargs, scorer_input)
tracer.in_span(scorer.name) do |score_span|
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
set_json_attr(score_span, "braintrust.input_json", scorer_input)

raw_result = scorer.call(**scorer_kwargs)
normalized = normalize_score_result(raw_result, scorer.name)
score_results = scorer.call(**scorer_kwargs)

score_name = normalized[:name]
scores[score_name] = normalized[:score]
scorer_scores = {}
scorer_metadata = {}
score_results.each do |s|
scorer_scores[s[:name]] = s[:score]
scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
end

scorer_scores = {score_name => normalized[:score]}
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
set_json_attr(score_span, "braintrust.scores", scorer_scores)
set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?

# Set scorer metadata on its span
if normalized[:metadata].is_a?(Hash)
set_json_attr(score_span, "braintrust.metadata", normalized[:metadata])
end

# Collect raw score for summary (thread-safe)
collect_score(score_name, normalized[:score])
score_results
rescue => e
record_span_error(score_span, e, "ScorerError")
raise
@@ -302,28 +294,11 @@ def set_json_attr(span, key, value)
span.set_attribute(key, JSON.dump(value))
end

# Collect a single score value for summary calculation
# @param name [String] Scorer name
# @param value [Object] Score value (only Numeric values are collected)
def collect_score(name, value)
return unless value.is_a?(Numeric)

# Collect score results into the summary accumulator (thread-safe).
# @param score_results [Array<Hash>] Score results from a scorer
def collect_scores(score_results)
@score_mutex.synchronize do
(@scores[name] ||= []) << value
end
end

# Normalize a scorer return value into its component parts.
# Scorers may return a raw Numeric or a Hash with :score, :metadata, and :name keys.
# @param result [Object] Raw scorer return value
# @param default_name [String] Scorer name to use if not overridden
# @return [Hash] Normalized hash with :score, :metadata, :name keys
def normalize_score_result(result, default_name)
if result.is_a?(Hash)
result[:name] ||= default_name
result
else
{score: result, metadata: nil, name: default_name}
score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
end
end
end