diff --git a/README.md b/README.md
index d5ca0271..f41cd421 100644
--- a/README.md
+++ b/README.md
@@ -259,6 +259,8 @@ Braintrust::Eval.run(
 )
 ```
 
+See [eval.rb](./examples/eval.rb) for a full example.
+
 ### Datasets
 
 Use test cases from a Braintrust dataset:
@@ -287,6 +289,8 @@ Braintrust::Eval.run(
 )
 ```
 
+See [dataset.rb](./examples/eval/dataset.rb) for a full example.
+
 ### Scorers
 
 Use scoring functions defined in Braintrust:
@@ -315,6 +319,8 @@ Braintrust::Eval.run(
 )
 ```
 
+See [remote_functions.rb](./examples/eval/remote_functions.rb) for a full example.
+
 #### Scorer metadata
 
 Scorers can return a Hash with `:score` and `:metadata` to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering:
@@ -332,6 +338,27 @@ end
 
 See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.
 
+#### Multiple scores from one scorer
+
+When several scores can be computed together (e.g. in one LLM call), a scorer can return an `Array` of score hashes instead of a single value. Each entry appears as a separate score column in the Braintrust UI:
+
+```ruby
+Braintrust::Scorer.new("summary_quality") do |output:, expected:|
+  words = output.downcase.scan(/\w+/) # word tokens only, so "blue." still matches the key term "blue"
+  key_terms = expected[:key_terms]
+  covered = key_terms.count { |t| words.include?(t) }
+
+  [
+    {name: "coverage", score: covered.to_f / key_terms.size, metadata: {missing: key_terms - words}},
+    {name: "conciseness", score: words.size <= expected[:max_words] ? 1.0 : 0.0}
+  ]
+end
+```
+
+Each hash requires `name` and `score`; `metadata` is optional.
+
+See [multi_score.rb](./examples/eval/multi_score.rb) for a full example.
+
 #### Trace scoring
 
 Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
@@ -361,7 +388,7 @@ Braintrust::Eval.run(
 )
 ```
 
-See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb)
+See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example.
 
 ### Dev Server
 
diff --git a/examples/eval/multi_score.rb b/examples/eval/multi_score.rb
new file mode 100644
index 00000000..438a00f3
--- /dev/null
+++ b/examples/eval/multi_score.rb
@@ -0,0 +1,132 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "braintrust"
+require "opentelemetry/sdk"
+
+# Example: Multi-Score Scorers
+#
+# A scorer can return an Array of score hashes to emit multiple named metrics
+# from a single scorer call. Each hash must have a :name and :score key; an
+# optional :metadata key attaches structured context to that metric.
+#
+# This is useful when several dimensions of quality (e.g. correctness,
+# completeness, format) can be computed together — sharing one inference call
+# or one pass over the output — rather than running separate scorers.
+#
+# Two patterns are shown:
+#
+#   1. Block-based (Braintrust::Scorer.new):
+#      Pass a block that returns an Array. Good for concise, one-off scorers.
+#
+#   2. Class-based (include Braintrust::Scorer):
+#      Define a class with a #call method. Good for reusable scorers that
+#      share helper logic across multiple metrics.
+#
+# Usage:
+#   bundle exec ruby examples/eval/multi_score.rb
+
+Braintrust.init
+
+# ---------------------------------------------------------------------------
+# Task: summarise a list of facts
+# ---------------------------------------------------------------------------
+FACTS = {
+  "The sky is blue and clouds are white." => {
+    key_terms: %w[sky blue clouds white],
+    max_words: 10
+  },
+  "Ruby was created by Matz in 1995." => {
+    key_terms: %w[ruby matz 1995],
+    max_words: 8
+  },
+  "The Pacific Ocean is the largest ocean on Earth." => {
+    key_terms: %w[pacific largest ocean earth],
+    max_words: 10
+  }
+}
+
+# Simulated summariser (replace with a real LLM call in production)
+def summarise(text)
+  # Naive: keep only the first 8 whitespace-separated words and lowercase them
+  text.split.first(8).join(" ").downcase
+end
+
+# ---------------------------------------------------------------------------
+# Pattern 1: block-based multi-score scorer
+#
+# Returns three metrics in one pass:
+#   - coverage:    fraction of key terms present in the summary
+#   - conciseness: 1.0 if at or under the word limit, else 0.0
+#   - lowercase:   1.0 if the summary is fully lowercased
+# ---------------------------------------------------------------------------
+summary_quality = Braintrust::Scorer.new("summary_quality") do |output:, expected:|
+  words = output.to_s.downcase.scan(/\w+/) # word tokens only, so trailing punctuation ("white.") cannot hide a key term
+  key_terms = expected[:key_terms]
+  max_words = expected[:max_words]
+
+  covered = key_terms.count { |t| words.include?(t) }
+  coverage_score = key_terms.empty? ? 1.0 : covered.to_f / key_terms.size
+
+  [
+    {
+      name: "coverage",
+      score: coverage_score,
+      metadata: {covered: covered, total: key_terms.size, missing: key_terms - words}
+    },
+    {
+      name: "conciseness",
+      score: (words.size <= max_words) ? 1.0 : 0.0,
+      metadata: {word_count: words.size, limit: max_words}
+    },
+    {
+      name: "lowercase",
+      score: (output.to_s == output.to_s.downcase) ? 1.0 : 0.0
+    }
+  ]
+end
+
+# ---------------------------------------------------------------------------
+# Pattern 2: class-based multi-score scorer
+#
+# Include Braintrust::Scorer and define #call. The class name is used as the
+# scorer name by default; override #name to customise it.
+#
+# Returns two metrics:
+#   - ends_with_period: 1.0 if the text ends with a period, else 0.0
+#   - no_first_person:  1.0 if no first-person pronouns appear, else 0.0
+# ---------------------------------------------------------------------------
+class StyleChecker
+  include Braintrust::Scorer
+
+  FIRST_PERSON = %w[i me my myself we us our].freeze
+
+  def call(output:, **)
+    text = output.to_s
+    words = text.downcase.split(/\W+/) # split on non-word characters so punctuation cannot hide a pronoun
+    fp_words = words & FIRST_PERSON # the distinct first-person pronouns that occur in the text
+
+    [
+      {
+        name: "ends_with_period",
+        score: text.strip.end_with?(".") ? 1.0 : 0.0
+      },
+      {
+        name: "no_first_person",
+        score: fp_words.empty? ? 1.0 : 0.0,
+        metadata: {found: fp_words}
+      }
+    ]
+  end
+end
+
+Braintrust::Eval.run(
+  project: "ruby-sdk-examples",
+  experiment: "multi-score-example",
+  cases: FACTS.map { |text, expected| {input: text, expected: expected} },
+  task: ->(input:) { summarise(input) },
+  scorers: [summary_quality, StyleChecker.new]
+)
+
+OpenTelemetry.tracer_provider.shutdown
diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb
index 76590280..fb020bee 100644
--- a/lib/braintrust/eval/runner.rb
+++ b/lib/braintrust/eval/runner.rb
@@ -111,9 +111,8 @@ def run_eval_case(case_context, errors)
           case_context.trace = build_trace(eval_span)
 
           # Run scorers
-          case_scores = nil
           begin
-            case_scores = run_scorers(case_context)
+            run_scorers(case_context)
           rescue => e
             # Error already recorded on score span, set eval span status
             eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
@@ -123,7 +122,7 @@ def run_eval_case(case_context, errors)
           # Set output after task completes
           set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})
 
-          report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
+          report_progress(eval_span, case_context, data: case_context.output)
         end
       ensure
         eval_span&.finish
@@ -157,7 +156,6 @@ def run_task(case_context)
       # Run scorers with OpenTelemetry tracing.
       # Creates one span per scorer, each a direct child of the current (eval) span.
       # @param case_context [CaseContext] The per-case context (output must be populated)
-      # @return [Hash] Scores hash { scorer_name => score_value }
       def run_scorers(case_context)
         scorer_kwargs = {
           input: case_context.input,
@@ -173,47 +171,41 @@ def run_scorers(case_context)
           metadata: case_context.metadata || {}
         }
 
-        scores = {}
         scorer_error = nil
         eval_context.scorers.each do |scorer|
-          run_scorer(scorer, scorer_kwargs, scorer_input, scores)
+          collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input))
         rescue => e
           scorer_error ||= e
         end
 
         raise scorer_error if scorer_error
-
-        scores
       end
 
       # Run a single scorer inside its own span.
       # @param scorer [Scorer] The scorer to run
       # @param scorer_kwargs [Hash] Keyword arguments for the scorer
       # @param scorer_input [Hash] Input to log on the span
-      # @param scores [Hash] Accumulator for score results
-      def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
+      # @return [Array<Hash>] Raw score results from the scorer
+      def run_scorer(scorer, scorer_kwargs, scorer_input)
         tracer.in_span(scorer.name) do |score_span|
           score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
           set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
           set_json_attr(score_span, "braintrust.input_json", scorer_input)
 
-          raw_result = scorer.call(**scorer_kwargs)
-          normalized = normalize_score_result(raw_result, scorer.name)
+          score_results = scorer.call(**scorer_kwargs)
 
-          score_name = normalized[:name]
-          scores[score_name] = normalized[:score]
+          scorer_scores = {}
+          scorer_metadata = {}
+          score_results.each do |s|
+            scorer_scores[s[:name]] = s[:score]
+            scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash)
+          end
 
-          scorer_scores = {score_name => normalized[:score]}
           set_json_attr(score_span, "braintrust.output_json", scorer_scores)
           set_json_attr(score_span, "braintrust.scores", scorer_scores)
+          set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty?
 
-          # Set scorer metadata on its span
-          if normalized[:metadata].is_a?(Hash)
-            set_json_attr(score_span, "braintrust.metadata", normalized[:metadata])
-          end
-
-          # Collect raw score for summary (thread-safe)
-          collect_score(score_name, normalized[:score])
+          score_results
         rescue => e
           record_span_error(score_span, e, "ScorerError")
           raise
@@ -302,28 +294,11 @@ def set_json_attr(span, key, value)
         span.set_attribute(key, JSON.dump(value))
       end
 
-      # Collect a single score value for summary calculation
-      # @param name [String] Scorer name
-      # @param value [Object] Score value (only Numeric values are collected)
-      def collect_score(name, value)
-        return unless value.is_a?(Numeric)
-
+      # Collect score results into the summary accumulator (thread-safe).
+      # @param score_results [Array<Hash>] Score results from a scorer
+      def collect_scores(score_results)
         @score_mutex.synchronize do
-          (@scores[name] ||= []) << value
-        end
-      end
-
-      # Normalize a scorer return value into its component parts.
-      # Scorers may return a raw Numeric or a Hash with :score, :metadata, and :name keys.
-      # @param result [Object] Raw scorer return value
-      # @param default_name [String] Scorer name to use if not overridden
-      # @return [Hash] Normalized hash with :score, :metadata, :name keys
-      def normalize_score_result(result, default_name)
-        if result.is_a?(Hash)
-          result[:name] ||= default_name
-          result
-        else
-          {score: result, metadata: nil, name: default_name}
+          score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] }
         end
       end
     end
diff --git a/lib/braintrust/scorer.rb b/lib/braintrust/scorer.rb
index d13ffab6..5c6b13b1 100644
--- a/lib/braintrust/scorer.rb
+++ b/lib/braintrust/scorer.rb
@@ -40,12 +40,52 @@ def self.new(name = nil, &block)
       Block.new(name: name || DEFAULT_NAME, &block)
     end
 
-    # Included into classes that +include Scorer+. Prepends KeywordFilter
-    # so #call receives only its declared kwargs, and provides a default #name.
+    # Included into classes that +include Scorer+. Prepends KeywordFilter and
+    # ResultNormalizer so #call receives only declared kwargs and always returns
+    # Array<Hash>. Also provides a default #name and #call_parameters.
     module Callable
+      # Normalizes the raw return value of #call into Array<Hash>.
+      # Nested inside Callable because it depends on #name which Callable provides.
+      module ResultNormalizer
+        # @return [Array<Hash>] normalized score hashes, each with :name and a Numeric :score; :metadata may be present
+        def call(**kwargs)
+          normalize_score_result(super) # super invokes the wrapped #call; its raw return value is normalized here
+        end
+
+        private
+
+        # @param result [Numeric, Hash, Array<Hash>] raw return value from #call
+        # @return [Array<Hash>] score hashes, each with :name and a Numeric :score (empty when an empty Array was returned)
+        # @raise [ArgumentError] if any score value is not Numeric
+        def normalize_score_result(result)
+          case result
+          when Array then result.map { |item| normalize_score_item(item) }
+          when Hash then [normalize_score_item(result)]
+          else
+            raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}" unless result.is_a?(Numeric)
+            [{score: result, metadata: nil, name: name}]
+          end
+        end
+
+        # Validates a single score hash and fills in a missing :name from the scorer.
+        # @param item [Hash] a score hash with at least a :score key
+        # @return [Hash] the score hash with :name set (a copy when :name had to be added; the input is never mutated)
+        # @raise [ArgumentError] if item is not a Hash or its :score is not Numeric
+        def normalize_score_item(item)
+          raise ArgumentError, "#{name}: score item must be a Hash, got #{item.inspect}" unless item.is_a?(Hash)
+          raise ArgumentError, "#{item[:name] || name}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric)
+          item[:name] ? item : item.merge(name: name)
+        end
+      end
+
+      # Infrastructure modules prepended onto every scorer class.
+      # Used both to set up the ancestor chain and to skip past them in
+      # #call_parameters so KeywordFilter sees the real call signature.
+      PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze
+
       # @param base [Class] the class including Callable
       def self.included(base)
-        base.prepend(Internal::Callable::KeywordFilter)
+        PREPENDED.each { |mod| base.prepend(mod) }
       end
 
       # Default name derived from the class name (e.g. FuzzyMatch -> "fuzzy_match").
@@ -55,6 +95,17 @@ def name
         return Scorer::DEFAULT_NAME unless klass
         klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase
       end
+
+      # Provides KeywordFilter with the actual call signature of the subclass.
+      # Walks past PREPENDED modules in the ancestor chain so that user-defined
+      # #call keyword params are correctly introspected.
+      # Block overrides this to point directly at @block.parameters.
+      # @return [Array<Array>] parameter list
+      def call_parameters
+        meth = method(:call)
+        meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner) # stop at the first #call not owned by a prepended wrapper, or when no super exists
+        meth.parameters
+      end
     end
 
     # Block-based scorer. Stores a Proc and delegates #call to it.
@@ -75,7 +126,7 @@ def initialize(name: DEFAULT_NAME, &block)
       end
 
       # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter)
-      # @return [Float, Hash, Array] score result
+      # @return [Array<Hash>] normalized score results
       def call(**kwargs)
         @block.call(**kwargs)
       end
diff --git a/test/braintrust/eval/context_test.rb b/test/braintrust/eval/context_test.rb
index cfe52b28..ac4b1128 100644
--- a/test/braintrust/eval/context_test.rb
+++ b/test/braintrust/eval/context_test.rb
@@ -99,7 +99,8 @@ def test_normalize_scorers_wraps_lambda_with_kwargs
     result = factory.normalize_scorers([lam])
     assert_equal 1, result.length
     assert_kind_of Braintrust::Scorer, result.first
-    assert_equal 1.0, result.first.call(input: "x", expected: "YES", output: "YES")
+    assert_equal [{score: 1.0, metadata: nil, name: "scorer"}],
+      result.first.call(input: "x", expected: "YES", output: "YES")
   end
 
   def test_normalize_scorers_wraps_callable_class_with_kwargs
@@ -122,7 +123,8 @@ def call(expected:, output:)
     result = factory.normalize_scorers([callable])
     assert_equal 1, result.length
     assert_equal "threshold_scorer", result.first.name
-    assert_equal 0.5, result.first.call(input: "x", expected: "a", output: "b")
+    assert_equal [{score: 0.5, metadata: nil, name: "threshold_scorer"}],
+      result.first.call(input: "x", expected: "a", output: "b")
   end
 
   # ============================================
@@ -201,7 +203,8 @@ def test_normalize_scorers_wraps_lambda
     result = factory.normalize_scorers([lam])
     assert_equal 1, result.length
     assert_kind_of Braintrust::Scorer, result.first
-    assert_equal 1.0, result.first.call(input: "x", expected: "a", output: "a")
+    assert_equal [{score: 1.0, metadata: nil, name: "scorer"}],
+      result.first.call(input: "x", expected: "a", output: "a")
   end
 
   def test_normalize_scorers_callable_class_without_name
@@ -216,7 +219,8 @@ def call(output:, expected:)
     result = factory.normalize_scorers([callable])
     assert_equal 1, result.length
     assert_equal "scorer", result.first.name
-    assert_equal 1.0, result.first.call(input: "x", expected: "a", output: "a")
+    assert_equal [{score: 1.0, metadata: nil, name: "scorer"}],
+      result.first.call(input: "x", expected: "a", output: "a")
   end
 
   # ============================================
diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb
index d6a5da62..01246d8f 100644
--- a/test/braintrust/eval/runner_test.rb
+++ b/test/braintrust/eval/runner_test.rb
@@ -979,7 +979,7 @@ def test_on_progress_receives_output_data
     assert_equal "HELLO", progress_calls.first["data"]
   end
 
-  def test_on_progress_receives_scores
+  def test_on_progress_receives_data
     scorer = Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 }
     progress_calls = []
     runner = build_simple_runner(
@@ -991,7 +991,8 @@ def test_on_progress_receives_scores
 
     runner.run
 
-    assert_equal({"exact" => 1.0}, progress_calls.first["scores"])
+    assert_equal "HELLO", progress_calls.first["data"]
+    refute progress_calls.first.key?("scores")
   end
 
   def test_on_progress_receives_error_on_task_failure
@@ -1375,10 +1376,10 @@ def test_scorer_hash_return_metadata_on_span
 
     score_span = rig.exporter.finished_spans.find { |s| s.name == "meta_scorer" }
     metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
-    assert_equal({"failure_type" => "none", "confidence" => 0.99}, metadata)
+    assert_equal({"meta_scorer" => {"failure_type" => "none", "confidence" => 0.99}}, metadata)
   end
 
-  def test_scorer_hash_without_score_key
+  def test_scorer_hash_without_score_key_raises
     rig = setup_otel_test_rig
 
     scorer = Braintrust::Scorer.new("no_score_key") { |output:|
@@ -1398,12 +1399,12 @@ def test_scorer_hash_without_score_key
     )
     result = Braintrust::Eval::Runner.new(context).run
 
-    assert result.success?
-    # nil score is not Numeric, so not collected for stats
-    assert_equal({}, result.scores)
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/score must be Numeric/, result.errors.first)
   end
 
-  def test_scorer_hash_with_nil_score
+  def test_scorer_hash_with_nil_score_raises
     rig = setup_otel_test_rig
 
     scorer = Braintrust::Scorer.new("nil_score") { |output:|
@@ -1423,13 +1424,9 @@ def test_scorer_hash_with_nil_score
     )
     result = Braintrust::Eval::Runner.new(context).run
 
-    assert result.success?
-    assert_equal({}, result.scores)
-
-    # Metadata should still be logged even with nil score
-    score_span = rig.exporter.finished_spans.find { |s| s.name == "nil_score" }
-    metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
-    assert_equal({"reason" => "could not score"}, metadata)
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/score must be Numeric/, result.errors.first)
   end
 
   def test_multiple_scorers_mixed_return_types
@@ -1460,7 +1457,7 @@ def test_multiple_scorers_mixed_return_types
     numeric_span = score_spans.find { |s| s.name == "numeric" }
 
     metadata = JSON.parse(structured_span.attributes["braintrust.metadata"])
-    assert_equal({"detail" => "partial"}, metadata)
+    assert_equal({"structured" => {"detail" => "partial"}}, metadata)
     assert_nil numeric_span.attributes["braintrust.metadata"]
   end
 
@@ -1488,7 +1485,7 @@ def test_scorer_hash_return_scores_on_span_are_extracted
     assert_equal 0.75, scores["structured"]
   end
 
-  def test_on_progress_receives_extracted_score_from_hash
+  def test_on_progress_receives_data_with_hash_scorer
     progress_calls = []
     scorer = Braintrust::Scorer.new("structured") { {score: 0.5, metadata: {x: 1}} }
     runner = build_simple_runner(
@@ -1500,7 +1497,8 @@ def test_on_progress_receives_extracted_score_from_hash
 
     runner.run
 
-    assert_equal({"structured" => 0.5}, progress_calls.first["scores"])
+    assert_equal "HELLO", progress_calls.first["data"]
+    refute progress_calls.first.key?("scores")
   end
 
   def test_scorer_no_metadata_attr_when_all_numeric
@@ -1579,7 +1577,7 @@ def test_scorer_hash_return_multiple_cases
     assert_equal({"quality" => [0.2, 0.5]}, result.scores)
   end
 
-  def test_scorer_empty_hash_return
+  def test_scorer_empty_hash_raises
     rig = setup_otel_test_rig
 
     scorer = Braintrust::Scorer.new("empty") { |output:| {} }
@@ -1597,9 +1595,201 @@ def test_scorer_empty_hash_return
     )
     result = Braintrust::Eval::Runner.new(context).run
 
+    refute result.success?
+    assert_equal 1, result.errors.length
+    assert_match(/score must be Numeric/, result.errors.first)
+  end
+
+  # ============================================
+  # Runner#run tests - multi-score (Array) return
+  # ============================================
+
+  def test_scorer_array_return_two_scores
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("llm_judge") { |output:|
+      [
+        {score: 0.9, name: "relevance"},
+        {score: 0.7, name: "factuality"}
+      ]
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
     assert result.success?
-    # Empty hash has no :score key, so score is nil and not collected
-    assert_equal({}, result.scores)
+    assert_equal({"relevance" => [0.9], "factuality" => [0.7]}, result.scores)
+  end
+
+  def test_scorer_array_return_scores_on_span
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("llm_judge") { |output:|
+      [
+        {score: 0.9, name: "relevance"},
+        {score: 0.7, name: "factuality"}
+      ]
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" }
+    scores = JSON.parse(score_span.attributes["braintrust.scores"])
+    assert_equal({"relevance" => 0.9, "factuality" => 0.7}, scores)
+  end
+
+  def test_scorer_array_return_metadata_keyed_by_score_name
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("llm_judge") { |output:|
+      [
+        {score: 0.9, name: "relevance", metadata: {reason: "on topic"}},
+        {score: 0.7, name: "factuality"}
+      ]
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" }
+    metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
+    # Only the score with metadata should appear; keyed by score name
+    assert_equal({"relevance" => {"reason" => "on topic"}}, metadata)
+  end
+
+  def test_scorer_array_return_no_metadata_attr_when_none_present
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("llm_judge") { |output:|
+      [
+        {score: 0.9, name: "relevance"},
+        {score: 0.7, name: "factuality"}
+      ]
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" }
+    assert_nil score_span.attributes["braintrust.metadata"]
+  end
+
+  def test_scorer_array_return_multiple_cases_accumulates
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("llm_judge") { |output:|
+      [
+        {score: 1.0, name: "relevance"},
+        {score: 0.5, name: "tone"}
+      ]
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "a"}, {input: "b"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"relevance" => [1.0, 1.0], "tone" => [0.5, 0.5]}, result.scores)
+  end
+
+  def test_scorer_array_return_single_numeric_unchanged
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("exact") { |output:, expected:| (output == expected) ? 1.0 : 0.0 }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello", expected: "HELLO"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"exact" => [1.0]}, result.scores)
+  end
+
+  def test_scorer_array_return_single_hash_unchanged
+    rig = setup_otel_test_rig
+
+    scorer = Braintrust::Scorer.new("quality") { |output:|
+      {score: 0.8, metadata: {reason: "good"}}
+    }
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [scorer],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    result = Braintrust::Eval::Runner.new(context).run
+
+    assert result.success?
+    assert_equal({"quality" => [0.8]}, result.scores)
+
+    score_span = rig.exporter.finished_spans.find { |s| s.name == "quality" }
+    metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
+    assert_equal({"quality" => {"reason" => "good"}}, metadata)
   end
 
   private
diff --git a/test/braintrust/eval/scorer_test.rb b/test/braintrust/eval/scorer_test.rb
index c05bcc74..1cd73885 100644
--- a/test/braintrust/eval/scorer_test.rb
+++ b/test/braintrust/eval/scorer_test.rb
@@ -39,11 +39,37 @@ def test_new_logs_deprecation_warning
     end
   end
 
+  def test_new_with_keyword_block_multi_score
+    scorer = suppress_logs {
+      Braintrust::Eval::Scorer.new("multi") do |expected:, output:|
+        [
+          {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+          {name: "nonempty", score: output.to_s.empty? ? 0.0 : 1.0}
+        ]
+      end
+    }
+    result = scorer.call(input: "x", expected: "a", output: "a")
+    assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result
+  end
+
   def test_new_with_legacy_positional_block
     scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } }
     assert_kind_of Braintrust::Scorer, scorer
-    result = scorer.call(input: "x", expected: "a", output: "a")
-    assert_equal 1.0, result
+    assert_equal [{score: 1.0, metadata: nil, name: "legacy"}],
+      scorer.call(input: "x", expected: "a", output: "a")
+  end
+
+  def test_new_with_legacy_positional_block_multi_score
+    scorer = suppress_logs {
+      Braintrust::Eval::Scorer.new("legacy_multi") do |i, e, o|
+        [
+          {name: "exact", score: (o == e) ? 1.0 : 0.0},
+          {name: "length", score: (o.length == e.length) ? 1.0 : 0.0}
+        ]
+      end
+    }
+    result = scorer.call(input: "x", expected: "hello", output: "world")
+    assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result
   end
 
   # ============================================
@@ -53,13 +79,26 @@ def test_new_with_legacy_positional_block
   def test_call_with_positional_args
     scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } }
     result = suppress_logs { scorer.call("apple", "fruit", "fruit") }
-    assert_equal 1.0, result
+    assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], result
+  end
+
+  def test_call_with_positional_args_multi_score
+    scorer = suppress_logs {
+      Braintrust::Eval::Scorer.new("legacy_multi") do |i, e, o|
+        [
+          {name: "exact", score: (o == e) ? 1.0 : 0.0},
+          {name: "nonempty", score: o.to_s.empty? ? 0.0 : 1.0}
+        ]
+      end
+    }
+    result = suppress_logs { scorer.call("x", "fruit", "fruit") }
+    assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result
   end
 
   def test_call_with_positional_args_including_metadata
     scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o, m| m[:boost] ? 1.0 : 0.0 } }
     result = suppress_logs { scorer.call("apple", "fruit", "fruit", {boost: true}) }
-    assert_equal 1.0, result
+    assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], result
   end
 
   def test_call_with_positional_args_logs_deprecation_warning
@@ -72,6 +111,51 @@ def test_call_with_positional_args_logs_deprecation_warning
   def test_call_with_keyword_args_does_not_trigger_positional_warning
     scorer = suppress_logs { Braintrust::Eval::Scorer.new("kw") { |expected:, output:| (output == expected) ? 1.0 : 0.0 } }
     result = scorer.call(input: "x", expected: "a", output: "a")
-    assert_equal 1.0, result
+    assert_equal [{score: 1.0, metadata: nil, name: "kw"}], result
+  end
+
+  # ============================================
+  # Eval.scorer module method (deprecated)
+  # ============================================
+
+  def test_eval_scorer_method_with_keyword_block
+    scorer = suppress_logs { Braintrust::Eval.scorer("kw") { |expected:, output:| (output == expected) ? 1.0 : 0.0 } }
+    assert_kind_of Braintrust::Scorer, scorer
+    assert_equal "kw", scorer.name
+    assert_equal [{score: 1.0, metadata: nil, name: "kw"}],
+      scorer.call(input: "x", expected: "a", output: "a")
+  end
+
+  def test_eval_scorer_method_with_keyword_block_multi_score
+    scorer = suppress_logs {
+      Braintrust::Eval.scorer("multi") do |expected:, output:|
+        [
+          {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+          {name: "nonempty", score: output.to_s.empty? ? 0.0 : 1.0}
+        ]
+      end
+    }
+    result = scorer.call(input: "x", expected: "a", output: "a")
+    assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result
+  end
+
+  def test_eval_scorer_method_with_legacy_positional_block
+    scorer = suppress_logs { Braintrust::Eval.scorer("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } }
+    assert_kind_of Braintrust::Scorer, scorer
+    assert_equal [{score: 1.0, metadata: nil, name: "legacy"}],
+      scorer.call(input: "x", expected: "a", output: "a")
+  end
+
+  def test_eval_scorer_method_with_legacy_positional_block_multi_score
+    scorer = suppress_logs {
+      Braintrust::Eval.scorer("legacy_multi") do |i, e, o|
+        [
+          {name: "exact", score: (o == e) ? 1.0 : 0.0},
+          {name: "length", score: (o.length == e.length) ? 1.0 : 0.0}
+        ]
+      end
+    }
+    result = scorer.call(input: "x", expected: "hello", output: "world")
+    assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result
   end
 end
diff --git a/test/braintrust/functions_test.rb b/test/braintrust/functions_test.rb
index b427f8a9..33e94ab0 100644
--- a/test/braintrust/functions_test.rb
+++ b/test/braintrust/functions_test.rb
@@ -312,8 +312,7 @@ def test_scorer_parses_structured_response
 
       result = scorer.call(input: "hello", expected: "HELLO", output: "HELLO", metadata: {})
 
-      assert_kind_of Numeric, result
-      assert_equal 1.0, result
+      assert_equal [{score: 1.0, metadata: nil, name: "test-ruby-sdk-scorer-structured"}], result
     end
   end
 
@@ -343,8 +342,7 @@ def test_scorer_parses_code_string_response
 
       result = scorer.call(input: "test", expected: "test", output: "test", metadata: {})
 
-      assert_kind_of Numeric, result
-      assert_equal 0.45, result
+      assert_equal [{score: 0.45, metadata: nil, name: "test-ruby-sdk-code-scorer"}], result
     end
   end
 
@@ -355,45 +353,44 @@ def test_scorer_parses_code_string_response
   def test_remote_scorer_handles_integer_response
     scorer = scorer_with_stubbed_invoke(1)
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 1.0, result
-    assert_instance_of Float, result
+    assert_equal [{score: 1.0, metadata: nil, name: "test-scorer"}], result
   end
 
   def test_remote_scorer_handles_float_response
     scorer = scorer_with_stubbed_invoke(0.75)
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 0.75, result
-    assert_instance_of Float, result
+    assert_equal [{score: 0.75, metadata: nil, name: "test-scorer"}], result
   end
 
   def test_remote_scorer_handles_boolean_true_response
     scorer = scorer_with_stubbed_invoke(true)
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 1.0, result
+    assert_equal [{score: 1.0, metadata: nil, name: "test-scorer"}], result
   end
 
   def test_remote_scorer_handles_boolean_false_response
     scorer = scorer_with_stubbed_invoke(false)
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 0.0, result
+    assert_equal [{score: 0.0, metadata: nil, name: "test-scorer"}], result
   end
 
-  def test_remote_scorer_handles_nil_response
+  def test_remote_scorer_raises_for_nil_response
     scorer = scorer_with_stubbed_invoke(nil)
-    result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_nil result
+    assert_raises(ArgumentError) do
+      scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
+    end
   end
 
   def test_remote_scorer_handles_hash_with_score_key
     scorer = scorer_with_stubbed_invoke({"name" => "my_scorer", "score" => 0.9, "metadata" => {}})
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 0.9, result
+    assert_equal [{score: 0.9, metadata: nil, name: "test-scorer"}], result
   end
 
   def test_remote_scorer_handles_string_numeric_response
     scorer = scorer_with_stubbed_invoke("0.85")
     result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {})
-    assert_equal 0.85, result
+    assert_equal [{score: 0.85, metadata: nil, name: "test-scorer"}], result
   end
 
   def test_remote_scorer_raises_for_hash_without_score_key
diff --git a/test/braintrust/internal/callable_test.rb b/test/braintrust/internal/callable_test.rb
index a290eb28..63a63dd2 100644
--- a/test/braintrust/internal/callable_test.rb
+++ b/test/braintrust/internal/callable_test.rb
@@ -10,16 +10,18 @@ class Braintrust::Internal::CallableTest < Minitest::Test
   # ============================================
 
   def test_keyword_block_receives_only_declared_kwargs
+    received = nil
     scorer = Braintrust::Scorer.new("subset") do |output:, expected:|
-      {output: output, expected: expected}
+      received = {output: output, expected: expected}
+      1.0
     end
 
-    result = scorer.call(
+    scorer.call(
       input: "apple", expected: "fruit", output: "fruit",
       metadata: {key: "val"}, tags: ["t1"]
     )
 
-    assert_equal({output: "fruit", expected: "fruit"}, result)
+    assert_equal({output: "fruit", expected: "fruit"}, received)
   end
 
   def test_keyword_block_with_single_kwarg
@@ -76,7 +78,8 @@ def test_positional_scorer_block_arity_3
     suppress_logs do
       scorer = Braintrust::Scorer.new("pos3") { |i, e, o| (o == e) ? 1.0 : 0.0 }
 
-      assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "b", metadata: {})
+      assert_equal [{score: 1.0, metadata: nil, name: "pos3"}],
+        scorer.call(input: "a", expected: "b", output: "b", metadata: {})
     end
   end
 
@@ -84,7 +87,8 @@ def test_positional_scorer_block_arity_4
     suppress_logs do
       scorer = Braintrust::Scorer.new("pos4") { |i, e, o, m| m[:threshold] }
 
-      assert_equal 0.9, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.9})
+      assert_equal [{score: 0.9, metadata: nil, name: "pos4"}],
+        scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.9})
     end
   end
 
@@ -95,7 +99,8 @@ def test_positional_scorer_block_arity_4
   def test_zero_arity_block_passes_through
     scorer = Braintrust::Scorer.new("zero") { 42 }
 
-    assert_equal 42, scorer.call(input: "a", expected: "b", output: "c")
+    assert_equal [{score: 42, metadata: nil, name: "zero"}],
+      scorer.call(input: "a", expected: "b", output: "c")
   end
 
   # ============================================
@@ -147,7 +152,8 @@ def call(output:, expected:)
 
     scorer = klass.new
     # KeywordFilter strips extra kwargs (input:, metadata:, tags:) before calling user's #call
-    assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "b", metadata: {}, tags: [])
+    assert_equal [{score: 1.0, metadata: nil, name: "scorer"}],
+      scorer.call(input: "a", expected: "b", output: "b", metadata: {}, tags: [])
   end
 
   # ============================================
@@ -176,6 +182,108 @@ def test_invalid_positional_arity_raises_for_scorer
   end
 end
 
+# Direct unit tests for ResultNormalizer prepend behavior.
+class Braintrust::Scorer::Callable::ResultNormalizerTest < Minitest::Test
+  # Build a minimal class with ResultNormalizer prepended and a controllable #call return.
+  def make_scorer(name, &block)
+    klass = Class.new do
+      prepend Braintrust::Scorer::Callable::ResultNormalizer
+
+      define_method(:name) { name }
+      define_method(:call) { |**| instance_exec(&block) }
+    end
+    klass.new
+  end
+
+  # ============================================
+  # Scalar return (else branch) and invalid-score errors
+  # ============================================
+
+  def test_scalar_float_wrapped
+    scorer = make_scorer("s") { 0.9 }
+    assert_equal [{score: 0.9, metadata: nil, name: "s"}], scorer.call
+  end
+
+  def test_scalar_integer_wrapped
+    scorer = make_scorer("s") { 1 }
+    assert_equal [{score: 1, metadata: nil, name: "s"}], scorer.call
+  end
+
+  def test_scalar_nil_raises
+    scorer = make_scorer("s") { nil }
+    assert_raises(ArgumentError) { scorer.call }
+  end
+
+  def test_scalar_boolean_raises
+    scorer = make_scorer("s") { true }
+    assert_raises(ArgumentError) { scorer.call }
+  end
+
+  def test_hash_with_nil_score_raises
+    scorer = make_scorer("s") { {score: nil} }
+    assert_raises(ArgumentError) { scorer.call }
+  end
+
+  def test_array_item_with_nil_score_raises
+    scorer = make_scorer("s") { [{name: "a", score: 1.0}, {name: "b", score: nil}] }
+    assert_raises(ArgumentError) { scorer.call }
+  end
+
+  # ============================================
+  # Hash return
+  # ============================================
+
+  def test_hash_without_name_gets_scorer_name
+    scorer = make_scorer("my_scorer") { {score: 0.5} }
+    assert_equal [{score: 0.5, name: "my_scorer"}], scorer.call
+  end
+
+  def test_hash_with_name_preserves_name
+    scorer = make_scorer("my_scorer") { {score: 0.5, name: "override"} }
+    assert_equal [{score: 0.5, name: "override"}], scorer.call
+  end
+
+  def test_hash_with_metadata_preserved
+    scorer = make_scorer("s") { {score: 0.8, metadata: {reason: "close"}} }
+    assert_equal [{score: 0.8, metadata: {reason: "close"}, name: "s"}], scorer.call
+  end
+
+  # ============================================
+  # Array return
+  # ============================================
+
+  def test_array_items_passed_through
+    scorer = make_scorer("s") { [{name: "a", score: 1.0}, {name: "b", score: 0.5}] }
+    assert_equal [{name: "a", score: 1.0}, {name: "b", score: 0.5}], scorer.call
+  end
+
+  def test_array_items_without_name_get_scorer_name
+    scorer = make_scorer("my_scorer") { [{score: 1.0}, {score: 0.5}] }
+    assert_equal [{score: 1.0, name: "my_scorer"}, {score: 0.5, name: "my_scorer"}], scorer.call
+  end
+
+  def test_array_items_mixed_name_presence
+    scorer = make_scorer("fallback") { [{name: "explicit", score: 1.0}, {score: 0.5}] }
+    assert_equal [{name: "explicit", score: 1.0}, {score: 0.5, name: "fallback"}], scorer.call
+  end
+
+  def test_empty_array_returns_empty_array
+    scorer = make_scorer("s") { [] }
+    assert_equal [], scorer.call
+  end
+
+  # ============================================
+  # Always returns Array
+  # ============================================
+
+  def test_result_is_always_array
+    [0.5, {score: 1.0}, [{score: 0.9}]].each do |raw|
+      scorer = make_scorer("s") { raw }
+      assert_instance_of Array, scorer.call
+    end
+  end
+end
+
 # Direct unit tests for KeywordFilter class methods and instance behavior.
 class Braintrust::Internal::Callable::KeywordFilterTest < Minitest::Test
   # ============================================
diff --git a/test/braintrust/scorer_test.rb b/test/braintrust/scorer_test.rb
index d9c522a8..04765c9c 100644
--- a/test/braintrust/scorer_test.rb
+++ b/test/braintrust/scorer_test.rb
@@ -14,8 +14,10 @@ def test_scorer_with_kwargs_block
     end
 
     assert_equal "exact_match", scorer.name
-    assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit")
-    assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong")
+    assert_equal [{score: 1.0, metadata: nil, name: "exact_match"}],
+      scorer.call(input: "apple", expected: "fruit", output: "fruit")
+    assert_equal [{score: 0.0, metadata: nil, name: "exact_match"}],
+      scorer.call(input: "apple", expected: "fruit", output: "wrong")
   end
 
   def test_scorer_with_subset_kwargs_filters_extra_keys
@@ -25,8 +27,10 @@ def test_scorer_with_subset_kwargs_filters_extra_keys
     end
 
     # Calling with extra kwargs (input:, metadata:, tags:) should not raise
-    assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {}, tags: ["t1"])
-    assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong", metadata: {}, tags: nil)
+    assert_equal [{score: 1.0, metadata: nil, name: "subset"}],
+      scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {}, tags: ["t1"])
+    assert_equal [{score: 0.0, metadata: nil, name: "subset"}],
+      scorer.call(input: "apple", expected: "fruit", output: "wrong", metadata: {}, tags: nil)
   end
 
   def test_scorer_with_legacy_3_param_block
@@ -36,7 +40,22 @@ def test_scorer_with_legacy_3_param_block
       end
 
       assert_equal "exact_match", scorer.name
-      assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {threshold: 0.5})
+      assert_equal [{score: 1.0, metadata: nil, name: "exact_match"}],
+        scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {threshold: 0.5})
+    end
+  end
+
+  def test_scorer_with_legacy_3_param_block_multi_score
+    suppress_logs do
+      scorer = Braintrust::Scorer.new("legacy3") do |input, expected, output|
+        [
+          {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+          {name: "length", score: (output.length == expected.length) ? 1.0 : 0.0}
+        ]
+      end
+
+      result = scorer.call(input: "x", expected: "fruit", output: "fruit")
+      assert_equal [{name: "exact", score: 1.0}, {name: "length", score: 1.0}], result
     end
   end
 
@@ -49,19 +68,52 @@ def test_scorer_with_legacy_4_param_block
       end
 
       assert_equal "threshold_match", scorer.name
-      assert_equal 0.0, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.95})
-      assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.85})
+      assert_equal [{score: 0.0, metadata: nil, name: "threshold_match"}],
+        scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.95})
+      assert_equal [{score: 1.0, metadata: nil, name: "threshold_match"}],
+        scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.85})
+    end
+  end
+
+  def test_scorer_with_legacy_4_param_block_multi_score
+    suppress_logs do
+      scorer = Braintrust::Scorer.new("legacy4") do |input, expected, output, metadata|
+        threshold = metadata[:threshold] || 0.8
+        [
+          {name: "match", score: (output == expected) ? 1.0 : 0.0},
+          {name: "threshold_met", score: (threshold < 0.9) ? 1.0 : 0.0}
+        ]
+      end
+
+      result = scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.5})
+      assert_equal [{name: "match", score: 1.0}, {name: "threshold_met", score: 1.0}], result
     end
   end
 
+  def test_scorer_with_keyword_lambda_multi_score
+    # Bare lambda, wrapped the way Factory's Proc branch does it (Scorer.new(&scorer))
+    lam = ->(expected:, output:) {
+      [
+        {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+        {name: "length", score: (output.length == expected.length) ? 1.0 : 0.0}
+      ]
+    }
+    scorer = Braintrust::Scorer.new(&lam)
+
+    result = scorer.call(input: "x", expected: "hello", output: "world")
+    assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result
+  end
+
   def test_scorer_return_float
     scorer = Braintrust::Scorer.new("float_scorer") { |**| 0.75 }
-    assert_equal 0.75, scorer.call(input: "a", expected: "b", output: "c")
+    assert_equal [{score: 0.75, metadata: nil, name: "float_scorer"}],
+      scorer.call(input: "a", expected: "b", output: "c")
   end
 
   def test_scorer_return_hash
     scorer = Braintrust::Scorer.new("hash_scorer") { |**| {name: "custom_name", score: 0.85} }
-    assert_equal({name: "custom_name", score: 0.85}, scorer.call(input: "a", expected: "b", output: "c"))
+    assert_equal [{name: "custom_name", score: 0.85}],
+      scorer.call(input: "a", expected: "b", output: "c")
   end
 
   def test_scorer_return_array
@@ -131,8 +183,34 @@ def call(output:, expected:)
     scorer = klass.new
     assert_kind_of Braintrust::Scorer, scorer
 
-    assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit")
-    assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong")
+    assert_equal [{score: 1.0, metadata: nil, name: "scorer"}],
+      scorer.call(input: "apple", expected: "fruit", output: "fruit")
+    assert_equal [{score: 0.0, metadata: nil, name: "scorer"}],
+      scorer.call(input: "apple", expected: "fruit", output: "wrong")
+  end
+
+  def test_subclass_with_call_override_multi_score
+    klass = Class.new do
+      include Braintrust::Scorer
+
+      def name
+        "multi_subclass"
+      end
+
+      def call(output:, expected:)
+        [
+          {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+          {name: "nonempty", score: output.to_s.empty? ? 0.0 : 1.0}
+        ]
+      end
+    end
+
+    scorer = klass.new
+    result = scorer.call(input: "x", expected: "fruit", output: "fruit")
+    assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result
+
+    result2 = scorer.call(input: "x", expected: "fruit", output: "wrong")
+    assert_equal [{name: "exact", score: 0.0}, {name: "nonempty", score: 1.0}], result2
   end
 
   def test_subclass_with_name_override
@@ -187,15 +265,10 @@ def call(output:, expected:, metadata:)
 
     scorer = klass.new
 
-    assert_equal 1.0, scorer.call(
-      input: "a", expected: "b", output: "b",
-      metadata: {threshold: 0.9}
-    )
-
-    assert_equal 0.5, scorer.call(
-      input: "a", expected: "b", output: "c",
-      metadata: {threshold: 0.3}
-    )
+    assert_equal [{score: 1.0, metadata: nil, name: "threshold_scorer"}],
+      scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.9})
+    assert_equal [{score: 0.5, metadata: nil, name: "threshold_scorer"}],
+      scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.3})
   end
 
   # ============================================
@@ -222,7 +295,32 @@ def call(input, expected, output)
       assert_equal "legacy_scorer", scorer.name
 
       # Arity 3 block gets auto-wrapped to kwargs
-      assert_equal 1.0, scorer.call(input: "test", expected: "HELLO", output: "hello")
+      assert_equal [{score: 1.0, metadata: nil, name: "legacy_scorer"}],
+        scorer.call(input: "test", expected: "HELLO", output: "hello")
+    end
+  end
+
+  def test_legacy_callable_class_multi_score_normalized_via_factory
+    suppress_logs do
+      callable = Class.new do
+        def name
+          "legacy_multi"
+        end
+
+        def call(input, expected, output)
+          [
+            {name: "exact", score: (output == expected) ? 1.0 : 0.0},
+            {name: "case_insensitive", score: (output.downcase == expected.downcase) ? 1.0 : 0.0}
+          ]
+        end
+      end.new
+
+      name = callable.respond_to?(:name) ? callable.name : nil
+      scorer = Braintrust::Scorer.new(name, &callable.method(:call))
+
+      assert_equal "legacy_multi", scorer.name
+      result = scorer.call(input: "test", expected: "HELLO", output: "hello")
+      assert_equal [{name: "exact", score: 0.0}, {name: "case_insensitive", score: 1.0}], result
     end
   end
 
@@ -249,10 +347,8 @@ def call(input, expected, output, metadata = {})
       assert_equal "legacy_with_meta", scorer.name
 
       # Arity 4 block gets auto-wrapped to kwargs
-      assert_equal 1.0, scorer.call(
-        input: "a", expected: "b", output: "b",
-        metadata: {threshold: 0.9}
-      )
+      assert_equal [{score: 1.0, metadata: nil, name: "legacy_with_meta"}],
+        scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.9})
     end
   end