diff --git a/README.md b/README.md index d5ca0271..f41cd421 100644 --- a/README.md +++ b/README.md @@ -259,6 +259,8 @@ Braintrust::Eval.run( ) ``` +See [eval.rb](./examples/eval.rb) for a full example. + ### Datasets Use test cases from a Braintrust dataset: @@ -287,6 +289,8 @@ Braintrust::Eval.run( ) ``` +See [dataset.rb](./examples/eval/dataset.rb) for a full example. + ### Scorers Use scoring functions defined in Braintrust: @@ -315,6 +319,8 @@ Braintrust::Eval.run( ) ``` +See [remote_functions.rb](./examples/eval/remote_functions.rb) for a full example. + #### Scorer metadata Scorers can return a Hash with `:score` and `:metadata` to attach structured context to the score. The metadata is logged on the scorer's span and visible in the Braintrust UI for debugging and filtering: @@ -332,6 +338,27 @@ end See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example. +#### Multiple scores from one scorer + +When several scores can be computed together (e.g. in one LLM call), you can return an `Array` of score `Hash`es instead of a single value. Each metric appears as a separate score column in the Braintrust UI: + +```ruby +Braintrust::Scorer.new("summary_quality") do |output:, expected:| + words = output.downcase.split + key_terms = expected[:key_terms] + covered = key_terms.count { |t| words.include?(t) } + + [ + {name: "coverage", score: covered.to_f / key_terms.size, metadata: {missing: key_terms - words}}, + {name: "conciseness", score: words.size <= expected[:max_words] ? 1.0 : 0.0} + ] +end +``` + +`name` and `score` are required; `metadata` is optional. + +See [multi_score.rb](./examples/eval/multi_score.rb) for a full example. + #### Trace scoring Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. 
This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread: @@ -361,7 +388,7 @@ Braintrust::Eval.run( ) ``` -See examples: [eval.rb](./examples/eval.rb), [dataset.rb](./examples/eval/dataset.rb), [remote_functions.rb](./examples/eval/remote_functions.rb), [trace_scoring.rb](./examples/eval/trace_scoring.rb) +See [trace_scoring.rb](./examples/eval/trace_scoring.rb) for a full example. ### Dev Server diff --git a/examples/eval/multi_score.rb b/examples/eval/multi_score.rb new file mode 100644 index 00000000..438a00f3 --- /dev/null +++ b/examples/eval/multi_score.rb @@ -0,0 +1,132 @@ +#!/usr/bin/env ruby +# frozen_string_literal: true + +require "bundler/setup" +require "braintrust" +require "opentelemetry/sdk" + +# Example: Multi-Score Scorers +# +# A scorer can return an Array of score hashes to emit multiple named metrics +# from a single scorer call. Each hash must have a :name and :score key; an +# optional :metadata key attaches structured context to that metric. +# +# This is useful when several dimensions of quality (e.g. correctness, +# completeness, format) can be computed together — sharing one inference call +# or one pass over the output — rather than running separate scorers. +# +# Two patterns are shown: +# +# 1. Block-based (Braintrust::Scorer.new): +# Pass a block that returns an Array. Good for concise, one-off scorers. +# +# 2. Class-based (include Braintrust::Scorer): +# Define a class with a #call method. Good for reusable scorers that +# share helper logic across multiple metrics. +# +# Usage: +# bundle exec ruby examples/eval/multi_score.rb + +Braintrust.init + +# --------------------------------------------------------------------------- +# Task: summarise a list of facts +# --------------------------------------------------------------------------- +FACTS = { + "The sky is blue and clouds are white." 
=> { + key_terms: %w[sky blue clouds white], + max_words: 10 + }, + "Ruby was created by Matz in 1995." => { + key_terms: %w[ruby matz 1995], + max_words: 8 + }, + "The Pacific Ocean is the largest ocean on Earth." => { + key_terms: %w[pacific largest ocean earth], + max_words: 10 + } +} + +# Simulated summariser (replace with a real LLM call in production) +def summarise(text) + # Naive: keep only the first 8 words and lowercase them + text.split.first(8).join(" ").downcase +end + +# --------------------------------------------------------------------------- +# Pattern 1: block-based multi-score scorer +# +# Returns three metrics in one pass: +# - coverage: fraction of key terms present in the summary +# - conciseness: 1.0 if within the word limit, else 0.0 +# - lowercase: 1.0 if the summary is fully lowercased +# --------------------------------------------------------------------------- +summary_quality = Braintrust::Scorer.new("summary_quality") do |output:, expected:| + words = output.to_s.downcase.scan(/\w+/) # tokenise on word characters so trailing punctuation ("white.") still matches a key term + key_terms = expected[:key_terms] + max_words = expected[:max_words] + + covered = key_terms.count { |t| words.include?(t) } + coverage_score = key_terms.empty? ? 1.0 : covered.to_f / key_terms.size + + [ + { + name: "coverage", + score: coverage_score, + metadata: {covered: covered, total: key_terms.size, missing: key_terms - words} + }, + { + name: "conciseness", + score: (words.size <= max_words) ? 1.0 : 0.0, + metadata: {word_count: words.size, limit: max_words} + }, + { + name: "lowercase", + score: (output.to_s == output.to_s.downcase) ? 1.0 : 0.0 + } + ] +end + +# --------------------------------------------------------------------------- +# Pattern 2: class-based multi-score scorer +# +# Include Braintrust::Scorer and define #call. The class name is used as the +# scorer name by default; override #name to customise it. 
+# +# Returns two metrics: +# - ends_with_period: checks punctuation +# - no_first_person: checks for avoided first-person pronouns +# --------------------------------------------------------------------------- +class StyleChecker + include Braintrust::Scorer + + FIRST_PERSON = %w[i me my myself we us our].freeze + + def call(output:, **) + text = output.to_s + words = text.downcase.split(/\W+/) + fp_words = words & FIRST_PERSON + + [ + { + name: "ends_with_period", + score: text.strip.end_with?(".") ? 1.0 : 0.0 + }, + { + name: "no_first_person", + score: fp_words.empty? ? 1.0 : 0.0, + metadata: {found: fp_words} + } + ] + end +end + +Braintrust::Eval.run( + project: "ruby-sdk-examples", + experiment: "multi-score-example", + cases: FACTS.map { |text, expected| {input: text, expected: expected} }, + task: ->(input:) { summarise(input) }, + scorers: [summary_quality, StyleChecker.new] +) + +OpenTelemetry.tracer_provider.shutdown diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index 76590280..fb020bee 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -111,9 +111,8 @@ def run_eval_case(case_context, errors) case_context.trace = build_trace(eval_span) # Run scorers - case_scores = nil begin - case_scores = run_scorers(case_context) + run_scorers(case_context) rescue => e # Error already recorded on score span, set eval span status eval_span.status = OpenTelemetry::Trace::Status.error(e.message) @@ -123,7 +122,7 @@ def run_eval_case(case_context, errors) # Set output after task completes set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output}) - report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {}) + report_progress(eval_span, case_context, data: case_context.output) end ensure eval_span&.finish @@ -157,7 +156,6 @@ def run_task(case_context) # Run scorers with OpenTelemetry tracing. 
# Creates one span per scorer, each a direct child of the current (eval) span. # @param case_context [CaseContext] The per-case context (output must be populated) - # @return [Hash] Scores hash { scorer_name => score_value } def run_scorers(case_context) scorer_kwargs = { input: case_context.input, @@ -173,47 +171,41 @@ def run_scorers(case_context) metadata: case_context.metadata || {} } - scores = {} scorer_error = nil eval_context.scorers.each do |scorer| - run_scorer(scorer, scorer_kwargs, scorer_input, scores) + collect_scores(run_scorer(scorer, scorer_kwargs, scorer_input)) rescue => e scorer_error ||= e end raise scorer_error if scorer_error - - scores end # Run a single scorer inside its own span. # @param scorer [Scorer] The scorer to run # @param scorer_kwargs [Hash] Keyword arguments for the scorer # @param scorer_input [Hash] Input to log on the span - # @param scores [Hash] Accumulator for score results - def run_scorer(scorer, scorer_kwargs, scorer_input, scores) + # @return [Array] Raw score results from the scorer + def run_scorer(scorer, scorer_kwargs, scorer_input) tracer.in_span(scorer.name) do |score_span| score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name)) set_json_attr(score_span, "braintrust.input_json", scorer_input) - raw_result = scorer.call(**scorer_kwargs) - normalized = normalize_score_result(raw_result, scorer.name) + score_results = scorer.call(**scorer_kwargs) - score_name = normalized[:name] - scores[score_name] = normalized[:score] + scorer_scores = {} + scorer_metadata = {} + score_results.each do |s| + scorer_scores[s[:name]] = s[:score] + scorer_metadata[s[:name]] = s[:metadata] if s[:metadata].is_a?(Hash) + end - scorer_scores = {score_name => normalized[:score]} set_json_attr(score_span, "braintrust.output_json", scorer_scores) set_json_attr(score_span, 
"braintrust.scores", scorer_scores) + set_json_attr(score_span, "braintrust.metadata", scorer_metadata) unless scorer_metadata.empty? - # Set scorer metadata on its span - if normalized[:metadata].is_a?(Hash) - set_json_attr(score_span, "braintrust.metadata", normalized[:metadata]) - end - - # Collect raw score for summary (thread-safe) - collect_score(score_name, normalized[:score]) + score_results rescue => e record_span_error(score_span, e, "ScorerError") raise @@ -302,28 +294,11 @@ def set_json_attr(span, key, value) span.set_attribute(key, JSON.dump(value)) end - # Collect a single score value for summary calculation - # @param name [String] Scorer name - # @param value [Object] Score value (only Numeric values are collected) - def collect_score(name, value) - return unless value.is_a?(Numeric) - + # Collect score results into the summary accumulator (thread-safe). + # @param score_results [Array] Score results from a scorer + def collect_scores(score_results) @score_mutex.synchronize do - (@scores[name] ||= []) << value - end - end - - # Normalize a scorer return value into its component parts. - # Scorers may return a raw Numeric or a Hash with :score, :metadata, and :name keys. - # @param result [Object] Raw scorer return value - # @param default_name [String] Scorer name to use if not overridden - # @return [Hash] Normalized hash with :score, :metadata, :name keys - def normalize_score_result(result, default_name) - if result.is_a?(Hash) - result[:name] ||= default_name - result - else - {score: result, metadata: nil, name: default_name} + score_results.each { |s| (@scores[s[:name]] ||= []) << s[:score] } end end end diff --git a/lib/braintrust/scorer.rb b/lib/braintrust/scorer.rb index d13ffab6..5c6b13b1 100644 --- a/lib/braintrust/scorer.rb +++ b/lib/braintrust/scorer.rb @@ -40,12 +40,52 @@ def self.new(name = nil, &block) Block.new(name: name || DEFAULT_NAME, &block) end - # Included into classes that +include Scorer+. 
Prepends KeywordFilter - # so #call receives only its declared kwargs, and provides a default #name. + # Included into classes that +include Scorer+. Prepends KeywordFilter and + # ResultNormalizer so #call receives only declared kwargs and always returns + # Array. Also provides a default #name and #call_parameters. module Callable + # Normalizes the raw return value of #call into Array. + # Nested inside Callable because it depends on #name which Callable provides. + module ResultNormalizer + # @return [Array] normalized score hashes with :score, :metadata, :name keys + def call(**kwargs) + normalize_score_result(super) + end + + private + + # @param result [Numeric, Hash, Array] raw return value from #call + # @return [Array] one or more score hashes with :score, :metadata, :name keys + # @raise [ArgumentError] if any score value is not Numeric, or an Array element is not a Hash + def normalize_score_result(result) + case result + when Array then result.map { |item| normalize_score_item(item) } + when Hash then [normalize_score_item(result)] + else + raise ArgumentError, "#{name}: score must be Numeric, got #{result.inspect}" unless result.is_a?(Numeric) + [{score: result, metadata: nil, name: name}] + end + end + + # Validates one score hash and returns a copy with a missing :name filled in from the scorer (the scorer's own hash, which may be frozen or shared, is never mutated). + # @param item [Hash] a score hash with at least a :score key + # @return [Hash] a copy of the hash with :name set + # @raise [ArgumentError] if item is not a Hash or :score is not Numeric + def normalize_score_item(item) + raise ArgumentError, "#{name}: each score must be a Hash, got #{item.inspect}" unless item.is_a?(Hash) + raise ArgumentError, "#{item[:name] || name}: score must be Numeric, got #{item[:score].inspect}" unless item[:score].is_a?(Numeric) + item.merge(name: item[:name] || name) + end + end + + # Infrastructure modules prepended onto every scorer class. + # Used both to set up the ancestor chain and to skip past them in + # #call_parameters so KeywordFilter sees the real call signature. 
+ PREPENDED = [Internal::Callable::KeywordFilter, ResultNormalizer].freeze + # @param base [Class] the class including Callable def self.included(base) - base.prepend(Internal::Callable::KeywordFilter) + PREPENDED.each { |mod| base.prepend(mod) } end # Default name derived from the class name (e.g. FuzzyMatch -> "fuzzy_match"). @@ -55,6 +95,17 @@ def name return Scorer::DEFAULT_NAME unless klass klass.gsub(/([a-z])([A-Z])/, '\1_\2').downcase end + + # Provides KeywordFilter with the actual call signature of the subclass. + # Walks past PREPENDED modules in the ancestor chain so that user-defined + # #call keyword params are correctly introspected. + # Block overrides this to point directly at @block.parameters. + # @return [Array] parameter list + def call_parameters + meth = method(:call) + meth = meth.super_method while meth.super_method && PREPENDED.include?(meth.owner) + meth.parameters + end end # Block-based scorer. Stores a Proc and delegates #call to it. @@ -75,7 +126,7 @@ def initialize(name: DEFAULT_NAME, &block) end # @param kwargs [Hash] keyword arguments (filtered by KeywordFilter) - # @return [Float, Hash, Array] score result + # @return [Array] normalized score results def call(**kwargs) @block.call(**kwargs) end diff --git a/test/braintrust/eval/context_test.rb b/test/braintrust/eval/context_test.rb index cfe52b28..ac4b1128 100644 --- a/test/braintrust/eval/context_test.rb +++ b/test/braintrust/eval/context_test.rb @@ -99,7 +99,8 @@ def test_normalize_scorers_wraps_lambda_with_kwargs result = factory.normalize_scorers([lam]) assert_equal 1, result.length assert_kind_of Braintrust::Scorer, result.first - assert_equal 1.0, result.first.call(input: "x", expected: "YES", output: "YES") + assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], + result.first.call(input: "x", expected: "YES", output: "YES") end def test_normalize_scorers_wraps_callable_class_with_kwargs @@ -122,7 +123,8 @@ def call(expected:, output:) result = 
factory.normalize_scorers([callable]) assert_equal 1, result.length assert_equal "threshold_scorer", result.first.name - assert_equal 0.5, result.first.call(input: "x", expected: "a", output: "b") + assert_equal [{score: 0.5, metadata: nil, name: "threshold_scorer"}], + result.first.call(input: "x", expected: "a", output: "b") end # ============================================ @@ -201,7 +203,8 @@ def test_normalize_scorers_wraps_lambda result = factory.normalize_scorers([lam]) assert_equal 1, result.length assert_kind_of Braintrust::Scorer, result.first - assert_equal 1.0, result.first.call(input: "x", expected: "a", output: "a") + assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], + result.first.call(input: "x", expected: "a", output: "a") end def test_normalize_scorers_callable_class_without_name @@ -216,7 +219,8 @@ def call(output:, expected:) result = factory.normalize_scorers([callable]) assert_equal 1, result.length assert_equal "scorer", result.first.name - assert_equal 1.0, result.first.call(input: "x", expected: "a", output: "a") + assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], + result.first.call(input: "x", expected: "a", output: "a") end # ============================================ diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb index d6a5da62..01246d8f 100644 --- a/test/braintrust/eval/runner_test.rb +++ b/test/braintrust/eval/runner_test.rb @@ -979,7 +979,7 @@ def test_on_progress_receives_output_data assert_equal "HELLO", progress_calls.first["data"] end - def test_on_progress_receives_scores + def test_on_progress_receives_data scorer = Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 
1.0 : 0.0 } progress_calls = [] runner = build_simple_runner( @@ -991,7 +991,8 @@ def test_on_progress_receives_scores runner.run - assert_equal({"exact" => 1.0}, progress_calls.first["scores"]) + assert_equal "HELLO", progress_calls.first["data"] + refute progress_calls.first.key?("scores") end def test_on_progress_receives_error_on_task_failure @@ -1375,10 +1376,10 @@ def test_scorer_hash_return_metadata_on_span score_span = rig.exporter.finished_spans.find { |s| s.name == "meta_scorer" } metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) - assert_equal({"failure_type" => "none", "confidence" => 0.99}, metadata) + assert_equal({"meta_scorer" => {"failure_type" => "none", "confidence" => 0.99}}, metadata) end - def test_scorer_hash_without_score_key + def test_scorer_hash_without_score_key_raises rig = setup_otel_test_rig scorer = Braintrust::Scorer.new("no_score_key") { |output:| @@ -1398,12 +1399,12 @@ def test_scorer_hash_without_score_key ) result = Braintrust::Eval::Runner.new(context).run - assert result.success? - # nil score is not Numeric, so not collected for stats - assert_equal({}, result.scores) + refute result.success? + assert_equal 1, result.errors.length + assert_match(/score must be Numeric/, result.errors.first) end - def test_scorer_hash_with_nil_score + def test_scorer_hash_with_nil_score_raises rig = setup_otel_test_rig scorer = Braintrust::Scorer.new("nil_score") { |output:| @@ -1423,13 +1424,9 @@ def test_scorer_hash_with_nil_score ) result = Braintrust::Eval::Runner.new(context).run - assert result.success? - assert_equal({}, result.scores) - - # Metadata should still be logged even with nil score - score_span = rig.exporter.finished_spans.find { |s| s.name == "nil_score" } - metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) - assert_equal({"reason" => "could not score"}, metadata) + refute result.success? 
+ assert_equal 1, result.errors.length + assert_match(/score must be Numeric/, result.errors.first) end def test_multiple_scorers_mixed_return_types @@ -1460,7 +1457,7 @@ def test_multiple_scorers_mixed_return_types numeric_span = score_spans.find { |s| s.name == "numeric" } metadata = JSON.parse(structured_span.attributes["braintrust.metadata"]) - assert_equal({"detail" => "partial"}, metadata) + assert_equal({"structured" => {"detail" => "partial"}}, metadata) assert_nil numeric_span.attributes["braintrust.metadata"] end @@ -1488,7 +1485,7 @@ def test_scorer_hash_return_scores_on_span_are_extracted assert_equal 0.75, scores["structured"] end - def test_on_progress_receives_extracted_score_from_hash + def test_on_progress_receives_data_with_hash_scorer progress_calls = [] scorer = Braintrust::Scorer.new("structured") { {score: 0.5, metadata: {x: 1}} } runner = build_simple_runner( @@ -1500,7 +1497,8 @@ def test_on_progress_receives_extracted_score_from_hash runner.run - assert_equal({"structured" => 0.5}, progress_calls.first["scores"]) + assert_equal "HELLO", progress_calls.first["data"] + refute progress_calls.first.key?("scores") end def test_scorer_no_metadata_attr_when_all_numeric @@ -1579,7 +1577,7 @@ def test_scorer_hash_return_multiple_cases assert_equal({"quality" => [0.2, 0.5]}, result.scores) end - def test_scorer_empty_hash_return + def test_scorer_empty_hash_raises rig = setup_otel_test_rig scorer = Braintrust::Scorer.new("empty") { |output:| {} } @@ -1597,9 +1595,201 @@ def test_scorer_empty_hash_return ) result = Braintrust::Eval::Runner.new(context).run + refute result.success? 
+ assert_equal 1, result.errors.length + assert_match(/score must be Numeric/, result.errors.first) + end + + # ============================================ + # Runner#run tests - multi-score (Array) return + # ============================================ + + def test_scorer_array_return_two_scores + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("llm_judge") { |output:| + [ + {score: 0.9, name: "relevance"}, + {score: 0.7, name: "factuality"} + ] + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + assert result.success? - # Empty hash has no :score key, so score is nil and not collected - assert_equal({}, result.scores) + assert_equal({"relevance" => [0.9], "factuality" => [0.7]}, result.scores) + end + + def test_scorer_array_return_scores_on_span + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("llm_judge") { |output:| + [ + {score: 0.9, name: "relevance"}, + {score: 0.7, name: "factuality"} + ] + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + Braintrust::Eval::Runner.new(context).run + + score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" } + scores = JSON.parse(score_span.attributes["braintrust.scores"]) + assert_equal({"relevance" => 0.9, "factuality" => 0.7}, scores) + end + + def test_scorer_array_return_metadata_keyed_by_score_name + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("llm_judge") { 
|output:| + [ + {score: 0.9, name: "relevance", metadata: {reason: "on topic"}}, + {score: 0.7, name: "factuality"} + ] + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + Braintrust::Eval::Runner.new(context).run + + score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" } + metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) + # Only the score with metadata should appear; keyed by score name + assert_equal({"relevance" => {"reason" => "on topic"}}, metadata) + end + + def test_scorer_array_return_no_metadata_attr_when_none_present + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("llm_judge") { |output:| + [ + {score: 0.9, name: "relevance"}, + {score: 0.7, name: "factuality"} + ] + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + Braintrust::Eval::Runner.new(context).run + + score_span = rig.exporter.finished_spans.find { |s| s.name == "llm_judge" } + assert_nil score_span.attributes["braintrust.metadata"] + end + + def test_scorer_array_return_multiple_cases_accumulates + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("llm_judge") { |output:| + [ + {score: 1.0, name: "relevance"}, + {score: 0.5, name: "tone"} + ] + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "a"}, {input: "b"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + 
project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? + assert_equal({"relevance" => [1.0, 1.0], "tone" => [0.5, 0.5]}, result.scores) + end + + def test_scorer_array_return_single_numeric_unchanged + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("exact") { |output:, expected:| (output == expected) ? 1.0 : 0.0 } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello", expected: "HELLO"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? + assert_equal({"exact" => [1.0]}, result.scores) + end + + def test_scorer_array_return_single_hash_unchanged + rig = setup_otel_test_rig + + scorer = Braintrust::Scorer.new("quality") { |output:| + {score: 0.8, metadata: {reason: "good"}} + } + + context = Braintrust::Eval::Context.build( + task: ->(input:) { input.upcase }, + scorers: [scorer], + cases: [{input: "hello"}], + experiment_id: "exp-123", + experiment_name: "test-experiment", + project_id: "proj-456", + project_name: "test-project", + state: rig.state, + tracer_provider: rig.tracer_provider + ) + result = Braintrust::Eval::Runner.new(context).run + + assert result.success? 
+ assert_equal({"quality" => [0.8]}, result.scores) + + score_span = rig.exporter.finished_spans.find { |s| s.name == "quality" } + metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) + assert_equal({"quality" => {"reason" => "good"}}, metadata) end private diff --git a/test/braintrust/eval/scorer_test.rb b/test/braintrust/eval/scorer_test.rb index c05bcc74..1cd73885 100644 --- a/test/braintrust/eval/scorer_test.rb +++ b/test/braintrust/eval/scorer_test.rb @@ -39,11 +39,37 @@ def test_new_logs_deprecation_warning end end + def test_new_with_keyword_block_multi_score + scorer = suppress_logs { + Braintrust::Eval::Scorer.new("multi") do |expected:, output:| + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "nonempty", score: output.to_s.empty? ? 0.0 : 1.0} + ] + end + } + result = scorer.call(input: "x", expected: "a", output: "a") + assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result + end + def test_new_with_legacy_positional_block scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } } assert_kind_of Braintrust::Scorer, scorer - result = scorer.call(input: "x", expected: "a", output: "a") - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], + scorer.call(input: "x", expected: "a", output: "a") + end + + def test_new_with_legacy_positional_block_multi_score + scorer = suppress_logs { + Braintrust::Eval::Scorer.new("legacy_multi") do |i, e, o| + [ + {name: "exact", score: (o == e) ? 1.0 : 0.0}, + {name: "length", score: (o.length == e.length) ? 
1.0 : 0.0} + ] + end + } + result = scorer.call(input: "x", expected: "hello", output: "world") + assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result end # ============================================ @@ -53,13 +79,26 @@ def test_new_with_legacy_positional_block def test_call_with_positional_args scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } } result = suppress_logs { scorer.call("apple", "fruit", "fruit") } - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], result + end + + def test_call_with_positional_args_multi_score + scorer = suppress_logs { + Braintrust::Eval::Scorer.new("legacy_multi") do |i, e, o| + [ + {name: "exact", score: (o == e) ? 1.0 : 0.0}, + {name: "nonempty", score: o.to_s.empty? ? 0.0 : 1.0} + ] + end + } + result = suppress_logs { scorer.call("x", "fruit", "fruit") } + assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result end def test_call_with_positional_args_including_metadata scorer = suppress_logs { Braintrust::Eval::Scorer.new("legacy") { |i, e, o, m| m[:boost] ? 1.0 : 0.0 } } result = suppress_logs { scorer.call("apple", "fruit", "fruit", {boost: true}) } - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], result end def test_call_with_positional_args_logs_deprecation_warning @@ -72,6 +111,51 @@ def test_call_with_positional_args_logs_deprecation_warning def test_call_with_keyword_args_does_not_trigger_positional_warning scorer = suppress_logs { Braintrust::Eval::Scorer.new("kw") { |expected:, output:| (output == expected) ? 
1.0 : 0.0 } } result = scorer.call(input: "x", expected: "a", output: "a") - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "kw"}], result + end + + # ============================================ + # Eval.scorer module method (deprecated) + # ============================================ + + def test_eval_scorer_method_with_keyword_block + scorer = suppress_logs { Braintrust::Eval.scorer("kw") { |expected:, output:| (output == expected) ? 1.0 : 0.0 } } + assert_kind_of Braintrust::Scorer, scorer + assert_equal "kw", scorer.name + assert_equal [{score: 1.0, metadata: nil, name: "kw"}], + scorer.call(input: "x", expected: "a", output: "a") + end + + def test_eval_scorer_method_with_keyword_block_multi_score + scorer = suppress_logs { + Braintrust::Eval.scorer("multi") do |expected:, output:| + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "nonempty", score: output.to_s.empty? ? 0.0 : 1.0} + ] + end + } + result = scorer.call(input: "x", expected: "a", output: "a") + assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result + end + + def test_eval_scorer_method_with_legacy_positional_block + scorer = suppress_logs { Braintrust::Eval.scorer("legacy") { |i, e, o| (o == e) ? 1.0 : 0.0 } } + assert_kind_of Braintrust::Scorer, scorer + assert_equal [{score: 1.0, metadata: nil, name: "legacy"}], + scorer.call(input: "x", expected: "a", output: "a") + end + + def test_eval_scorer_method_with_legacy_positional_block_multi_score + scorer = suppress_logs { + Braintrust::Eval.scorer("legacy_multi") do |i, e, o| + [ + {name: "exact", score: (o == e) ? 1.0 : 0.0}, + {name: "length", score: (o.length == e.length) ? 
1.0 : 0.0} + ] + end + } + result = scorer.call(input: "x", expected: "hello", output: "world") + assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result end end diff --git a/test/braintrust/functions_test.rb b/test/braintrust/functions_test.rb index b427f8a9..33e94ab0 100644 --- a/test/braintrust/functions_test.rb +++ b/test/braintrust/functions_test.rb @@ -312,8 +312,7 @@ def test_scorer_parses_structured_response result = scorer.call(input: "hello", expected: "HELLO", output: "HELLO", metadata: {}) - assert_kind_of Numeric, result - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "test-ruby-sdk-scorer-structured"}], result end end @@ -343,8 +342,7 @@ def test_scorer_parses_code_string_response result = scorer.call(input: "test", expected: "test", output: "test", metadata: {}) - assert_kind_of Numeric, result - assert_equal 0.45, result + assert_equal [{score: 0.45, metadata: nil, name: "test-ruby-sdk-code-scorer"}], result end end @@ -355,45 +353,44 @@ def test_scorer_parses_code_string_response def test_remote_scorer_handles_integer_response scorer = scorer_with_stubbed_invoke(1) result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 1.0, result - assert_instance_of Float, result + assert_equal [{score: 1.0, metadata: nil, name: "test-scorer"}], result end def test_remote_scorer_handles_float_response scorer = scorer_with_stubbed_invoke(0.75) result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 0.75, result - assert_instance_of Float, result + assert_equal [{score: 0.75, metadata: nil, name: "test-scorer"}], result end def test_remote_scorer_handles_boolean_true_response scorer = scorer_with_stubbed_invoke(true) result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 1.0, result + assert_equal [{score: 1.0, metadata: nil, name: "test-scorer"}], result end def 
test_remote_scorer_handles_boolean_false_response scorer = scorer_with_stubbed_invoke(false) result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 0.0, result + assert_equal [{score: 0.0, metadata: nil, name: "test-scorer"}], result end - def test_remote_scorer_handles_nil_response + def test_remote_scorer_raises_for_nil_response scorer = scorer_with_stubbed_invoke(nil) - result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_nil result + assert_raises(ArgumentError) do + scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) + end end def test_remote_scorer_handles_hash_with_score_key scorer = scorer_with_stubbed_invoke({"name" => "my_scorer", "score" => 0.9, "metadata" => {}}) result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 0.9, result + assert_equal [{score: 0.9, metadata: nil, name: "test-scorer"}], result end def test_remote_scorer_handles_string_numeric_response scorer = scorer_with_stubbed_invoke("0.85") result = scorer.call(input: "input", expected: "expected", output: "output", metadata: {}) - assert_equal 0.85, result + assert_equal [{score: 0.85, metadata: nil, name: "test-scorer"}], result end def test_remote_scorer_raises_for_hash_without_score_key diff --git a/test/braintrust/internal/callable_test.rb b/test/braintrust/internal/callable_test.rb index a290eb28..63a63dd2 100644 --- a/test/braintrust/internal/callable_test.rb +++ b/test/braintrust/internal/callable_test.rb @@ -10,16 +10,18 @@ class Braintrust::Internal::CallableTest < Minitest::Test # ============================================ def test_keyword_block_receives_only_declared_kwargs + received = nil scorer = Braintrust::Scorer.new("subset") do |output:, expected:| - {output: output, expected: expected} + received = {output: output, expected: expected} + 1.0 end - result = scorer.call( + scorer.call( input: 
"apple", expected: "fruit", output: "fruit", metadata: {key: "val"}, tags: ["t1"] ) - assert_equal({output: "fruit", expected: "fruit"}, result) + assert_equal({output: "fruit", expected: "fruit"}, received) end def test_keyword_block_with_single_kwarg @@ -76,7 +78,8 @@ def test_positional_scorer_block_arity_3 suppress_logs do scorer = Braintrust::Scorer.new("pos3") { |i, e, o| (o == e) ? 1.0 : 0.0 } - assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "b", metadata: {}) + assert_equal [{score: 1.0, metadata: nil, name: "pos3"}], + scorer.call(input: "a", expected: "b", output: "b", metadata: {}) end end @@ -84,7 +87,8 @@ def test_positional_scorer_block_arity_4 suppress_logs do scorer = Braintrust::Scorer.new("pos4") { |i, e, o, m| m[:threshold] } - assert_equal 0.9, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.9}) + assert_equal [{score: 0.9, metadata: nil, name: "pos4"}], + scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.9}) end end @@ -95,7 +99,8 @@ def test_positional_scorer_block_arity_4 def test_zero_arity_block_passes_through scorer = Braintrust::Scorer.new("zero") { 42 } - assert_equal 42, scorer.call(input: "a", expected: "b", output: "c") + assert_equal [{score: 42, metadata: nil, name: "zero"}], + scorer.call(input: "a", expected: "b", output: "c") end # ============================================ @@ -147,7 +152,8 @@ def call(output:, expected:) scorer = klass.new # KeywordFilter strips extra kwargs (input:, metadata:, tags:) before calling user's #call - assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "b", metadata: {}, tags: []) + assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], + scorer.call(input: "a", expected: "b", output: "b", metadata: {}, tags: []) end # ============================================ @@ -176,6 +182,108 @@ def test_invalid_positional_arity_raises_for_scorer end end +# Direct unit tests for ResultNormalizer prepend behavior. 
+class Braintrust::Scorer::Callable::ResultNormalizerTest < Minitest::Test + # Build a minimal class with ResultNormalizer prepended and a controllable #call return. + def make_scorer(name, &block) + klass = Class.new do + prepend Braintrust::Scorer::Callable::ResultNormalizer + + define_method(:name) { name } + define_method(:call) { |**| instance_exec(&block) } + end + klass.new + end + + # ============================================ + # Scalar return (else branch) + # ============================================ + + def test_scalar_float_wrapped + scorer = make_scorer("s") { 0.9 } + assert_equal [{score: 0.9, metadata: nil, name: "s"}], scorer.call + end + + def test_scalar_integer_wrapped + scorer = make_scorer("s") { 1 } + assert_equal [{score: 1, metadata: nil, name: "s"}], scorer.call + end + + def test_scalar_nil_raises + scorer = make_scorer("s") { nil } + assert_raises(ArgumentError) { scorer.call } + end + + def test_scalar_boolean_raises + scorer = make_scorer("s") { true } + assert_raises(ArgumentError) { scorer.call } + end + + def test_hash_with_nil_score_raises + scorer = make_scorer("s") { {score: nil} } + assert_raises(ArgumentError) { scorer.call } + end + + def test_array_item_with_nil_score_raises + scorer = make_scorer("s") { [{name: "a", score: 1.0}, {name: "b", score: nil}] } + assert_raises(ArgumentError) { scorer.call } + end + + # ============================================ + # Hash return + # ============================================ + + def test_hash_without_name_gets_scorer_name + scorer = make_scorer("my_scorer") { {score: 0.5} } + assert_equal [{score: 0.5, name: "my_scorer"}], scorer.call + end + + def test_hash_with_name_preserves_name + scorer = make_scorer("my_scorer") { {score: 0.5, name: "override"} } + assert_equal [{score: 0.5, name: "override"}], scorer.call + end + + def test_hash_with_metadata_preserved + scorer = make_scorer("s") { {score: 0.8, metadata: {reason: "close"}} } + assert_equal [{score: 0.8, metadata: 
{reason: "close"}, name: "s"}], scorer.call + end + + # ============================================ + # Array return + # ============================================ + + def test_array_items_passed_through + scorer = make_scorer("s") { [{name: "a", score: 1.0}, {name: "b", score: 0.5}] } + assert_equal [{name: "a", score: 1.0}, {name: "b", score: 0.5}], scorer.call + end + + def test_array_items_without_name_get_scorer_name + scorer = make_scorer("my_scorer") { [{score: 1.0}, {score: 0.5}] } + assert_equal [{score: 1.0, name: "my_scorer"}, {score: 0.5, name: "my_scorer"}], scorer.call + end + + def test_array_items_mixed_name_presence + scorer = make_scorer("fallback") { [{name: "explicit", score: 1.0}, {score: 0.5}] } + assert_equal [{name: "explicit", score: 1.0}, {score: 0.5, name: "fallback"}], scorer.call + end + + def test_empty_array_returns_empty_array + scorer = make_scorer("s") { [] } + assert_equal [], scorer.call + end + + # ============================================ + # Always returns Array + # ============================================ + + def test_result_is_always_array + [0.5, {score: 1.0}, [{score: 0.9}]].each do |raw| + scorer = make_scorer("s") { raw } + assert_instance_of Array, scorer.call + end + end +end + # Direct unit tests for KeywordFilter class methods and instance behavior. 
class Braintrust::Internal::Callable::KeywordFilterTest < Minitest::Test # ============================================ diff --git a/test/braintrust/scorer_test.rb b/test/braintrust/scorer_test.rb index d9c522a8..04765c9c 100644 --- a/test/braintrust/scorer_test.rb +++ b/test/braintrust/scorer_test.rb @@ -14,8 +14,10 @@ def test_scorer_with_kwargs_block end assert_equal "exact_match", scorer.name - assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit") - assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong") + assert_equal [{score: 1.0, metadata: nil, name: "exact_match"}], + scorer.call(input: "apple", expected: "fruit", output: "fruit") + assert_equal [{score: 0.0, metadata: nil, name: "exact_match"}], + scorer.call(input: "apple", expected: "fruit", output: "wrong") end def test_scorer_with_subset_kwargs_filters_extra_keys @@ -25,8 +27,10 @@ def test_scorer_with_subset_kwargs_filters_extra_keys end # Calling with extra kwargs (input:, metadata:, tags:) should not raise - assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {}, tags: ["t1"]) - assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong", metadata: {}, tags: nil) + assert_equal [{score: 1.0, metadata: nil, name: "subset"}], + scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {}, tags: ["t1"]) + assert_equal [{score: 0.0, metadata: nil, name: "subset"}], + scorer.call(input: "apple", expected: "fruit", output: "wrong", metadata: {}, tags: nil) end def test_scorer_with_legacy_3_param_block @@ -36,7 +40,22 @@ def test_scorer_with_legacy_3_param_block end assert_equal "exact_match", scorer.name - assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {threshold: 0.5}) + assert_equal [{score: 1.0, metadata: nil, name: "exact_match"}], + scorer.call(input: "apple", expected: "fruit", output: "fruit", metadata: {threshold: 0.5}) + 
end + end + + def test_scorer_with_legacy_3_param_block_multi_score + suppress_logs do + scorer = Braintrust::Scorer.new("legacy3") do |input, expected, output| + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "length", score: (output.length == expected.length) ? 1.0 : 0.0} + ] + end + + result = scorer.call(input: "x", expected: "fruit", output: "fruit") + assert_equal [{name: "exact", score: 1.0}, {name: "length", score: 1.0}], result end end @@ -49,19 +68,52 @@ def test_scorer_with_legacy_4_param_block end assert_equal "threshold_match", scorer.name - assert_equal 0.0, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.95}) - assert_equal 1.0, scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.85}) + assert_equal [{score: 0.0, metadata: nil, name: "threshold_match"}], + scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.95}) + assert_equal [{score: 1.0, metadata: nil, name: "threshold_match"}], + scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.85}) + end + end + + def test_scorer_with_legacy_4_param_block_multi_score + suppress_logs do + scorer = Braintrust::Scorer.new("legacy4") do |input, expected, output, metadata| + threshold = metadata[:threshold] || 0.8 + [ + {name: "match", score: (output == expected) ? 1.0 : 0.0}, + {name: "threshold_met", score: (threshold < 0.9) ? 1.0 : 0.0} + ] + end + + result = scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.5}) + assert_equal [{name: "match", score: 1.0}, {name: "threshold_met", score: 1.0}], result end end + def test_scorer_with_keyword_lambda_multi_score + # Bare lambda passed through Factory (Proc branch -> Scorer.new(&scorer)) + lam = ->(expected:, output:) { + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "length", score: (output.length == expected.length) ? 
1.0 : 0.0} + ] + } + scorer = Braintrust::Scorer.new(&lam) + + result = scorer.call(input: "x", expected: "hello", output: "world") + assert_equal [{name: "exact", score: 0.0}, {name: "length", score: 1.0}], result + end + def test_scorer_return_float scorer = Braintrust::Scorer.new("float_scorer") { |**| 0.75 } - assert_equal 0.75, scorer.call(input: "a", expected: "b", output: "c") + assert_equal [{score: 0.75, metadata: nil, name: "float_scorer"}], + scorer.call(input: "a", expected: "b", output: "c") end def test_scorer_return_hash scorer = Braintrust::Scorer.new("hash_scorer") { |**| {name: "custom_name", score: 0.85} } - assert_equal({name: "custom_name", score: 0.85}, scorer.call(input: "a", expected: "b", output: "c")) + assert_equal [{name: "custom_name", score: 0.85}], + scorer.call(input: "a", expected: "b", output: "c") end def test_scorer_return_array @@ -131,8 +183,34 @@ def call(output:, expected:) scorer = klass.new assert_kind_of Braintrust::Scorer, scorer - assert_equal 1.0, scorer.call(input: "apple", expected: "fruit", output: "fruit") - assert_equal 0.0, scorer.call(input: "apple", expected: "fruit", output: "wrong") + assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], + scorer.call(input: "apple", expected: "fruit", output: "fruit") + assert_equal [{score: 0.0, metadata: nil, name: "scorer"}], + scorer.call(input: "apple", expected: "fruit", output: "wrong") + end + + def test_subclass_with_call_override_multi_score + klass = Class.new do + include Braintrust::Scorer + + def name + "multi_subclass" + end + + def call(output:, expected:) + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "nonempty", score: output.to_s.empty? ? 
0.0 : 1.0} + ] + end + end + + scorer = klass.new + result = scorer.call(input: "x", expected: "fruit", output: "fruit") + assert_equal [{name: "exact", score: 1.0}, {name: "nonempty", score: 1.0}], result + + result2 = scorer.call(input: "x", expected: "fruit", output: "wrong") + assert_equal [{name: "exact", score: 0.0}, {name: "nonempty", score: 1.0}], result2 end def test_subclass_with_name_override @@ -187,15 +265,10 @@ def call(output:, expected:, metadata:) scorer = klass.new - assert_equal 1.0, scorer.call( - input: "a", expected: "b", output: "b", - metadata: {threshold: 0.9} - ) - - assert_equal 0.5, scorer.call( - input: "a", expected: "b", output: "c", - metadata: {threshold: 0.3} - ) + assert_equal [{score: 1.0, metadata: nil, name: "threshold_scorer"}], + scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.9}) + assert_equal [{score: 0.5, metadata: nil, name: "threshold_scorer"}], + scorer.call(input: "a", expected: "b", output: "c", metadata: {threshold: 0.3}) end # ============================================ @@ -222,7 +295,32 @@ def call(input, expected, output) assert_equal "legacy_scorer", scorer.name # Arity 3 block gets auto-wrapped to kwargs - assert_equal 1.0, scorer.call(input: "test", expected: "HELLO", output: "hello") + assert_equal [{score: 1.0, metadata: nil, name: "legacy_scorer"}], + scorer.call(input: "test", expected: "HELLO", output: "hello") + end + end + + def test_legacy_callable_class_multi_score_normalized_via_factory + suppress_logs do + callable = Class.new do + def name + "legacy_multi" + end + + def call(input, expected, output) + [ + {name: "exact", score: (output == expected) ? 1.0 : 0.0}, + {name: "case_insensitive", score: (output.downcase == expected.downcase) ? 1.0 : 0.0} + ] + end + end.new + + name = callable.respond_to?(:name) ? 
callable.name : nil + scorer = Braintrust::Scorer.new(name, &callable.method(:call)) + + assert_equal "legacy_multi", scorer.name + result = scorer.call(input: "test", expected: "HELLO", output: "hello") + assert_equal [{name: "exact", score: 0.0}, {name: "case_insensitive", score: 1.0}], result end end @@ -249,10 +347,8 @@ def call(input, expected, output, metadata = {}) assert_equal "legacy_with_meta", scorer.name # Arity 4 block gets auto-wrapped to kwargs - assert_equal 1.0, scorer.call( - input: "a", expected: "b", output: "b", - metadata: {threshold: 0.9} - ) + assert_equal [{score: 1.0, metadata: nil, name: "legacy_with_meta"}], + scorer.call(input: "a", expected: "b", output: "b", metadata: {threshold: 0.9}) end end