Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,23 @@ Braintrust::Eval.run(
)
```

#### Scorer metadata

Scorers can return a Hash with `:score` and `:metadata` keys to attach structured context to the score. The metadata is logged on the scorer's span and is visible in the Braintrust UI for debugging and filtering:

```ruby
Braintrust::Scorer.new("translation") do |expected:, output:|
common_words = output.downcase.split & expected.downcase.split
overlap = common_words.size.to_f / expected.split.size
{
score: overlap,
metadata: {word_overlap: common_words.size, missing_words: expected.downcase.split - output.downcase.split}
}
end
```

See [scorer_metadata.rb](./examples/eval/scorer_metadata.rb) for a full example.

#### Trace scoring

Scorers can access the full evaluation trace (all spans generated by the task) by declaring a `trace:` keyword parameter. This is useful for inspecting intermediate LLM calls, validating tool usage, or checking the message thread:
Expand Down
76 changes: 76 additions & 0 deletions examples/eval/scorer_metadata.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#!/usr/bin/env ruby
# frozen_string_literal: true

require "bundler/setup"
require "braintrust"
require "opentelemetry/sdk"

# Example: Scorer Metadata
#
# Scorers can return a Hash with :score and :metadata keys to attach
# structured context alongside the numeric score. The metadata is
# logged on the scorer's span and visible in the Braintrust UI for
# debugging and filtering.
#
# Usage:
# bundle exec ruby examples/eval/scorer_metadata.rb

Braintrust.init

# Maps each user request to the tool call the model is expected to make.
# Frozen so this example fixture cannot be mutated accidentally.
EXPECTED_TOOLS = {
  "What's the weather?" => {name: "get_weather", args: ["location"]},
  "Book a flight to Paris" => {name: "book_flight", args: ["destination", "date"]},
  "Send an email to Bob" => {name: "send_email", args: ["recipient", "subject", "body"]}
}.freeze

# Simulated tool-calling model: maps a natural-language request to a
# tool-call Hash of the form {name:, args:} via simple keyword matching.
def pick_tool(input)
  if input.match?(/weather/i)
    {name: "get_weather", args: ["location"]}
  elsif input.match?(/flight/i)
    # Deliberately omits "date" so the scorer can demonstrate a partial score.
    {name: "book_flight", args: ["destination"]}
  elsif input.match?(/email/i)
    {name: "wrong_tool", args: []}
  else
    {name: "unknown", args: []}
  end
end

# Scorer that returns structured metadata explaining *why* a score was given.
# Returns a Hash with :score plus :metadata describing the failure mode.
# Uses do...end for the multi-line block per Ruby convention (and matching
# the README's scorer example), rather than braces.
tool_accuracy = Braintrust::Scorer.new("tool_accuracy") do |expected:, output:|
  expected_name = expected[:name]
  actual_name = output[:name]

  if actual_name != expected_name
    # Wrong tool entirely: zero credit, record which tool was expected.
    {
      score: 0.0,
      metadata: {
        failure_type: "wrong_tool",
        reason: "Expected tool '#{expected_name}' but got '#{actual_name}'"
      }
    }
  else
    missing_args = expected[:args] - output[:args]
    if missing_args.empty?
      {score: 1.0, metadata: {failure_type: nil, reason: "Correct tool and arguments"}}
    else
      # Right tool but incomplete arguments: partial credit.
      {
        score: 0.5,
        metadata: {
          failure_type: "missing_arguments",
          reason: "Correct tool '#{expected_name}' but missing args: #{missing_args.join(", ")}",
          missing_args: missing_args
        }
      }
    end
  end
end

# Build one eval case per entry in EXPECTED_TOOLS, then run the experiment.
eval_cases = EXPECTED_TOOLS.map do |input, expected|
  {input: input, expected: expected}
end

Braintrust::Eval.run(
  project: "ruby-sdk-examples",
  experiment: "scorer-metadata-example",
  cases: eval_cases,
  task: ->(input:) { pick_tool(input) },
  scorers: [tool_accuracy]
)

# Flush any buffered spans before the process exits.
OpenTelemetry.tracer_provider.shutdown
30 changes: 26 additions & 4 deletions lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -197,15 +197,23 @@ def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
set_json_attr(score_span, "braintrust.input_json", scorer_input)

score_value = scorer.call(**scorer_kwargs)
scores[scorer.name] = score_value
raw_result = scorer.call(**scorer_kwargs)
normalized = normalize_score_result(raw_result, scorer.name)

scorer_scores = {scorer.name => score_value}
score_name = normalized[:name]
scores[score_name] = normalized[:score]

scorer_scores = {score_name => normalized[:score]}
set_json_attr(score_span, "braintrust.output_json", scorer_scores)
set_json_attr(score_span, "braintrust.scores", scorer_scores)

# Set scorer metadata on its span
if normalized[:metadata].is_a?(Hash)
set_json_attr(score_span, "braintrust.metadata", normalized[:metadata])
end

# Collect raw score for summary (thread-safe)
collect_score(scorer.name, score_value)
collect_score(score_name, normalized[:score])
rescue => e
record_span_error(score_span, e, "ScorerError")
raise
Expand Down Expand Up @@ -304,6 +312,20 @@ def collect_score(name, value)
(@scores[name] ||= []) << value
end
end

# Normalize a scorer return value into its component parts.
# Scorers may return a raw Numeric score, or a Hash with a :score key plus
# optional :metadata and :name keys.
#
# @param result [Object] Raw scorer return value (Numeric or Hash)
# @param default_name [String] Scorer name to use if not overridden via :name
# @return [Hash] Normalized hash with :score, :metadata, :name keys
def normalize_score_result(result, default_name)
  if result.is_a?(Hash)
    # Build a fresh hash rather than mutating the scorer's return value in
    # place (the previous `result[:name] ||= default_name` modified the
    # caller's object). This also guarantees :score and :metadata keys are
    # always present in the normalized result.
    {
      score: result[:score],
      metadata: result[:metadata],
      name: result[:name] || default_name
    }
  else
    # Bare numeric score: no metadata, keep the scorer's own name.
    {score: result, metadata: nil, name: default_name}
  end
end
end
end
end
Loading
Loading