Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/braintrust/eval/runner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ def run_scorers(case_context)
# @param scorer_input [Hash] Input to log on the span
# @param scores [Hash] Accumulator for score results
def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
- tracer.in_span("score") do |score_span|
+ tracer.in_span(scorer.name) do |score_span|
score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
set_json_attr(score_span, "braintrust.input_json", scorer_input)
Expand Down
24 changes: 12 additions & 12 deletions test/braintrust/eval/runner_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -720,7 +720,7 @@ def test_runner_run_creates_per_scorer_spans
runner.run

spans = rig.exporter.finished_spans
- score_spans = spans.select { |s| s.name == "score" }
+ score_spans = spans.select { |s| ["accuracy", "relevance"].include?(s.name) }

# One span per scorer, not one shared span
assert_equal 2, score_spans.length
Expand Down Expand Up @@ -751,7 +751,7 @@ def test_runner_run_records_scores_on_per_scorer_spans
result = runner.run

spans = rig.exporter.finished_spans
- score_spans = spans.select { |s| s.name == "score" }
+ score_spans = spans.select { |s| ["accuracy", "relevance"].include?(s.name) }

scores_by_name = score_spans.each_with_object({}) do |span, h|
parsed = JSON.parse(span.attributes["braintrust.scores"])
Expand Down Expand Up @@ -780,7 +780,7 @@ def test_runner_scorer_span_attributes
Braintrust::Eval::Runner.new(context).run

spans = rig.exporter.finished_spans
- scorer_span = spans.find { |s| s.name == "score" }
+ scorer_span = spans.find { |s| s.name == "exact" }
span_attrs = JSON.parse(scorer_span.attributes["braintrust.span_attributes"])

assert_equal "score", span_attrs["type"]
Expand All @@ -805,7 +805,7 @@ def test_runner_scorer_span_has_input_and_output
Braintrust::Eval::Runner.new(context).run

spans = rig.exporter.finished_spans
- scorer_span = spans.find { |s| s.name == "score" }
+ scorer_span = spans.find { |s| s.name == "exact" }

input = JSON.parse(scorer_span.attributes["braintrust.input_json"])
assert_equal "hello", input["input"]
Expand Down Expand Up @@ -1373,7 +1373,7 @@ def test_scorer_hash_return_metadata_on_span
)
Braintrust::Eval::Runner.new(context).run

- score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+ score_span = rig.exporter.finished_spans.find { |s| s.name == "meta_scorer" }
metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
assert_equal({"failure_type" => "none", "confidence" => 0.99}, metadata)
end
Expand Down Expand Up @@ -1427,7 +1427,7 @@ def test_scorer_hash_with_nil_score
assert_equal({}, result.scores)

# Metadata should still be logged even with nil score
- score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+ score_span = rig.exporter.finished_spans.find { |s| s.name == "nil_score" }
metadata = JSON.parse(score_span.attributes["braintrust.metadata"])
assert_equal({"reason" => "could not score"}, metadata)
end
Expand Down Expand Up @@ -1455,9 +1455,9 @@ def test_multiple_scorers_mixed_return_types
assert_equal({"numeric" => [0.8], "structured" => [0.6]}, result.scores)

# Only structured scorer's span has metadata
- score_spans = rig.exporter.finished_spans.select { |s| s.name == "score" }
- structured_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("structured") }
- numeric_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("numeric") }
+ score_spans = rig.exporter.finished_spans.select { |s| ["numeric", "structured"].include?(s.name) }
+ structured_span = score_spans.find { |s| s.name == "structured" }
+ numeric_span = score_spans.find { |s| s.name == "numeric" }

metadata = JSON.parse(structured_span.attributes["braintrust.metadata"])
assert_equal({"detail" => "partial"}, metadata)
Expand All @@ -1482,7 +1482,7 @@ def test_scorer_hash_return_scores_on_span_are_extracted
)
Braintrust::Eval::Runner.new(context).run

- score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+ score_span = rig.exporter.finished_spans.find { |s| s.name == "structured" }
scores = JSON.parse(score_span.attributes["braintrust.scores"])
# Should be the numeric value, not the hash
assert_equal 0.75, scores["structured"]
Expand Down Expand Up @@ -1522,7 +1522,7 @@ def test_scorer_no_metadata_attr_when_all_numeric
)
Braintrust::Eval::Runner.new(context).run

- score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+ score_span = rig.exporter.finished_spans.find { |s| s.name == "a" }
# No metadata attribute should be set when no scorers return metadata
assert_nil score_span.attributes["braintrust.metadata"]
end
Expand Down Expand Up @@ -1551,7 +1551,7 @@ def test_scorer_hash_with_non_hash_metadata_ignored
assert_equal({"str_meta" => [0.5]}, result.scores)

# Non-hash metadata should not be logged
- score_span = rig.exporter.finished_spans.find { |s| s.name == "score" }
+ score_span = rig.exporter.finished_spans.find { |s| s.name == "str_meta" }
assert_nil score_span.attributes["braintrust.metadata"]
end

Expand Down
4 changes: 2 additions & 2 deletions test/braintrust/eval_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def test_eval_scorer_error_records_exception_event
)

spans = rig.drain
- score_spans = spans.select { |s| s.name == "score" }
+ score_spans = spans.select { |s| ["good", "failing"].include?(s.name) }

# Each scorer gets its own span
assert_equal 2, score_spans.length
Expand Down Expand Up @@ -348,7 +348,7 @@ def test_eval_run_with_tracing

eval_span = spans.find { |s| s.name == "eval" }
task_span = spans.find { |s| s.name == "task" }
- score_span = spans.find { |s| s.name == "score" }
+ score_span = spans.find { |s| s.name == "exact" }

assert eval_span, "Expected eval span"
assert task_span, "Expected task span"
Expand Down
Loading