diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index 16ed7874..76590280 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -192,7 +192,7 @@ def run_scorers(case_context) # @param scorer_input [Hash] Input to log on the span # @param scores [Hash] Accumulator for score results def run_scorer(scorer, scorer_kwargs, scorer_input, scores) - tracer.in_span("score") do |score_span| + tracer.in_span(scorer.name) do |score_span| score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name)) set_json_attr(score_span, "braintrust.input_json", scorer_input) diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb index 2ae0be8a..d6a5da62 100644 --- a/test/braintrust/eval/runner_test.rb +++ b/test/braintrust/eval/runner_test.rb @@ -720,7 +720,7 @@ def test_runner_run_creates_per_scorer_spans runner.run spans = rig.exporter.finished_spans - score_spans = spans.select { |s| s.name == "score" } + score_spans = spans.select { |s| ["accuracy", "relevance"].include?(s.name) } # One span per scorer, not one shared span assert_equal 2, score_spans.length @@ -751,7 +751,7 @@ def test_runner_run_records_scores_on_per_scorer_spans result = runner.run spans = rig.exporter.finished_spans - score_spans = spans.select { |s| s.name == "score" } + score_spans = spans.select { |s| ["accuracy", "relevance"].include?(s.name) } scores_by_name = score_spans.each_with_object({}) do |span, h| parsed = JSON.parse(span.attributes["braintrust.scores"]) @@ -780,7 +780,7 @@ def test_runner_scorer_span_attributes Braintrust::Eval::Runner.new(context).run spans = rig.exporter.finished_spans - scorer_span = spans.find { |s| s.name == "score" } + scorer_span = spans.find { |s| s.name == "exact" } span_attrs = JSON.parse(scorer_span.attributes["braintrust.span_attributes"]) assert_equal "score", span_attrs["type"] @@ -805,7 +805,7 @@ def test_runner_scorer_span_has_input_and_output Braintrust::Eval::Runner.new(context).run spans = rig.exporter.finished_spans - scorer_span = spans.find { |s| s.name == "score" } + scorer_span = spans.find { |s| s.name == "exact" } input = JSON.parse(scorer_span.attributes["braintrust.input_json"]) assert_equal "hello", input["input"] @@ -1373,7 +1373,7 @@ def test_scorer_hash_return_metadata_on_span ) Braintrust::Eval::Runner.new(context).run - score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + score_span = rig.exporter.finished_spans.find { |s| s.name == "meta_scorer" } metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) assert_equal({"failure_type" => "none", "confidence" => 0.99}, metadata) end @@ -1427,7 +1427,7 @@ def test_scorer_hash_with_nil_score assert_equal({}, result.scores) # Metadata should still be logged even with nil score - score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + score_span = rig.exporter.finished_spans.find { |s| s.name == "nil_score" } metadata = JSON.parse(score_span.attributes["braintrust.metadata"]) assert_equal({"reason" => "could not score"}, metadata) end @@ -1455,9 +1455,9 @@ def test_multiple_scorers_mixed_return_types assert_equal({"numeric" => [0.8], "structured" => [0.6]}, result.scores) # Only structured scorer's span has metadata - score_spans = rig.exporter.finished_spans.select { |s| s.name == "score" } - structured_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("structured") } - numeric_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("numeric") } + score_spans = rig.exporter.finished_spans.select { |s| ["numeric", "structured"].include?(s.name) } + structured_span = score_spans.find { |s| s.name == "structured" } + numeric_span = score_spans.find { |s| s.name == "numeric" } metadata = JSON.parse(structured_span.attributes["braintrust.metadata"]) assert_equal({"detail" => "partial"}, metadata) @@ -1482,7 +1482,7 @@ def test_scorer_hash_return_scores_on_span_are_extracted ) Braintrust::Eval::Runner.new(context).run - score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + score_span = rig.exporter.finished_spans.find { |s| s.name == "structured" } scores = JSON.parse(score_span.attributes["braintrust.scores"]) # Should be the numeric value, not the hash assert_equal 0.75, scores["structured"] @@ -1522,7 +1522,7 @@ def test_scorer_no_metadata_attr_when_all_numeric ) Braintrust::Eval::Runner.new(context).run - score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + score_span = rig.exporter.finished_spans.find { |s| s.name == "a" } # No metadata attribute should be set when no scorers return metadata assert_nil score_span.attributes["braintrust.metadata"] end @@ -1551,7 +1551,7 @@ def test_scorer_hash_with_non_hash_metadata_ignored assert_equal({"str_meta" => [0.5]}, result.scores) # Non-hash metadata should not be logged - score_span = rig.exporter.finished_spans.find { |s| s.name == "score" } + score_span = rig.exporter.finished_spans.find { |s| s.name == "str_meta" } assert_nil score_span.attributes["braintrust.metadata"] end diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb index f3f184da..862ef43a 100644 --- a/test/braintrust/eval_test.rb +++ b/test/braintrust/eval_test.rb @@ -139,7 +139,7 @@ def test_eval_scorer_error_records_exception_event ) spans = rig.drain - score_spans = spans.select { |s| s.name == "score" } + score_spans = spans.select { |s| ["good", "failing"].include?(s.name) } # Each scorer gets its own span assert_equal 2, score_spans.length @@ -348,7 +348,7 @@ def test_eval_run_with_tracing eval_span = spans.find { |s| s.name == "eval" } task_span = spans.find { |s| s.name == "task" } - score_span = spans.find { |s| s.name == "score" } + score_span = spans.find { |s| s.name == "exact" } assert eval_span, "Expected eval span" assert task_span, "Expected task span"