diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb
index 4cae30cd..496e5e6b 100644
--- a/lib/braintrust/eval/runner.rb
+++ b/lib/braintrust/eval/runner.rb
@@ -82,11 +82,17 @@ def run(parallelism: 1)
     # @param case_context [CaseContext] The per-case accumulator
     # @param errors [Queue] Thread-safe error collection queue
     def run_eval_case(case_context, errors)
-      tracer.in_span("eval") do |eval_span|
+      # Each eval case starts its own trace; detach from any ambient span context
+      eval_span = tracer.start_root_span("eval")
+      OpenTelemetry::Trace.with_span(eval_span) do
+        # Set attributes known before task execution
         eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
-
-        # Set tags early so they're present even if task fails
+        set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
+        set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input})
+        set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
+        set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata
         eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags
+        eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin

         # Run task
         begin
@@ -94,6 +100,7 @@ def run_eval_case(case_context, errors)
         rescue => e
           # Error already recorded on task span, set eval span status
           eval_span.status = OpenTelemetry::Trace::Status.error(e.message)
+          set_json_attr(eval_span, "braintrust.output_json", {output: nil})
           errors << "Task failed for input '#{case_context.input}': #{e.message}"
           report_progress(eval_span, case_context, error: e.message)
           next
@@ -113,17 +120,13 @@ def run_eval_case(case_context, errors)
          errors << "Scorers failed for input '#{case_context.input}': #{e.message}"
        end

-        # Set eval span attributes (after task and scorers complete)
-        set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval"))
-        set_json_attr(eval_span, "braintrust.input_json", case_context.input)
-        set_json_attr(eval_span, "braintrust.output_json", case_context.output)
-        set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected
-
-        # Set origin for cases from remote sources (already JSON-serialized)
-        eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin
+        # Set output after task completes
+        set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output})

         report_progress(eval_span, case_context, data: case_context.output, scores: case_scores || {})
       end
+    ensure
+      eval_span&.finish
     end

     # Run task with OpenTelemetry tracing
@@ -151,43 +154,61 @@ def run_task(case_context)
       end
     end

-    # Run scorers with OpenTelemetry tracing
-    # Creates single score span for all scorers
+    # Run scorers with OpenTelemetry tracing.
+    # Creates one span per scorer, each a direct child of the current (eval) span.
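+    # A scorer failure is recorded on that scorer's own span; the first error is re-raised after all scorers have run.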
     # @param case_context [CaseContext] The per-case context (output must be populated)
     # @return [Hash] Scores hash { scorer_name => score_value }
     def run_scorers(case_context)
+      scorer_kwargs = {
+        input: case_context.input,
+        expected: case_context.expected,
+        output: case_context.output,
+        metadata: case_context.metadata || {},
+        trace: case_context.trace
+      }
+      # Same payload as the kwargs, minus the trace handle, for logging on the span
+      scorer_input = {
+        input: case_context.input,
+        expected: case_context.expected,
+        output: case_context.output,
+        metadata: case_context.metadata || {}
+      }
+
+      scores = {}
+      scorer_error = nil
+      eval_context.scorers.each do |scorer|
+        run_scorer(scorer, scorer_kwargs, scorer_input, scores)
+      rescue => e
+        scorer_error ||= e
+      end
+
+      raise scorer_error if scorer_error
+
+      scores
+    end
+
+    # Run a single scorer inside its own span.
+    # @param scorer [Scorer] The scorer to run
+    # @param scorer_kwargs [Hash] Keyword arguments for the scorer
+    # @param scorer_input [Hash] Input to log on the span
+    # @param scores [Hash] Accumulator for score results
+    def run_scorer(scorer, scorer_kwargs, scorer_input, scores)
       tracer.in_span("score") do |score_span|
         score_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr
-        set_json_attr(score_span, "braintrust.span_attributes", build_span_attributes("score"))
-
-        scorer_kwargs = {
-          input: case_context.input,
-          expected: case_context.expected,
-          output: case_context.output,
-          metadata: case_context.metadata || {},
-          trace: case_context.trace
-        }
-        scores = {}
-        scorer_error = nil
-        eval_context.scorers.each do |scorer|
-          score_value = scorer.call(**scorer_kwargs)
-          scores[scorer.name] = score_value
-
-          # Collect raw score for summary (thread-safe)
-          collect_score(scorer.name, score_value)
-        rescue => e
-          # Record first error but continue processing other scorers
-          scorer_error ||= e
-          record_span_error(score_span, e, "ScorerError")
-        end
+        set_json_attr(score_span, "braintrust.span_attributes", build_scorer_span_attributes(scorer.name))
+        set_json_attr(score_span, "braintrust.input_json", scorer_input)

-        # Always set scores attribute, even if some scorers failed
-        set_json_attr(score_span, "braintrust.scores", scores)
+        score_value = scorer.call(**scorer_kwargs)
+        scores[scorer.name] = score_value

-        # Raise after setting scores so we can see which scorers succeeded
-        raise scorer_error if scorer_error
+        scorer_scores = {scorer.name => score_value}
+        set_json_attr(score_span, "braintrust.output_json", scorer_scores)
+        set_json_attr(score_span, "braintrust.scores", scorer_scores)

-        scores
+        # Collect raw score for summary (thread-safe)
+        collect_score(scorer.name, score_value)
+      rescue => e
+        record_span_error(score_span, e, "ScorerError")
+        raise
       end
     end

@@ -255,6 +276,16 @@ def build_span_attributes(type)
       attrs
     end

+    # Build span_attributes for a scorer span.
+    # Each scorer gets its own span with type "score", purpose "scorer", and the scorer's name.
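+    # The eval context's generation, when set, is carried onto the scorer span as well.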
+    # @param scorer_name [String] The scorer name
+    # @return [Hash]
+    def build_scorer_span_attributes(scorer_name)
+      attrs = {type: "score", name: scorer_name, purpose: "scorer"}
+      attrs[:generation] = eval_context.generation if eval_context.generation
+      attrs
+    end
+
     # Set a span attribute by JSON encoding the value
     # @param span [OpenTelemetry::Trace::Span] The span
     # @param key [String] The attribute key
diff --git a/test/braintrust/eval/runner_test.rb b/test/braintrust/eval/runner_test.rb
index b8670e62..29308a2a 100644
--- a/test/braintrust/eval/runner_test.rb
+++ b/test/braintrust/eval/runner_test.rb
@@ -508,6 +508,169 @@ def test_runner_run_creates_eval_spans
     assert_equal "experiment_id:exp-123", eval_spans[0].attributes["braintrust.parent"]
   end

+  def test_runner_eval_span_has_case_metadata
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello", expected: "HELLO", metadata: {difficulty: "easy", category: "greeting"}}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    metadata = JSON.parse(eval_span.attributes["braintrust.metadata"])
+
+    assert_equal "easy", metadata["difficulty"]
+    assert_equal "greeting", metadata["category"]
+  end
+
+  def test_runner_eval_span_input_json_wrapped
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello", expected: "HELLO"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    input_json = JSON.parse(eval_span.attributes["braintrust.input_json"])
+
+    assert_equal({"input" => "hello"}, input_json)
+  end
+
+  def test_runner_eval_span_tags_as_array
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello", tags: ["fast", "regression"]}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    tags = eval_span.attributes["braintrust.tags"]
+
+    assert_instance_of Array, tags
+    assert_equal ["fast", "regression"], tags
+  end
+
+  def test_runner_eval_span_output_json_null_on_task_error
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { raise "boom" },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    output_json = JSON.parse(eval_span.attributes["braintrust.output_json"])
+
+    assert_equal({"output" => nil}, output_json)
+  end
+
+  def test_runner_eval_span_output_json_wrapped
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello", expected: "HELLO"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    output_json = JSON.parse(eval_span.attributes["braintrust.output_json"])
+
+    assert_equal({"output" => "HELLO"}, output_json)
+  end
+
+  def test_runner_eval_spans_are_independent_roots
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "a"}, {input: "b"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_spans = rig.exporter.finished_spans.select { |s| s.name == "eval" }
+    assert_equal 2, eval_spans.length
+
+    # Each eval span should have a unique trace ID (independent roots)
+    trace_ids = eval_spans.map { |s| s.hex_trace_id }.uniq
+    assert_equal 2, trace_ids.length, "Each eval case should be its own trace"
+
+    # Eval spans should not have a parent span
+    invalid_hex = OpenTelemetry::Trace::INVALID_SPAN_ID.unpack1("H*")
+    eval_spans.each do |span|
+      assert_equal invalid_hex, span.hex_parent_span_id,
+        "Eval span should be a root span with no parent"
+    end
+  end
+
+  def test_runner_eval_span_no_metadata_when_nil
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    eval_span = rig.exporter.finished_spans.find { |s| s.name == "eval" }
+    assert_nil eval_span.attributes["braintrust.metadata"]
+  end
+
   def test_runner_run_creates_task_spans
     rig = setup_otel_test_rig

@@ -534,12 +697,15 @@ def test_runner_run_creates_task_spans
     assert_equal "experiment_id:exp-123", task_spans[0].attributes["braintrust.parent"]
   end

-  def test_runner_run_creates_score_spans
+  def test_runner_run_creates_per_scorer_spans
     rig = setup_otel_test_rig

     context = Braintrust::Eval::Context.build(
       task: ->(input:) { input.upcase },
-      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      scorers: [
+        Braintrust::Scorer.new("accuracy") { 0.95 },
+        Braintrust::Scorer.new("relevance") { 0.87 }
+      ],
       cases: [{input: "hello", expected: "HELLO"}],
       experiment_id: "exp-123",
       experiment_name: "test-experiment",
@@ -556,11 +722,14 @@
     spans = rig.exporter.finished_spans
     score_spans = spans.select { |s| s.name == "score" }

-    assert_equal 1, score_spans.length
-    assert_equal "experiment_id:exp-123", score_spans[0].attributes["braintrust.parent"]
+    # One span per scorer, not one shared span
+    assert_equal 2, score_spans.length
+    score_spans.each do |span|
+      assert_equal "experiment_id:exp-123", span.attributes["braintrust.parent"]
+    end
   end

-  def test_runner_run_records_scores_on_span
+  def test_runner_run_records_scores_on_per_scorer_spans
     rig = setup_otel_test_rig

     context = Braintrust::Eval::Context.build(
@@ -582,16 +751,72 @@
     result = runner.run

     spans = rig.exporter.finished_spans
-    score_span = spans.find { |s| s.name == "score" }
+    score_spans = spans.select { |s| s.name == "score" }

-    scores = JSON.parse(score_span.attributes["braintrust.scores"])
-    assert_equal 0.95, scores["accuracy"]
-    assert_equal 0.87, scores["relevance"]
+    scores_by_name = score_spans.each_with_object({}) do |span, h|
+      parsed = JSON.parse(span.attributes["braintrust.scores"])
+      h.merge!(parsed)
+    end
+    assert_equal({"accuracy" => 0.95, "relevance" => 0.87}, scores_by_name)

-    # Check scores contains scores from multiple scorers
+    # Result still aggregates all scores
     assert_equal({"accuracy" => [0.95], "relevance" => [0.87]}, result.scores)
   end

+  def test_runner_scorer_span_attributes
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 1.0 }],
+      cases: [{input: "hello"}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    spans = rig.exporter.finished_spans
+    scorer_span = spans.find { |s| s.name == "score" }
+    span_attrs = JSON.parse(scorer_span.attributes["braintrust.span_attributes"])
+
+    assert_equal "score", span_attrs["type"]
+    assert_equal "exact", span_attrs["name"]
+    assert_equal "scorer", span_attrs["purpose"]
+  end
+
+  def test_runner_scorer_span_has_input_and_output
+    rig = setup_otel_test_rig
+
+    context = Braintrust::Eval::Context.build(
+      task: ->(input:) { input.upcase },
+      scorers: [Braintrust::Scorer.new("exact") { 0.5 }],
+      cases: [{input: "hello", expected: "HELLO", metadata: {key: "val"}}],
+      experiment_id: "exp-123",
+      experiment_name: "test-experiment",
+      project_id: "proj-456",
+      project_name: "test-project",
+      state: rig.state,
+      tracer_provider: rig.tracer_provider
+    )
+    Braintrust::Eval::Runner.new(context).run
+
+    spans = rig.exporter.finished_spans
+    scorer_span = spans.find { |s| s.name == "score" }
+
+    input = JSON.parse(scorer_span.attributes["braintrust.input_json"])
+    assert_equal "hello", input["input"]
+    assert_equal "HELLO", input["expected"]
+    assert_equal "HELLO", input["output"]
+    assert_equal({"key" => "val"}, input["metadata"])
+
+    output = JSON.parse(scorer_span.attributes["braintrust.output_json"])
+    assert_equal({"exact" => 0.5}, output)
+  end
+
   # ============================================
   # Runner reusability tests
   # ============================================
diff --git a/test/braintrust/eval_test.rb b/test/braintrust/eval_test.rb
index f28f6ad7..f3f184da 100644
--- a/test/braintrust/eval_test.rb
+++ b/test/braintrust/eval_test.rb
@@ -139,21 +139,25 @@ def test_eval_scorer_error_records_exception_event
     )

     spans = rig.drain
-    score_span = spans.find { |s| s.name == "score" }
+    score_spans = spans.select { |s| s.name == "score" }

-    assert score_span, "Expected score span"
-    assert score_span.events, "Expected span to have events"
+    # Each scorer gets its own span
+    assert_equal 2, score_spans.length

-    exception_event = score_span.events.find { |e| e.name == "exception" }
"exception" } - assert exception_event, "Expected exception event" + # Find the failing scorer's span (has exception event) + failing_span = score_spans.find { |s| s.events&.any? { |e| e.name == "exception" } } + assert failing_span, "Expected failing scorer span with exception event" + + exception_event = failing_span.events.find { |e| e.name == "exception" } assert_equal "ScorerError", exception_event.attributes["exception.type"] assert_match(/Intentional error/, exception_event.attributes["exception.message"]) assert exception_event.attributes["exception.stacktrace"], "Expected stacktrace in exception event" - # Verify scores still recorded for successful scorers - scores = JSON.parse(score_span.attributes["braintrust.scores"]) - assert_equal 1.0, scores["good"], "Good scorer should have succeeded" - assert_nil scores["failing"], "Failing scorer should not have a score" + # Verify good scorer span has its score + good_span = score_spans.find { |s| s.attributes["braintrust.scores"]&.include?("good") } + assert good_span, "Expected good scorer span" + scores = JSON.parse(good_span.attributes["braintrust.scores"]) + assert_equal 1.0, scores["good"] end def test_eval_run_with_multiple_scorers @@ -339,7 +343,7 @@ def test_eval_run_with_tracing # Verify spans were created spans = rig.drain - # Should have: 1 eval span, 1 task span, 1 score span + # Should have: 1 eval span, 1 task span, 1 score span (one per scorer) assert_equal 3, spans.length eval_span = spans.find { |s| s.name == "eval" } @@ -353,16 +357,22 @@ def test_eval_run_with_tracing # Verify eval span attributes assert eval_span.attributes["braintrust.parent"] assert_match(/experiment_id:[0-9a-f-]{36}/, eval_span.attributes["braintrust.parent"]) - assert_includes eval_span.attributes["braintrust.input_json"], "hello" - assert_includes eval_span.attributes["braintrust.output_json"], "HELLO" + input_json = JSON.parse(eval_span.attributes["braintrust.input_json"]) + assert_equal({"input" => "hello"}, input_json) + output_json = JSON.parse(eval_span.attributes["braintrust.output_json"]) + assert_equal({"output" => "HELLO"}, output_json) # Verify task span assert task_span.attributes["braintrust.span_attributes"] assert_includes task_span.attributes["braintrust.span_attributes"], "task" - # Verify score span + # Verify score span has scorer-specific attributes assert score_span.attributes["braintrust.scores"] assert_includes score_span.attributes["braintrust.scores"], "exact" + span_attrs = JSON.parse(score_span.attributes["braintrust.span_attributes"]) + assert_equal "score", span_attrs["type"] + assert_equal "exact", span_attrs["name"] + assert_equal "scorer", span_attrs["purpose"] # Verify experiment result has permalink in correct format assert result.permalink.include?("object_type=experiment"), "Result permalink should be experiment URL"