From 630e7b7b8aecdedf508062a041b68ed8ca49199b Mon Sep 17 00:00:00 2001 From: David Elner Date: Sun, 22 Mar 2026 21:39:46 +0000 Subject: [PATCH 1/3] Fixed: Default for tracer_provider in Evals --- lib/braintrust/eval/context.rb | 48 ++-- lib/braintrust/eval/runner.rb | 5 +- test/braintrust/eval/context_test.rb | 326 +++++++++++++-------------- 3 files changed, 188 insertions(+), 191 deletions(-) diff --git a/lib/braintrust/eval/context.rb b/lib/braintrust/eval/context.rb index 1c98a075..b422621e 100644 --- a/lib/braintrust/eval/context.rb +++ b/lib/braintrust/eval/context.rb @@ -29,37 +29,49 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil end # Build a Context from raw user inputs. - # Factory normalizes task, scorers, and cases into typed wrappers. - # Parent is resolved into parent_span_attr and generation. + # Delegates to Factory for normalization. def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, project_id: nil, project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, parent: nil) - factory = Factory.new(state: state, tracer_provider: tracer_provider, project_name: project_name) - - Context.new( - task: factory.normalize_task(task), - scorers: factory.normalize_scorers(scorers), - cases: factory.normalize_cases(cases), - experiment_id: experiment_id, - experiment_name: experiment_name, - project_id: project_id, - project_name: project_name, - state: state, - tracer_provider: tracer_provider, - on_progress: on_progress, - parent_span_attr: factory.resolve_parent_span_attr(parent), - generation: parent&.dig(:generation) + Factory.new( + state: state, tracer_provider: tracer_provider, + project_id: project_id, project_name: project_name + ).build( + task: task, scorers: scorers, cases: cases, + experiment_id: experiment_id, experiment_name: experiment_name, + on_progress: on_progress, parent: parent ) end # Encapsulates normalization of raw user inputs into typed wrappers. class Factory - def initialize(state: nil, tracer_provider: nil, project_name: nil) + def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: nil) @state = state @tracer_provider = tracer_provider + @project_id = project_id @project_name = project_name end + def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, + on_progress: nil, parent: nil) + Context.new( + task: normalize_task(task), + scorers: normalize_scorers(scorers), + cases: normalize_cases(cases), + experiment_id: experiment_id, + experiment_name: experiment_name, + project_id: @project_id, + project_name: @project_name, + state: @state, + tracer_provider: @tracer_provider || OpenTelemetry.tracer_provider, + on_progress: on_progress, + parent_span_attr: resolve_parent_span_attr(parent), + generation: parent&.dig(:generation) + ) + end + + private + def normalize_cases(raw) case raw when Cases diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index fb020bee..25987062 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -24,8 +24,7 @@ class Runner # @param eval_context [Context] Normalized eval context def initialize(eval_context) @eval_context = eval_context - tracer_provider = eval_context.tracer_provider || OpenTelemetry.tracer_provider - @tracer = tracer_provider.tracer("braintrust-eval") + @tracer = eval_context.tracer_provider.tracer("braintrust-eval") # Mutex for thread-safe score collection @score_mutex = Mutex.new @@ -107,7 +106,7 @@ def run_eval_case(case_context, errors) end # Flush spans so they're queryable via BTQL, then build trace - eval_context.tracer_provider&.force_flush + eval_context.tracer_provider.force_flush if eval_context.tracer_provider.respond_to?(:force_flush) case_context.trace = build_trace(eval_span) # Run scorers diff --git a/test/braintrust/eval/context_test.rb b/test/braintrust/eval/context_test.rb index ac4b1128..1f596e36 100644 --- a/test/braintrust/eval/context_test.rb +++ b/test/braintrust/eval/context_test.rb @@ -6,38 +6,30 @@ # Unit tests for Eval::Context and Context::Factory class Braintrust::Eval::ContextTest < Minitest::Test # ============================================ - # normalize_task + # Factory#build — task normalization # ============================================ - def test_normalize_task_passes_through_task_instance + def test_build_passes_through_task_instance task = Braintrust::Task.new("my_task") { |input:| input.upcase } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_task(task) - assert_same task, result + ctx = build_context(task: task) + assert_same task, ctx.task end - def test_normalize_task_wraps_lambda_with_kwargs - lam = ->(input:) { input.upcase } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_task(lam) - assert_kind_of Braintrust::Task, result - assert_equal "HELLO", result.call(input: "hello") + def test_build_wraps_lambda_task + ctx = build_context(task: ->(input:) { input.upcase }) + assert_kind_of Braintrust::Task, ctx.task + assert_equal "HELLO", ctx.task.call(input: "hello") end - def test_normalize_task_wraps_legacy_positional_lambda + def test_build_wraps_legacy_positional_lambda_task suppress_logs do - lam = ->(input) { input.upcase } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_task(lam) - assert_kind_of Braintrust::Task, result - assert_equal "HELLO", result.call(input: "hello") + ctx = build_context(task: ->(input) { input.upcase }) + assert_kind_of Braintrust::Task, ctx.task + assert_equal "HELLO", ctx.task.call(input: "hello") end end - def test_normalize_task_wraps_callable_class_with_kwargs + def test_build_wraps_callable_class_task callable = Class.new do def initialize(prefix) @prefix = prefix @@ -52,15 +44,13 @@ def call(input:) end end.new("Result") - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_task(callable) - assert_kind_of Braintrust::Task, result - assert_equal "prefixer", result.name - assert_equal "Result: hello", result.call(input: "hello") + ctx = build_context(task: callable) + assert_kind_of Braintrust::Task, ctx.task + assert_equal "prefixer", ctx.task.name + assert_equal "Result: hello", ctx.task.call(input: "hello") end - def test_normalize_task_callable_class_preserves_instance_state + def test_build_callable_class_task_preserves_instance_state callable = Class.new do attr_accessor :mode @@ -69,41 +59,48 @@ def call(input:) end end.new - factory = Braintrust::Eval::Context::Factory.new - callable.mode = :shout - task = factory.normalize_task(callable) - assert_equal "HELLO", task.call(input: "hello") + ctx = build_context(task: callable) + assert_equal "HELLO", ctx.task.call(input: "hello") callable.mode = :whisper - assert_equal "hello", task.call(input: "HELLO") + assert_equal "hello", ctx.task.call(input: "HELLO") + end + + def test_build_callable_class_task_without_name + callable = Class.new do + def call(input:) + input.upcase + end + end.new + + ctx = build_context(task: callable) + assert_kind_of Braintrust::Task, ctx.task + assert_equal "task", ctx.task.name + assert_equal "HELLO", ctx.task.call(input: "hello") end # ============================================ - # normalize_scorers + # Factory#build — scorer normalization # ============================================ - def test_normalize_scorers_passes_through_scorer_instance + def test_build_passes_through_scorer_instance scorer = Braintrust::Scorer.new("exact") { |expected:, output:| (output == expected) ? 1.0 : 0.0 } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_scorers([scorer]) - assert_equal 1, result.length - assert_same scorer, result.first + ctx = build_context(scorers: [scorer]) + assert_equal 1, ctx.scorers.length + assert_same scorer, ctx.scorers.first end - def test_normalize_scorers_wraps_lambda_with_kwargs + def test_build_wraps_lambda_scorer lam = ->(expected:, output:) { (output == expected) ? 1.0 : 0.0 } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_scorers([lam]) - assert_equal 1, result.length - assert_kind_of Braintrust::Scorer, result.first + ctx = build_context(scorers: [lam]) + assert_equal 1, ctx.scorers.length + assert_kind_of Braintrust::Scorer, ctx.scorers.first assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], - result.first.call(input: "x", expected: "YES", output: "YES") + ctx.scorers.first.call(input: "x", expected: "YES", output: "YES") end - def test_normalize_scorers_wraps_callable_class_with_kwargs + def test_build_wraps_callable_class_scorer callable = Class.new do def initialize(threshold) @threshold = threshold @@ -118,116 +115,41 @@ def call(expected:, output:) end end.new(0.5) - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_scorers([callable]) - assert_equal 1, result.length - assert_equal "threshold_scorer", result.first.name + ctx = build_context(scorers: [callable]) + assert_equal 1, ctx.scorers.length + assert_equal "threshold_scorer", ctx.scorers.first.name assert_equal [{score: 0.5, metadata: nil, name: "threshold_scorer"}], - result.first.call(input: "x", expected: "a", output: "b") - end - - # ============================================ - # normalize_cases - # ============================================ - - def test_normalize_cases_passes_through_cases_instance - cases = Braintrust::Eval::Cases.new([{input: "a"}]) - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_cases(cases) - assert_same cases, result - end - - def test_normalize_cases_wraps_array - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_cases([{input: "a"}, {input: "b"}]) - assert_instance_of Braintrust::Eval::Cases, result - assert_equal 2, result.to_a.length - end - - # ============================================ - # resolve_parent_span_attr - # ============================================ - - def test_resolve_parent_span_attr_returns_nil_for_nil - factory = Braintrust::Eval::Context::Factory.new - assert_nil factory.resolve_parent_span_attr(nil) - end - - def test_resolve_parent_span_attr_formats_correctly - factory = Braintrust::Eval::Context::Factory.new - result = factory.resolve_parent_span_attr(object_type: "experiment_id", object_id: "exp-123") - assert_equal "experiment_id:exp-123", result - end - - # ============================================ - # Context.build - # ============================================ - - def test_build_extracts_generation_from_parent - ctx = Braintrust::Eval::Context.build( - task: ->(input:) { input }, - scorers: [], - cases: [{input: "a"}], - parent: {object_type: "experiment_id", object_id: "exp-1", generation: 42} - ) - assert_equal 42, ctx.generation - assert_equal "experiment_id:exp-1", ctx.parent_span_attr - end - - # ============================================ - # Factory edge cases - # ============================================ - - def test_normalize_task_callable_class_without_name - callable = Class.new do - def call(input:) - input.upcase - end - end.new - - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_task(callable) - assert_kind_of Braintrust::Task, result - assert_equal "task", result.name - assert_equal "HELLO", result.call(input: "hello") + ctx.scorers.first.call(input: "x", expected: "a", output: "b") end - def test_normalize_scorers_wraps_lambda + def test_build_wraps_lambda_scorer_alternate_arg_order lam = ->(output:, expected:) { (output == expected) ? 1.0 : 0.0 } - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_scorers([lam]) - assert_equal 1, result.length - assert_kind_of Braintrust::Scorer, result.first + ctx = build_context(scorers: [lam]) + assert_equal 1, ctx.scorers.length + assert_kind_of Braintrust::Scorer, ctx.scorers.first assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], - result.first.call(input: "x", expected: "a", output: "a") + ctx.scorers.first.call(input: "x", expected: "a", output: "a") end - def test_normalize_scorers_callable_class_without_name + def test_build_callable_class_scorer_without_name callable = Class.new do def call(output:, expected:) (output == expected) ? 1.0 : 0.0 end end.new - factory = Braintrust::Eval::Context::Factory.new - - result = factory.normalize_scorers([callable]) - assert_equal 1, result.length - assert_equal "scorer", result.first.name + ctx = build_context(scorers: [callable]) + assert_equal 1, ctx.scorers.length + assert_equal "scorer", ctx.scorers.first.name assert_equal [{score: 1.0, metadata: nil, name: "scorer"}], - result.first.call(input: "x", expected: "a", output: "a") + ctx.scorers.first.call(input: "x", expected: "a", output: "a") end # ============================================ - # normalize_scorers — String slug resolution + # Factory#build — scorer slug/ID resolution # ============================================ - def test_normalize_scorers_resolves_string_slug + def test_build_resolves_string_scorer_slug fake_scorer = Braintrust::Scorer.new("resolved") { |**| 1.0 } resolved_kwargs = nil @@ -235,15 +157,17 @@ def test_normalize_scorers_resolves_string_slug resolved_kwargs = kw fake_scorer }) do - factory = Braintrust::Eval::Context::Factory.new( + ctx = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: ["my-scorer-slug"], + cases: [{input: "a"}], project_name: "my-project", state: :fake_state, tracer_provider: :fake_tp ) - result = factory.normalize_scorers(["my-scorer-slug"]) - assert_equal 1, result.length - assert_same fake_scorer, result.first + assert_equal 1, ctx.scorers.length + assert_same fake_scorer, ctx.scorers.first assert_equal "my-project", resolved_kwargs[:project] assert_equal "my-scorer-slug", resolved_kwargs[:slug] assert_equal :fake_state, resolved_kwargs[:state] @@ -251,20 +175,14 @@ def test_normalize_scorers_resolves_string_slug end end - def test_normalize_scorers_string_slug_raises_without_project - factory = Braintrust::Eval::Context::Factory.new - + def test_build_string_scorer_slug_raises_without_project error = assert_raises(ArgumentError) do - factory.normalize_scorers(["some-slug"]) + build_context(scorers: ["some-slug"]) end assert_match(/project is required/, error.message) end - # ============================================ - # normalize_scorers — Scorer::ID resolution - # ============================================ - - def test_normalize_scorers_resolves_scorer_id + def test_build_resolves_scorer_id fake_scorer = Braintrust::Scorer.new("resolved") { |**| 1.0 } resolved_kwargs = nil @@ -272,12 +190,17 @@ def test_normalize_scorers_resolves_scorer_id resolved_kwargs = kw fake_scorer }) do - factory = Braintrust::Eval::Context::Factory.new(state: :fake_state, tracer_provider: :fake_tp) scorer_id = Braintrust::Scorer::ID.new(function_id: "func-abc", version: "v3") - result = factory.normalize_scorers([scorer_id]) + ctx = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [scorer_id], + cases: [{input: "a"}], + state: :fake_state, + tracer_provider: :fake_tp + ) - assert_equal 1, result.length - assert_same fake_scorer, result.first + assert_equal 1, ctx.scorers.length + assert_same fake_scorer, ctx.scorers.first assert_equal "func-abc", resolved_kwargs[:id] assert_equal "v3", resolved_kwargs[:version] assert_equal :fake_state, resolved_kwargs[:state] @@ -285,7 +208,7 @@ def test_normalize_scorers_resolves_scorer_id end end - def test_normalize_scorers_resolves_deprecated_scorer_id_alias + def test_build_resolves_deprecated_scorer_id_alias fake_scorer = Braintrust::Scorer.new("resolved") { |**| 1.0 } resolved_kwargs = nil @@ -293,37 +216,78 @@ def test_normalize_scorers_resolves_deprecated_scorer_id_alias resolved_kwargs = kw fake_scorer }) do - factory = Braintrust::Eval::Context::Factory.new(state: :fake_state) scorer_id = Braintrust::ScorerId.new(function_id: "func-legacy", version: "v1") - result = factory.normalize_scorers([scorer_id]) + ctx = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [scorer_id], + cases: [{input: "a"}], + state: :fake_state + ) - assert_equal 1, result.length - assert_same fake_scorer, result.first + assert_equal 1, ctx.scorers.length + assert_same fake_scorer, ctx.scorers.first assert_equal "func-legacy", resolved_kwargs[:id] assert_equal "v1", resolved_kwargs[:version] end end - def test_normalize_cases_rejects_non_enumerable - factory = Braintrust::Eval::Context::Factory.new + # ============================================ + # Factory#build — cases normalization + # ============================================ + + def test_build_passes_through_cases_instance + cases = Braintrust::Eval::Cases.new([{input: "a"}]) + ctx = build_context(cases: cases) + assert_same cases, ctx.cases + end - assert_raises(ArgumentError) do - factory.normalize_cases("not enumerable") - end + def test_build_wraps_array_cases + ctx = build_context(cases: [{input: "a"}, {input: "b"}]) + assert_instance_of Braintrust::Eval::Cases, ctx.cases + assert_equal 2, ctx.cases.to_a.length end - def test_normalize_cases_wraps_custom_enumerable + def test_build_wraps_custom_enumerable_cases enum = Object.new def enum.each(&block) [{input: "a"}, {input: "b"}].each(&block) end - factory = Braintrust::Eval::Context::Factory.new + ctx = build_context(cases: enum) + assert_instance_of Braintrust::Eval::Cases, ctx.cases + end + + def test_build_rejects_non_enumerable_cases + assert_raises(ArgumentError) do + build_context(cases: "not enumerable") + end + end + + # ============================================ + # Factory#build — parent resolution + # ============================================ + + def test_build_extracts_generation_from_parent + ctx = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + cases: [{input: "a"}], + parent: {object_type: "experiment_id", object_id: "exp-1", generation: 42} + ) + assert_equal 42, ctx.generation + assert_equal "experiment_id:exp-1", ctx.parent_span_attr + end - result = factory.normalize_cases(enum) - assert_instance_of Braintrust::Eval::Cases, result + def test_build_nil_parent + ctx = build_context + assert_nil ctx.parent_span_attr + assert_nil ctx.generation end + # ============================================ + # Context.build — field pass-through + # ============================================ + def test_build_passes_through_all_fields on_progress = ->(_) {} ctx = Braintrust::Eval::Context.build( @@ -344,4 +308,26 @@ def test_build_passes_through_all_fields assert_nil ctx.parent_span_attr assert_nil ctx.generation end + + def test_build_defaults_tracer_provider_to_global + ctx = build_context + assert_same OpenTelemetry.tracer_provider, ctx.tracer_provider + end + + def test_build_uses_explicit_tracer_provider + fake_tp = Object.new + ctx = Braintrust::Eval::Context.build( + task: ->(input:) { input }, + scorers: [], + cases: [{input: "a"}], + tracer_provider: fake_tp + ) + assert_same fake_tp, ctx.tracer_provider + end + + private + + def build_context(task: ->(input:) { input }, scorers: [], cases: [{input: "a"}], **kwargs) + Braintrust::Eval::Context.build(task: task, scorers: scorers, cases: cases, **kwargs) + end end From 8f5be4cf784403b4ca858d2426fd661a0416d3e5 Mon Sep 17 00:00:00 2001 From: David Elner Date: Sun, 22 Mar 2026 23:42:12 +0000 Subject: [PATCH 2/3] Changed: Use kase over case_context for brevity. --- lib/braintrust/eval/runner.rb | 66 +++++++++++++++++------------------ 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/lib/braintrust/eval/runner.rb b/lib/braintrust/eval/runner.rb index 25987062..b4920436 100644 --- a/lib/braintrust/eval/runner.rb +++ b/lib/braintrust/eval/runner.rb @@ -78,50 +78,50 @@ def run(parallelism: 1) # Run a single test case with OpenTelemetry tracing # Creates eval span (parent) with task and score as children - # @param case_context [CaseContext] The per-case accumulator + # @param kase [CaseContext] The per-case accumulator # @param errors [Queue] Thread-safe error collection queue - def run_eval_case(case_context, errors) + def run_eval_case(kase, errors) # Each eval case starts its own trace — detach from any ambient span context eval_span = tracer.start_root_span("eval") OpenTelemetry::Trace.with_span(eval_span) do # Set attributes known before task execution eval_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr set_json_attr(eval_span, "braintrust.span_attributes", build_span_attributes("eval")) - set_json_attr(eval_span, "braintrust.input_json", {input: case_context.input}) - set_json_attr(eval_span, "braintrust.expected", case_context.expected) if case_context.expected - set_json_attr(eval_span, "braintrust.metadata", case_context.metadata) if case_context.metadata - eval_span.set_attribute("braintrust.tags", case_context.tags) if case_context.tags - eval_span.set_attribute("braintrust.origin", case_context.origin) if case_context.origin + set_json_attr(eval_span, "braintrust.input_json", {input: kase.input}) + set_json_attr(eval_span, "braintrust.expected", kase.expected) if kase.expected + set_json_attr(eval_span, "braintrust.metadata", kase.metadata) if kase.metadata + eval_span.set_attribute("braintrust.tags", kase.tags) if kase.tags + eval_span.set_attribute("braintrust.origin", kase.origin) if kase.origin # Run task begin - case_context.output = run_task(case_context) + kase.output = run_task(kase) rescue => e # Error already recorded on task span, set eval span status eval_span.status = OpenTelemetry::Trace::Status.error(e.message) set_json_attr(eval_span, "braintrust.output_json", {output: nil}) - errors << "Task failed for input '#{case_context.input}': #{e.message}" - report_progress(eval_span, case_context, error: e.message) + errors << "Task failed for input '#{kase.input}': #{e.message}" + report_progress(eval_span, kase, error: e.message) next end # Flush spans so they're queryable via BTQL, then build trace eval_context.tracer_provider.force_flush if eval_context.tracer_provider.respond_to?(:force_flush) - case_context.trace = build_trace(eval_span) + kase.trace = build_trace(eval_span) # Run scorers begin - run_scorers(case_context) + run_scorers(kase) rescue => e # Error already recorded on score span, set eval span status eval_span.status = OpenTelemetry::Trace::Status.error(e.message) - errors << "Scorers failed for input '#{case_context.input}': #{e.message}" + errors << "Scorers failed for input '#{kase.input}': #{e.message}" end # Set output after task completes - set_json_attr(eval_span, "braintrust.output_json", {output: case_context.output}) + set_json_attr(eval_span, "braintrust.output_json", {output: kase.output}) - report_progress(eval_span, case_context, data: case_context.output) + report_progress(eval_span, kase, data: kase.output) end ensure eval_span&.finish @@ -129,17 +129,17 @@ def run_eval_case(case_context, errors) # Run task with OpenTelemetry tracing # Creates task span with input and output - # @param case_context [CaseContext] The per-case context + # @param kase [CaseContext] The per-case context # @return [Object] Task output - def run_task(case_context) + def run_task(kase) tracer.in_span("task") do |task_span| task_span.set_attribute("braintrust.parent", eval_context.parent_span_attr) if eval_context.parent_span_attr set_json_attr(task_span, "braintrust.span_attributes", build_span_attributes("task")) - set_json_attr(task_span, "braintrust.input_json", case_context.input) + set_json_attr(task_span, "braintrust.input_json", kase.input) begin output = eval_context.task.call( - input: case_context.input + input: kase.input ) set_json_attr(task_span, "braintrust.output_json", output) output @@ -154,20 +154,20 @@ def run_task(case_context) # Run scorers with OpenTelemetry tracing. # Creates one span per scorer, each a direct child of the current (eval) span. - # @param case_context [CaseContext] The per-case context (output must be populated) - def run_scorers(case_context) + # @param kase [CaseContext] The per-case context (output must be populated) + def run_scorers(kase) scorer_kwargs = { - input: case_context.input, - expected: case_context.expected, - output: case_context.output, - metadata: case_context.metadata || {}, - trace: case_context.trace + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {}, + trace: kase.trace } scorer_input = { - input: case_context.input, - expected: case_context.expected, - output: case_context.output, - metadata: case_context.metadata || {} + input: kase.input, + expected: kase.expected, + output: kase.output, + metadata: kase.metadata || {} } scorer_error = nil @@ -240,11 +240,11 @@ def build_case_context(eval_case) # Report progress for a case via on_progress callback. # Rescues errors in the callback so a broken handler never crashes the eval. - def report_progress(eval_span, case_context, **fields) + def report_progress(eval_span, kase, **fields) return unless eval_context.on_progress progress = {"id" => eval_span.context.hex_span_id}.merge(fields.transform_keys(&:to_s)) - if case_context.origin - progress["origin"] = case_context.origin.is_a?(String) ? JSON.parse(case_context.origin) : case_context.origin + if kase.origin + progress["origin"] = kase.origin.is_a?(String) ? JSON.parse(kase.origin) : kase.origin end eval_context.on_progress.call(progress) rescue => e From 4db190611f3368bd508e132f2c6a1d72e0891857 Mon Sep 17 00:00:00 2001 From: David Elner Date: Sun, 22 Mar 2026 23:51:26 +0000 Subject: [PATCH 3/3] Added: YARD doc comments --- lib/braintrust/eval/context.rb | 47 ++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/lib/braintrust/eval/context.rb b/lib/braintrust/eval/context.rb index b422621e..a35df7d1 100644 --- a/lib/braintrust/eval/context.rb +++ b/lib/braintrust/eval/context.rb @@ -11,6 +11,18 @@ class Context :project_id, :project_name, :state, :tracer_provider, :on_progress, :parent_span_attr, :generation + # @param task [Task] Normalized task wrapper + # @param scorers [Array] Normalized scorer wrappers + # @param cases [Cases] Normalized eval cases + # @param experiment_id [String, nil] Experiment ID for logging and trace linkage + # @param experiment_name [String, nil] Experiment name, included in span attributes + # @param project_id [String, nil] Project ID + # @param project_name [String, nil] Project name + # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals + # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider + # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash + # @param parent_span_attr [String, nil] Formatted parent span identifier ("type:id"), linking spans to a parent context + # @param generation [Integer, nil] Generation number from the parent span context, used to link spans in a trace hierarchy def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, project_id: nil, project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, parent_span_attr: nil, generation: nil) @@ -30,6 +42,18 @@ def initialize(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # Build a Context from raw user inputs. # Delegates to Factory for normalization. + # @param task [Task, Proc, #call] Task to evaluate; wrapped into a {Task} if needed + # @param scorers [Array] Scorers; each is normalized into a {Scorer} + # @param cases [Cases, Array, Enumerable] Eval cases; wrapped into {Cases} if needed + # @param experiment_id [String, nil] Experiment ID for logging + # @param experiment_name [String, nil] Experiment name, included in span attributes + # @param project_id [String, nil] Project ID + # @param project_name [String, nil] Project name; required when resolving scorer slugs + # @param state [Braintrust::State, nil] Authenticated API state; nil for local-only evals + # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; defaults to global provider + # @param on_progress [Proc, nil] Callback invoked after each case completes, receiving a progress Hash + # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation + # @return [Context] def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, project_id: nil, project_name: nil, state: nil, tracer_provider: nil, on_progress: nil, parent: nil) @@ -45,6 +69,10 @@ def self.build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil # Encapsulates normalization of raw user inputs into typed wrappers. class Factory + # @param state [Braintrust::State, nil] Authenticated API state; passed through to scorer resolution + # @param tracer_provider [#tracer, nil] OpenTelemetry tracer provider; passed through to remote scorers + # @param project_id [String, nil] Project ID; passed through to the built Context + # @param project_name [String, nil] Project name; required when resolving scorer slugs def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: nil) @state = state @tracer_provider = tracer_provider @@ -52,6 +80,15 @@ def initialize(state: nil, tracer_provider: nil, project_id: nil, project_name: @project_name = project_name end + # Normalize raw inputs and construct a {Context}. + # @param task [Task, Proc, #call] Raw task + # @param scorers [Array] Raw scorers + # @param cases [Cases, Array, Enumerable] Raw eval cases + # @param experiment_id [String, nil] + # @param experiment_name [String, nil] + # @param on_progress [Proc, nil] + # @param parent [Hash, nil] Parent span info with keys :object_type, :object_id, and optionally :generation + # @return [Context] def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, on_progress: nil, parent: nil) Context.new( @@ -72,6 +109,9 @@ def build(task:, scorers:, cases:, experiment_id: nil, experiment_name: nil, private + # @param raw [Cases, Array, Enumerable, #each] + # @return [Cases] + # @raise [ArgumentError] if raw is not enumerable def normalize_cases(raw) case raw when Cases @@ -87,11 +127,15 @@ def normalize_cases(raw) end end + # @param parent [Hash, nil] + # @return [String, nil] Formatted as "type:id", e.g. "experiment_id:abc-123" def resolve_parent_span_attr(parent) return nil unless parent "#{parent[:object_type]}:#{parent[:object_id]}" end + # @param raw [Task, Proc, #call] + # @return [Task] def normalize_task(raw) case raw when Task @@ -107,6 +151,9 @@ def normalize_task(raw) end end + # @param raw [Array] + # @return [Array] + # @raise [ArgumentError] if a String slug is given without a project name def normalize_scorers(raw) raw.map do |scorer| case scorer