Skip to content

Commit e75fc49

Browse files
committed
Refactored: Extract retry logic into its own internal component
1 parent 44c169c commit e75fc49

7 files changed

Lines changed: 337 additions & 154 deletions

File tree

lib/braintrust/api/internal/btql.rb

Lines changed: 3 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -11,56 +11,26 @@ module Internal
1111
# Internal BTQL client for querying spans.
1212
# Not part of the public API — instantiated directly where needed.
1313
class BTQL
14-
# Maximum number of retries before returning partial results.
15-
# Covers both freshness lag (partially indexed) and ingestion lag
16-
# (spans not yet visible to BTQL after OTel flush).
17-
MAX_FRESHNESS_RETRIES = 7
18-
19-
# Base delay (seconds) between retries (doubles each attempt, capped).
20-
FRESHNESS_BASE_DELAY = 1.0
21-
22-
# Maximum delay (seconds) between retries. Caps exponential growth
23-
# so we keep polling at a reasonable rate in the later window.
24-
# Schedule: 1, 2, 4, 8, 8, 8, 8 = ~39s total worst-case.
25-
MAX_FRESHNESS_DELAY = 8.0
26-
2714
def initialize(state)
2815
@state = state
2916
end
3017

3118
# Query spans belonging to a specific trace within an object.
3219
#
3320
# Builds a BTQL SQL query that matches the root_span_id and excludes scorer spans.
34-
# Retries with exponential backoff if the response indicates data is not yet fresh.
21+
# Returns a single-shot result; callers are responsible for retry and error handling.
3522
#
3623
# @param object_type [String] e.g. "experiment"
3724
# @param object_id [String] Object UUID
3825
# @param root_span_id [String] Hex trace ID of the root span
39-
# @return [Array<Hash>] Parsed span data
26+
# @return [Array(Array<Hash>, String)] [rows, freshness]
4027
def trace_spans(object_type:, object_id:, root_span_id:)
4128
query = build_trace_query(
4229
object_type: object_type,
4330
object_id: object_id,
4431
root_span_id: root_span_id
4532
)
46-
payload = {query: query, fmt: "jsonl"}
47-
48-
retries = 0
49-
loop do
50-
rows, freshness = execute_query(payload)
51-
# Return when data is fresh AND non-empty, or we've exhausted retries.
52-
# We retry on empty even when "complete" because there is ingestion lag
53-
# between OTel flush and BTQL indexing — the server may report "complete"
54-
# before it knows about newly-flushed spans.
55-
return rows if (freshness == "complete" && !rows.empty?) || retries >= MAX_FRESHNESS_RETRIES
56-
57-
retries += 1
58-
delay = [FRESHNESS_BASE_DELAY * (2**(retries - 1)), MAX_FRESHNESS_DELAY].min
59-
sleep(delay)
60-
end
61-
rescue => e
62-
Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
63-
[]
33+
execute_query(query: query, fmt: "jsonl")
6434
end
6535

6636
private

lib/braintrust/eval/runner.rb

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
require_relative "trace"
77
require_relative "../internal/thread_pool"
88
require_relative "../api/internal/btql"
9+
require_relative "../internal/retry"
910

1011
require "opentelemetry/sdk"
1112
require "json"
@@ -223,9 +224,23 @@ def build_trace(eval_span)
223224
object_id = eval_context.experiment_id
224225
btql = API::Internal::BTQL.new(eval_context.state)
225226

226-
Eval::Trace.new(
227-
spans: -> { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
228-
)
227+
Eval::Trace.new(spans: -> { fetch_trace_spans(btql, object_type, object_id, root_span_id) })
228+
end
229+
230+
# Fetch trace spans with retry to handle freshness and ingestion lag.
231+
# @return [Array<Hash>] Parsed span data
232+
def fetch_trace_spans(btql, object_type, object_id, root_span_id)
233+
rows, _freshness = Internal::Retry.with_backoff(
234+
max_retries: 7, base_delay: 1.0, max_delay: 8.0,
235+
until: ->(result) {
236+
r, f = result
237+
f == "complete" && !r.empty?
238+
}
239+
) { btql.trace_spans(object_type: object_type, object_id: object_id, root_span_id: root_span_id) }
240+
rows || []
241+
rescue => e
242+
Braintrust::Log.warn("[BTQL] Query failed: #{e.message}")
243+
[]
229244
end
230245

231246
# Build a CaseContext from a Case struct

lib/braintrust/internal/retry.rb

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# frozen_string_literal: true
2+
3+
module Braintrust
4+
module Internal
5+
module Retry
6+
MAX_RETRIES = 7
7+
BASE_DELAY = 1.0
8+
MAX_DELAY = 8.0
9+
10+
# Retry a block with exponential backoff.
11+
#
12+
# The block is the task to attempt. Its return value is captured each attempt.
13+
#
14+
# @param max_retries [Integer] Maximum number of retries after the first attempt
15+
# @param base_delay [Float] Initial delay in seconds (doubles each retry)
16+
# @param max_delay [Float] Cap on delay between retries
17+
# @param until [Proc, nil] Optional condition — receives block result, truthy stops retrying.
18+
# When omitted, the block result's own truthiness decides.
19+
# @return The last block result (whether retries were exhausted or condition was met)
20+
#
21+
# @example Simple: retry until truthy
22+
# conn = Retry.with_backoff(max_retries: 5) { try_connect }
23+
#
24+
# @example With condition: retry until non-empty
25+
# data = Retry.with_backoff(until: ->(r) { r.any? }) { api.fetch }
26+
#
27+
def self.with_backoff(max_retries: MAX_RETRIES, base_delay: BASE_DELAY, max_delay: MAX_DELAY, until: nil, &task)
28+
check = binding.local_variable_get(:until)
29+
result = task.call
30+
retries = 0
31+
while retries < max_retries && !(check ? check.call(result) : result)
32+
retries += 1
33+
delay = [base_delay * (2**(retries - 1)), max_delay].min
34+
sleep(delay)
35+
result = task.call
36+
end
37+
result
38+
end
39+
end
40+
end
41+
end

test/braintrust/api/internal/btql_integration_test.rb

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -40,22 +40,23 @@ def test_trace_spans_queries_experiment
4040

4141
# Query back via BTQL
4242
btql = Braintrust::API::Internal::BTQL.new(state)
43-
result = btql.trace_spans(
43+
rows, freshness = btql.trace_spans(
4444
object_type: "experiment",
4545
object_id: experiment["id"],
4646
root_span_id: root_span_id
4747
)
4848

49-
refute_empty result, "BTQL should return spans for the trace"
49+
refute_empty rows, "BTQL should return spans for the trace"
50+
assert_equal "complete", freshness
5051

51-
result.each do |span|
52+
rows.each do |span|
5253
assert span.key?("span_id"), "span should have span_id"
5354
assert span.key?("root_span_id"), "span should have root_span_id"
5455
assert span.key?("span_attributes"), "span should have span_attributes"
5556
end
5657

5758
# Verify score spans are excluded by the query filter
58-
types = result.map { |s| s.dig("span_attributes", "type") }
59+
types = rows.map { |s| s.dig("span_attributes", "type") }
5960
refute_includes types, "score"
6061
ensure
6162
cleanup_experiment(experiments, experiment)
@@ -76,18 +77,13 @@ def test_trace_spans_returns_empty_for_nonexistent_trace
7677
)
7778

7879
btql = Braintrust::API::Internal::BTQL.new(state)
80+
rows, _freshness = btql.trace_spans(
81+
object_type: "experiment",
82+
object_id: experiment["id"],
83+
root_span_id: "0000000000000000ffffffffffffffff"
84+
)
7985

80-
Braintrust::API::Internal::BTQL.stub_const(:FRESHNESS_BASE_DELAY, 0.001) do
81-
Braintrust::API::Internal::BTQL.stub_const(:MAX_FRESHNESS_DELAY, 0.001) do
82-
result = btql.trace_spans(
83-
object_type: "experiment",
84-
object_id: experiment["id"],
85-
root_span_id: "0000000000000000ffffffffffffffff"
86-
)
87-
88-
assert_equal [], result
89-
end
90-
end
86+
assert_equal [], rows
9187
ensure
9288
cleanup_experiment(experiments, experiment)
9389
end

0 commit comments

Comments
 (0)