Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ tests/evals/js/eval-bun/test-data.txt
__pycache__

bt-sync
*.env
15 changes: 15 additions & 0 deletions scripts/eval-runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -435,6 +435,21 @@ def load_evaluators(files: list[str]) -> tuple[list[EvaluatorInstance], dict[str
cwd = os.getcwd()
if cwd not in sys.path:
sys.path.insert(0, cwd)

# Add the project root inferred from input files to sys.path so that
# sibling-package imports work when files live outside CWD (e.g.
# sandbox bundles extracted to a temp directory). Walk up from each
# file's directory looking for a register.py (bundle marker) or the
# filesystem root, whichever comes first.
for f in files:
d = os.path.dirname(os.path.abspath(f))
while d and d != os.path.dirname(d):
if os.path.isfile(os.path.join(d, "register.py")):
if d not in sys.path:
sys.path.insert(0, d)
break
d = os.path.dirname(d)

unique_files: set[str] = set()
for file_path in files:
for candidate in collect_files(file_path):
Expand Down
5 changes: 4 additions & 1 deletion scripts/functions-bundler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,10 @@ async function main(): Promise<void> {
const externalPackages = parseExternalPackages(
process.env.BT_FUNCTIONS_PUSH_EXTERNAL_PACKAGES,
);
const external = buildExternalPackagePatterns(externalPackages);
const selfContained = process.env.BT_FUNCTIONS_PUSH_SELF_CONTAINED === "1";
const external = selfContained
? ["fsevents", "chokidar"]
: buildExternalPackagePatterns(externalPackages);
const tsconfig = loadTsconfigPath();

const outputDir = path.dirname(outputFile);
Expand Down
140 changes: 130 additions & 10 deletions scripts/functions-runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import inspect
import json
import os
import re
import sys
from contextlib import nullcontext
from typing import Any
Expand All @@ -28,9 +29,9 @@ def to_json_value(value: Any) -> Any:
return [to_json_value(item) for item in value]
if isinstance(value, dict):
return {str(key): to_json_value(val) for key, val in value.items()}
if hasattr(value, "model_dump"):
if hasattr(value, "model_dump") and not isinstance(value, type):
return to_json_value(value.model_dump())
if hasattr(value, "dict"):
if hasattr(value, "dict") and not isinstance(value, type):
return to_json_value(value.dict())
if hasattr(value, "__dict__"):
result: dict[str, Any] = {}
Expand All @@ -42,21 +43,31 @@ def to_json_value(value: Any) -> Any:
return str(value)


def load_framework_globals() -> tuple[Any, Any, Any, Any]:
    """Locate the braintrust SDK's global registries across SDK layouts.

    Prefers the current SDK layout first:
      - ``braintrust.framework2`` exposes module-level ``global_``
      - ``braintrust.framework`` exposes ``_set_lazy_load``
    and falls back to the older layout where ``functions``/``prompts`` and
    ``_set_lazy_load`` live in dedicated submodules.

    Returns:
        A 4-tuple ``(functions, prompts, lazy_load_setter, evals_registry)``.
        ``evals_registry`` is ``None`` on SDK versions that do not expose
        ``braintrust.framework._evals``.

    Raises:
        ImportError / ModuleNotFoundError: when neither SDK layout can be
        imported (propagated from the fallback branch).
    """

    def _load_evals() -> Any:
        # The eval registry is optional on older SDKs; treat its absence as
        # None rather than failing the whole lookup. Shared by both layout
        # branches so the fallback logic lives in exactly one place.
        try:
            from braintrust.framework import _evals

            return _evals
        except (ImportError, ModuleNotFoundError):
            return None

    try:
        from braintrust.framework import _set_lazy_load as lazy
        from braintrust.framework2 import global_ as global_state

        return global_state.functions, global_state.prompts, lazy, _load_evals()
    except (ImportError, ModuleNotFoundError):
        # Backward compatibility with older SDK layout.
        from braintrust.framework2.global_ import functions, prompts
        from braintrust.framework2.lazy_load import _set_lazy_load as lazy

        return functions, prompts, lazy, _load_evals()


def normalize_project_selector(project: Any) -> tuple[str | None, str | None]:
Expand Down Expand Up @@ -277,16 +288,113 @@ async def collect_function_event_entries(prompts_registry: Any) -> list[dict[str
return entries


def slugify(text: str) -> str:
    """Lower-case *text*, collapse each run of non-alphanumeric characters
    into a single hyphen, and trim hyphens from both ends."""
    collapsed = re.sub(r"[^a-z0-9]+", "-", text.lower())
    # The collapse step leaves at most one hyphen at each end.
    return collapsed.strip("-")


def collect_evaluator_entries(evals_registry: Any, source_file: str) -> list[dict[str, Any]]:
    """Translate the SDK's registered evaluators into manifest entries.

    Only a single "sandbox" function entry is emitted per evaluator; task
    and scorer entries are pushed separately when the eval is actually run,
    matching the Python SDK behavior.

    Args:
        evals_registry: The braintrust eval registry object (may be ``None``
            on SDK versions without one); expected to carry an
            ``evaluators`` dict keyed by eval name.
        source_file: Path of the eval source file being processed.

    Returns:
        A (possibly empty) list of manifest entry dicts.
    """
    if evals_registry is None:
        return []

    registry_map = getattr(evals_registry, "evaluators", None)
    if not registry_map or not isinstance(registry_map, dict):
        return []

    # "foo.eval.py" and "foo.py" both reduce to the stem "foo".
    stem = re.sub(r"\.eval$", "", os.path.splitext(os.path.basename(source_file))[0])

    collected: list[dict[str, Any]] = []
    for eval_name, instance in registry_map.items():
        if instance is None:
            continue
        evaluator = getattr(instance, "evaluator", None)
        if evaluator is None:
            continue

        raw_project_name = getattr(evaluator, "project_name", None)
        selector = (
            {"project_name": raw_project_name}
            if isinstance(raw_project_name, str)
            else None
        )
        project_id, project_name = normalize_project_selector(selector)

        descriptors: list[dict[str, Any]] = []
        for idx, score_fn in enumerate(getattr(evaluator, "scores", []) or []):
            # Fall back to a positional name for anonymous scorers.
            descriptors.append({"name": getattr(score_fn, "__name__", f"scorer_{idx}")})

        definition: dict[str, Any] = {"scores": descriptors}

        raw_params = getattr(evaluator, "parameters", None)
        if raw_params is not None:
            if getattr(raw_params, "__braintrust_parameters_marker", None) is True:
                # Remote parameters object: record its schema plus where it
                # came from so the server can resolve it.
                definition["parameters"] = {
                    "type": "braintrust.parameters",
                    "schema": getattr(raw_params, "schema", None),
                    "source": {
                        "parametersId": getattr(raw_params, "id", None),
                        "slug": getattr(raw_params, "slug", None),
                        "name": getattr(raw_params, "name", None),
                        "projectId": getattr(raw_params, "projectId", None),
                        "version": getattr(raw_params, "version", None),
                    },
                }
            else:
                # Use the braintrust SDK's parameters_to_json_schema when
                # available so that Pydantic model classes are converted to
                # proper staticParametersSchema entries (type: "data" with a
                # JSON Schema) that the UI can parse.
                try:
                    from braintrust.parameters import parameters_to_json_schema

                    serialized = parameters_to_json_schema(raw_params)
                except Exception:
                    serialized = to_json_value(raw_params)
                if serialized is not None:
                    definition["parameters"] = serialized

        # Sandbox entry only — task and scorer entries are pushed separately
        # when the eval is actually run, matching the Python SDK behavior.
        entry: dict[str, Any] = {"kind": "code"}
        if project_id:
            entry["project_id"] = project_id
        if project_name:
            entry["project_name"] = project_name
        entry.update(
            {
                "name": f"Eval {eval_name} sandbox",
                "slug": slugify(f"{stem}-{eval_name}-sandbox"),
                "function_type": "sandbox",
                "location": {
                    "type": "sandbox",
                    "sandbox_spec": {"provider": "lambda"},
                    "entrypoints": [os.path.relpath(source_file)],
                    "eval_name": eval_name,
                    "evaluator_definition": definition,
                },
                "metadata": {"_bt_sandbox_group_name": stem},
            }
        )
        collected.append(entry)

    return collected


async def process_file(file_path: str) -> dict[str, Any]:
abs_path = os.path.abspath(file_path)
cwd = os.getcwd()
if cwd not in sys.path:
sys.path.insert(0, cwd)

purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"})
functions_registry, prompts_registry, lazy_loader = load_framework_globals()
functions_registry, prompts_registry, lazy_loader, evals_registry = load_framework_globals()
clear_registry(functions_registry)
clear_registry(prompts_registry)
if (
evals_registry is not None
and hasattr(evals_registry, "evaluators")
and isinstance(evals_registry.evaluators, dict)
):
evals_registry.evaluators.clear()
purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"})

module_name = import_module_name_from_cwd(cwd, abs_path)
if module_name is None:
Expand All @@ -298,12 +406,13 @@ async def process_file(file_path: str) -> dict[str, Any]:
import_file(module_name, abs_path, extra_paths)
code_entries = collect_code_entries(functions_registry)
event_entries = await collect_function_event_entries(prompts_registry)
entries = [*code_entries, *event_entries]
evaluator_entries = collect_evaluator_entries(evals_registry, abs_path)
entries = [*code_entries, *event_entries, *evaluator_entries]
file_manifest: dict[str, Any] = {
"source_file": abs_path,
"entries": entries,
}
if code_entries:
if code_entries or evaluator_entries:
runner_root = os.path.dirname(os.path.abspath(__file__))
project_root = os.path.abspath(cwd)
path_rest: list[str] = []
Expand Down Expand Up @@ -350,13 +459,24 @@ async def process_file(file_path: str) -> dict[str, Any]:
continue
seen_sources.add(init_source)
bundled_sources.append(init_source)
# Compute entry_module as a CWD-relative dotted path so that the
# archive root inferred by push.rs walks back to CWD, matching
# the Python SDK behavior and allowing sibling-package imports.
rel_path = os.path.relpath(abs_path, cwd)
archive_module = re.sub(r"\.py$", "", rel_path).replace("-", "_").replace(os.sep, ".")
file_manifest["python_bundle"] = {
"entry_module": module_name,
"entry_module": archive_module,
"sources": bundled_sources,
}

clear_registry(functions_registry)
clear_registry(prompts_registry)
if (
evals_registry is not None
and hasattr(evals_registry, "evaluators")
and isinstance(evals_registry.evaluators, dict)
):
evals_registry.evaluators.clear()
return file_manifest


Expand Down
109 changes: 109 additions & 0 deletions scripts/functions-runner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,20 @@ type Manifest = {
files: ManifestFile[];
};

function slugify(input: string): string {
  // Lower-case, squash every run of non-alphanumerics into one hyphen,
  // then drop any hyphen left at either end.
  const hyphenated = input.toLowerCase().replace(/[^a-z0-9]+/g, "-");
  return hyphenated.replace(/^-+/, "").replace(/-+$/, "");
}

function extractScoreName(score: unknown, idx: number): string {
  // Scorers are usually plain functions; anything anonymous (empty name)
  // or non-callable gets a positional fallback name instead.
  const fallback = `scorer_${idx}`;
  if (
    typeof score === "function" &&
    typeof score.name === "string" &&
    score.name !== ""
  ) {
    return score.name;
  }
  return fallback;
}

type EvalRegistry = NonNullable<typeof globalThis._evals>;
type ZodToJsonSchemaFn = (schema: unknown) => unknown;

Expand Down Expand Up @@ -473,6 +487,97 @@ function collectCodeEntries(items: CodeRegistryItem[]): CodeEntry[] {
return entries;
}

/**
 * Convert registered evaluators from the global eval registry into manifest
 * entries.
 *
 * Exactly one "sandbox" function entry is emitted per evaluator; task and
 * scorer entries are pushed separately when the eval actually runs (see the
 * inline note below).
 *
 * @param evaluators Registry map keyed by eval name. Values are treated as
 *   opaque and skipped unless they look like `{ evaluator: {...} }`.
 * @param sourceFilePath Path of the eval source file being processed.
 * @returns Entries to append to the file manifest (possibly empty).
 */
function collectEvaluatorEntries(
  evaluators: Record<string, unknown>,
  sourceFilePath: string,
): CodeEntry[] {
  const entries: CodeEntry[] = [];
  // "foo.eval.ts" and "foo.ts" both reduce to the stem "foo"; the stem is
  // reused for the slug and the sandbox group name.
  const ext = path.extname(sourceFilePath);
  const stem = path.basename(sourceFilePath, ext).replace(/\.eval$/, "");

  for (const [evalName, entry] of Object.entries(evaluators)) {
    // Skip registry slots without a usable evaluator object.
    if (!entry || typeof entry !== "object") {
      continue;
    }

    const evaluator = (entry as Record<string, unknown>).evaluator;
    if (!evaluator || typeof evaluator !== "object") {
      continue;
    }

    const evalObj = evaluator as Record<string, unknown>;
    const projectName =
      typeof evalObj.project_name === "string" ? evalObj.project_name : undefined;
    const scores = Array.isArray(evalObj.scores) ? evalObj.scores : [];

    // Resolve the project reference (id and/or name) via the shared selector
    // helper; both results may be undefined when no project was specified.
    const selector = asProjectSelector(
      typeof projectName === "string" ? { name: projectName } : undefined,
    );
    const projectId =
      typeof selector.project_id === "string" ? selector.project_id : undefined;
    const selectorProjectName =
      typeof selector.project_name === "string"
        ? selector.project_name
        : undefined;

    const scoreDescriptors = scores.map((s: unknown, i: number) => ({
      name: extractScoreName(s, i),
    }));

    const evaluatorDefinition: JsonObject = {
      scores: scoreDescriptors as JsonValue,
    };

    const rawParams = evalObj.parameters;
    if (rawParams !== undefined && rawParams !== null) {
      // The marker distinguishes a remote parameters object — presumably set
      // by the SDK's parameters helper (TODO confirm against the SDK) — from
      // plain inline parameter values.
      const marker =
        rawParams !== null &&
        typeof rawParams === "object" &&
        (rawParams as Record<string, unknown>).__braintrust_parameters_marker === true;
      if (marker) {
        // Remote parameters: record the schema plus enough source
        // information for the server to resolve it.
        const paramObj = rawParams as Record<string, unknown>;
        evaluatorDefinition.parameters = toJsonValue({
          type: "braintrust.parameters",
          schema: paramObj.schema,
          source: {
            parametersId: paramObj.id,
            slug: paramObj.slug,
            name: paramObj.name,
            projectId: paramObj.projectId,
            version: paramObj.version,
          },
        } as JsonValue);
      } else {
        // Inline parameters: serialize as-is; drop them when the value is
        // not representable as JSON.
        const serialized = toJsonValue(rawParams as JsonValue);
        if (serialized !== undefined) {
          evaluatorDefinition.parameters = serialized;
        }
      }
    }

    // Sandbox entry only — task and scorer entries are pushed separately
    // when the eval is actually run, matching the Python SDK behavior.
    entries.push({
      kind: "code",
      project_id: projectId,
      project_name: selectorProjectName,
      name: `Eval ${evalName} sandbox`,
      slug: slugify(`${stem}-${evalName}-sandbox`),
      function_type: "sandbox",
      location: {
        type: "sandbox",
        sandbox_spec: { provider: "lambda" },
        // Entrypoint is recorded relative to CWD so the pushed bundle can
        // locate the file after extraction.
        entrypoints: [path.relative(process.cwd(), sourceFilePath)],
        eval_name: evalName,
        evaluator_definition: evaluatorDefinition as JsonValue,
      } as JsonValue,
      metadata: { _bt_sandbox_group_name: stem },
    });
  }

  return entries;
}

async function processFile(filePath: string): Promise<ManifestFile> {
const absolutePath = path.resolve(process.cwd(), filePath);
const fallbackRegistry = freshRegistry();
Expand All @@ -492,6 +597,10 @@ async function processFile(filePath: string): Promise<ManifestFile> {
registry.parameters as EventRegistryItem[],
false,
)),
...collectEvaluatorEntries(
registry.evaluators as Record<string, unknown>,
absolutePath,
),
];

return {
Expand Down
Loading
Loading