From 1e8e17c214fe53d508b374c5f38c4f72d0b9185b Mon Sep 17 00:00:00 2001 From: Nate Selvidge Date: Fri, 13 Mar 2026 20:12:25 +0000 Subject: [PATCH 1/5] add support for pushing sandboxes --- scripts/functions-runner.py | 121 ++++++++++- scripts/functions-runner.ts | 109 ++++++++++ src/functions/mod.rs | 3 + src/functions/push.rs | 51 +++++ .../list-sandbox-type-parses/fixture.json | 5 + tests/functions.rs | 196 ++++++++++++++++++ 6 files changed, 478 insertions(+), 7 deletions(-) create mode 100644 tests/functions-fixtures/list-sandbox-type-parses/fixture.json diff --git a/scripts/functions-runner.py b/scripts/functions-runner.py index 1f16f06..aa7bd05 100644 --- a/scripts/functions-runner.py +++ b/scripts/functions-runner.py @@ -3,6 +3,7 @@ import inspect import json import os +import re import sys from contextlib import nullcontext from typing import Any @@ -42,7 +43,7 @@ def to_json_value(value: Any) -> Any: return str(value) -def load_framework_globals() -> tuple[Any, Any, Any]: +def load_framework_globals() -> tuple[Any, Any, Any, Any]: # Prefer current SDK layout first: # - braintrust.framework2 exposes module-level `global_` # - braintrust.framework exposes `_set_lazy_load` @@ -50,13 +51,23 @@ def load_framework_globals() -> tuple[Any, Any, Any]: from braintrust.framework import _set_lazy_load as lazy from braintrust.framework2 import global_ as global_state - return global_state.functions, global_state.prompts, lazy + try: + from braintrust.framework import _evals + except (ImportError, ModuleNotFoundError): + _evals = None + + return global_state.functions, global_state.prompts, lazy, _evals except (ImportError, ModuleNotFoundError): # Backward compatibility with older SDK layout. from braintrust.framework2.global_ import functions, prompts from braintrust.framework2.lazy_load import _set_lazy_load as lazy - return functions, prompts, lazy + try: + from braintrust.framework import _evals + except (ImportError, ModuleNotFoundError): + _evals = None + + return functions, prompts, lazy, _evals def normalize_project_selector(project: Any) -> tuple[str | None, str | None]: @@ -277,16 +288,105 @@ async def collect_function_event_entries(prompts_registry: Any) -> list[dict[str return entries +def slugify(text: str) -> str: + return re.sub(r"^-|-$", "", re.sub(r"[^a-z0-9]+", "-", text.lower())) + + +def collect_evaluator_entries(evals_registry: Any, source_file: str) -> list[dict[str, Any]]: + if evals_registry is None: + return [] + + evaluators = getattr(evals_registry, "evaluators", None) + if not evaluators or not isinstance(evaluators, dict): + return [] + + entries: list[dict[str, Any]] = [] + stem_base, _ = os.path.splitext(os.path.basename(source_file)) + stem = re.sub(r"\.eval$", "", stem_base) + + for eval_name, instance in evaluators.items(): + if instance is None: + continue + evaluator = getattr(instance, "evaluator", None) + if evaluator is None: + continue + + project_name = getattr(evaluator, "project_name", None) + project_id, proj_name = normalize_project_selector( + {"project_name": project_name} if isinstance(project_name, str) else None + ) + + scores = getattr(evaluator, "scores", []) or [] + score_descriptors = [ + {"name": getattr(score, "__name__", f"scorer_{i}")} + for i, score in enumerate(scores) + ] + + evaluator_definition: dict[str, Any] = {"scores": score_descriptors} + + raw_params = getattr(evaluator, "parameters", None) + if raw_params is not None: + marker = getattr(raw_params, "__braintrust_parameters_marker", None) + if marker is True: + evaluator_definition["parameters"] = { + "type": "braintrust.parameters", + "schema": getattr(raw_params, "schema", None), + "source": { + "parametersId": getattr(raw_params, "id", None), + "slug": getattr(raw_params, "slug", None), + "name": getattr(raw_params, "name", None), + "projectId": getattr(raw_params, "projectId", None), + "version": getattr(raw_params, "version", None), + }, + } + else: + serialized = to_json_value(raw_params) + if serialized is not None: + evaluator_definition["parameters"] = serialized + + base_entry: dict[str, Any] = {"kind": "code"} + if project_id: + base_entry["project_id"] = project_id + if proj_name: + base_entry["project_name"] = proj_name + + # Sandbox entry only — task and scorer entries are pushed separately + # when the eval is actually run, matching the Python SDK behavior. + sandbox_entry = { + **base_entry, + "name": f"Eval {eval_name} sandbox", + "slug": slugify(f"{stem}-{eval_name}-sandbox"), + "function_type": "sandbox", + "location": { + "type": "sandbox", + "sandbox_spec": {"provider": "lambda"}, + "entrypoints": [source_file], + "eval_name": eval_name, + "evaluator_definition": evaluator_definition, + }, + "metadata": {"_bt_sandbox_group_name": stem}, + } + entries.append(sandbox_entry) + + return entries + + async def process_file(file_path: str) -> dict[str, Any]: abs_path = os.path.abspath(file_path) cwd = os.getcwd() if cwd not in sys.path: sys.path.insert(0, cwd) - purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"}) - functions_registry, prompts_registry, lazy_loader = load_framework_globals() + functions_registry, prompts_registry, lazy_loader, evals_registry = load_framework_globals() clear_registry(functions_registry) clear_registry(prompts_registry) + if ( + evals_registry is not None + and hasattr(evals_registry, "evaluators") + and isinstance(evals_registry.evaluators, dict) + ): + evals_registry.evaluators.clear() + purge_local_modules(cwd, preserve_modules={__name__, "python_runner_common"}) module_name = import_module_name_from_cwd(cwd, abs_path) if module_name is None: @@ -298,12 +398,13 @@ async def process_file(file_path: str) -> dict[str, Any]: import_file(module_name, abs_path, extra_paths) code_entries = collect_code_entries(functions_registry) event_entries = await collect_function_event_entries(prompts_registry) - entries = [*code_entries, *event_entries] + evaluator_entries = collect_evaluator_entries(evals_registry, abs_path) + entries = [*code_entries, *event_entries, *evaluator_entries] file_manifest: dict[str, Any] = { "source_file": abs_path, "entries": entries, } - if code_entries: + if code_entries or evaluator_entries: runner_root = os.path.dirname(os.path.abspath(__file__)) project_root = os.path.abspath(cwd) path_rest: list[str] = [] @@ -357,6 +458,12 @@ async def process_file(file_path: str) -> dict[str, Any]: clear_registry(functions_registry) clear_registry(prompts_registry) + if ( + evals_registry is not None + and hasattr(evals_registry, "evaluators") + and isinstance(evals_registry.evaluators, dict) + ): + evals_registry.evaluators.clear() return file_manifest diff --git a/scripts/functions-runner.ts b/scripts/functions-runner.ts index 20fc308..1a5b9e8 100644 --- a/scripts/functions-runner.ts +++ b/scripts/functions-runner.ts @@ -88,6 +88,20 @@ type Manifest = { files: ManifestFile[]; }; +function slugify(input: string): string { + return input + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-|-$/g, ""); +} + +function extractScoreName(score: unknown, idx: number): string { + if (typeof score === "function" && typeof score.name === "string") { + return score.name || `scorer_${idx}`; + } + return `scorer_${idx}`; +} + type EvalRegistry = NonNullable; type ZodToJsonSchemaFn = (schema: unknown) => unknown; @@ -473,6 +487,97 @@ function collectCodeEntries(items: CodeRegistryItem[]): CodeEntry[] { return entries; } +function collectEvaluatorEntries( + evaluators: Record, + sourceFilePath: string, +): CodeEntry[] { + const entries: CodeEntry[] = []; + const ext = path.extname(sourceFilePath); + const stem = path.basename(sourceFilePath, ext).replace(/\.eval$/, ""); + + for (const [evalName, entry] of Object.entries(evaluators)) { + if (!entry || typeof entry !== "object") { + continue; + } + + const evaluator = (entry as Record).evaluator; + if (!evaluator || typeof evaluator !== "object") { + continue; + } + + const evalObj = evaluator as Record; + const projectName = + typeof evalObj.project_name === "string" ? evalObj.project_name : undefined; + const scores = Array.isArray(evalObj.scores) ? evalObj.scores : []; + + const selector = asProjectSelector( + typeof projectName === "string" ? { name: projectName } : undefined, + ); + const projectId = + typeof selector.project_id === "string" ? selector.project_id : undefined; + const selectorProjectName = + typeof selector.project_name === "string" + ? selector.project_name + : undefined; + + const scoreDescriptors = scores.map((s: unknown, i: number) => ({ + name: extractScoreName(s, i), + })); + + const evaluatorDefinition: JsonObject = { + scores: scoreDescriptors as JsonValue, + }; + + const rawParams = evalObj.parameters; + if (rawParams !== undefined && rawParams !== null) { + const marker = + rawParams !== null && + typeof rawParams === "object" && + (rawParams as Record).__braintrust_parameters_marker === true; + if (marker) { + const paramObj = rawParams as Record; + evaluatorDefinition.parameters = toJsonValue({ + type: "braintrust.parameters", + schema: paramObj.schema, + source: { + parametersId: paramObj.id, + slug: paramObj.slug, + name: paramObj.name, + projectId: paramObj.projectId, + version: paramObj.version, + }, + } as JsonValue); + } else { + const serialized = toJsonValue(rawParams as JsonValue); + if (serialized !== undefined) { + evaluatorDefinition.parameters = serialized; + } + } + } + + // Sandbox entry only — task and scorer entries are pushed separately + // when the eval is actually run, matching the Python SDK behavior. + entries.push({ + kind: "code", + project_id: projectId, + project_name: selectorProjectName, + name: `Eval ${evalName} sandbox`, + slug: slugify(`${stem}-${evalName}-sandbox`), + function_type: "sandbox", + location: { + type: "sandbox", + sandbox_spec: { provider: "lambda" }, + entrypoints: [sourceFilePath], + eval_name: evalName, + evaluator_definition: evaluatorDefinition as JsonValue, + } as JsonValue, + metadata: { _bt_sandbox_group_name: stem }, + }); + } + + return entries; +} + async function processFile(filePath: string): Promise { const absolutePath = path.resolve(process.cwd(), filePath); const fallbackRegistry = freshRegistry(); @@ -492,6 +597,10 @@ async function processFile(filePath: string): Promise { registry.parameters as EventRegistryItem[], false, )), + ...collectEvaluatorEntries( + registry.evaluators as Record, + absolutePath, + ), ]; return { diff --git a/src/functions/mod.rs b/src/functions/mod.rs index 9974284..eadc100 100644 --- a/src/functions/mod.rs +++ b/src/functions/mod.rs @@ -35,6 +35,7 @@ pub enum FunctionTypeFilter { Classifier, Tag, Parameters, + Sandbox, } impl FunctionTypeFilter { @@ -50,6 +51,7 @@ impl FunctionTypeFilter { Self::Classifier => "classifier", Self::Tag => "tag", Self::Parameters => "parameters", + Self::Sandbox => "sandbox", } } @@ -73,6 +75,7 @@ impl FunctionTypeFilter { Self::Classifier => "classifiers", Self::Tag => "tags", Self::Parameters => "parameters", + Self::Sandbox => "sandboxes", } } } diff --git a/src/functions/push.rs b/src/functions/push.rs index 3803c25..a83e58f 100644 --- a/src/functions/push.rs +++ b/src/functions/push.rs @@ -3263,6 +3263,57 @@ mod tests { ); } + #[test] + fn code_function_data_passes_through_sandbox_location() { + let runtime = RuntimeContext { + runtime: "node".to_string(), + version: "20.0.0".to_string(), + }; + let sandbox_location = serde_json::json!({ + "type": "sandbox", + "sandbox_spec": { "provider": "lambda" }, + "entrypoints": ["/tmp/eval.ts"], + "eval_name": "my-eval", + "evaluator_definition": { + "scores": [{ "name": "accuracy" }] + } + }); + let value = build_code_function_data( + &runtime, + sandbox_location.clone(), + "bundle-sandbox-1", + None, + ); + + assert_eq!(value["type"], "code"); + assert_eq!(value["data"]["type"], "bundle"); + assert_eq!(value["data"]["bundle_id"], "bundle-sandbox-1"); + assert_eq!(value["data"]["location"], sandbox_location); + assert!(value["data"].get("preview").is_none()); + } + + #[test] + fn code_function_data_passes_through_experiment_location() { + let runtime = RuntimeContext { + runtime: "node".to_string(), + version: "20.0.0".to_string(), + }; + let experiment_location = serde_json::json!({ + "type": "experiment", + "eval_name": "my-eval", + "position": { "type": "task" } + }); + let value = build_code_function_data( + &runtime, + experiment_location.clone(), + "bundle-task-1", + None, + ); + + assert_eq!(value["type"], "code"); + assert_eq!(value["data"]["location"], experiment_location); + } + fn test_base_args() -> BaseArgs { BaseArgs { json: false, diff --git a/tests/functions-fixtures/list-sandbox-type-parses/fixture.json b/tests/functions-fixtures/list-sandbox-type-parses/fixture.json new file mode 100644 index 0000000..642a921 --- /dev/null +++ b/tests/functions-fixtures/list-sandbox-type-parses/fixture.json @@ -0,0 +1,5 @@ +{ + "command": ["functions", "list", "--type", "sandbox"], + "expect_success": false, + "stderr_not_contains": ["invalid value 'sandbox'"] +} diff --git a/tests/functions.rs b/tests/functions.rs index 5f18f8e..c9e6a17 100644 --- a/tests/functions.rs +++ b/tests/functions.rs @@ -1909,6 +1909,202 @@ exit 24 ); } +#[cfg(unix)] +#[tokio::test(flavor = "multi_thread", worker_threads = 2)] +async fn functions_push_sandbox_entries_reach_api() { + if !command_exists("node") { + eprintln!("Skipping functions_push_sandbox_entries_reach_api (node not installed)."); + return; + } + + let state = Arc::new(MockServerState::default()); + state + .projects + .lock() + .expect("projects lock") + .push(MockProject { + id: "proj_mock".to_string(), + name: "mock-project".to_string(), + org_id: "org_mock".to_string(), + }); + let server = MockServer::start(state.clone()).await; + + let tmp = tempdir().expect("tempdir"); + let source = tmp.path().join("my-eval.js"); + std::fs::write( + &source, + "globalThis._evals ??= { functions: [], prompts: [], parameters: [], evaluators: {}, reporters: {} };\n", + ) + .expect("write source file"); + + let runner = tmp.path().join("mock-runner.sh"); + std::fs::write( + &runner, + r#"#!/bin/sh +set -eu +_runner_script="$1" +shift +_runner_name="$(basename "$_runner_script")" + +if [ "$_runner_name" = "functions-runner.ts" ]; then +node - "$@" <<'NODE' +const path = require("node:path"); +const files = process.argv.slice(2); +const manifest = { + runtime_context: { runtime: "node", version: process.versions.node || "unknown" }, + files: files.map((file) => { + const abs = path.resolve(file); + return { + source_file: abs, + entries: [ + { + kind: "code", + project_name: "mock-project", + name: "Eval my-eval sandbox", + slug: "my-eval-my-eval-sandbox", + function_type: "sandbox", + location: { + type: "sandbox", + sandbox_spec: { provider: "lambda" }, + entrypoints: [abs], + eval_name: "my-eval", + evaluator_definition: { scores: [{ name: "accuracy" }] } + }, + metadata: { _bt_sandbox_group_name: "my-eval" } + } + ] + }; + }) +}; +process.stdout.write(JSON.stringify(manifest)); +NODE +exit 0 +fi + +if [ "$_runner_name" = "functions-bundler.ts" ]; then + _source_file="$1" + _output_file="$2" + cp "$_source_file" "$_output_file" + exit 0 +fi + +echo "unexpected runner script: $_runner_name" >&2 +exit 24 +"#, + ) + .expect("write mock runner"); + use std::os::unix::fs::PermissionsExt; + let mut perms = std::fs::metadata(&runner) + .expect("runner metadata") + .permissions(); + perms.set_mode(0o755); + std::fs::set_permissions(&runner, perms).expect("runner permissions"); + + let output = Command::new(bt_binary_path()) + .current_dir(tmp.path()) + .args([ + "functions", + "--json", + "push", + "--file", + source + .to_str() + .expect("source path should be valid UTF-8 for test"), + "--language", + "javascript", + "--runner", + runner + .to_str() + .expect("runner path should be valid UTF-8 for test"), + "--if-exists", + "replace", + ]) + .env("BRAINTRUST_API_KEY", "test-key") + .env("BRAINTRUST_ORG_NAME", "test-org") + .env("BRAINTRUST_API_URL", &server.base_url) + .env("BRAINTRUST_APP_URL", &server.base_url) + .env("BRAINTRUST_NO_COLOR", "1") + .env_remove("BRAINTRUST_PROFILE") + .output() + .expect("run bt functions push"); + + server.stop().await; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + panic!("mock push failed:\n{stderr}"); + } + + let summary: Value = serde_json::from_slice(&output.stdout).expect("parse push summary"); + assert_eq!(summary["status"].as_str(), Some("success")); + assert_eq!(summary["uploaded_files"].as_u64(), Some(1)); + assert_eq!(summary["failed_files"].as_u64(), Some(0)); + + let inserted = state + .inserted_functions + .lock() + .expect("inserted functions lock") + .clone(); + assert_eq!( + inserted.len(), + 1, + "expected 1 inserted function (sandbox only)" + ); + + let sandbox_obj = inserted[0].as_object().expect("sandbox should be an object"); + assert_eq!( + sandbox_obj.get("slug").and_then(Value::as_str), + Some("my-eval-my-eval-sandbox") + ); + assert_eq!( + sandbox_obj.get("function_type").and_then(Value::as_str), + Some("sandbox") + ); + + // Verify function_data.data.location is sandbox type + let function_data = sandbox_obj + .get("function_data") + .and_then(Value::as_object) + .expect("function_data object"); + assert_eq!( + function_data.get("type").and_then(Value::as_str), + Some("code") + ); + let data = function_data + .get("data") + .and_then(Value::as_object) + .expect("function_data.data object"); + let location = data + .get("location") + .and_then(Value::as_object) + .expect("location object"); + assert_eq!( + location.get("type").and_then(Value::as_str), + Some("sandbox") + ); + let sandbox_spec = location + .get("sandbox_spec") + .and_then(Value::as_object) + .expect("sandbox_spec object"); + assert_eq!( + sandbox_spec.get("provider").and_then(Value::as_str), + Some("lambda") + ); + + // Verify metadata + let metadata = sandbox_obj + .get("metadata") + .and_then(Value::as_object) + .expect("metadata object"); + assert_eq!( + metadata + .get("_bt_sandbox_group_name") + .and_then(Value::as_str), + Some("my-eval") + ); + +} + #[tokio::test(flavor = "multi_thread", worker_threads = 2)] async fn functions_pull_works_against_mock_api() { let state = Arc::new(MockServerState::default()); From 9a70a3d9357f8093a24c9528c48b220d317050af Mon Sep 17 00:00:00 2001 From: Nate Selvidge Date: Fri, 13 Mar 2026 21:18:35 +0000 Subject: [PATCH 2/5] fixes --- scripts/functions-bundler.ts | 5 ++++- scripts/functions-runner.py | 2 +- scripts/functions-runner.ts | 2 +- src/functions/push.rs | 10 +++++++++- tests/functions.rs | 16 ++++++++++++++++ 5 files changed, 31 insertions(+), 4 deletions(-) diff --git a/scripts/functions-bundler.ts b/scripts/functions-bundler.ts index 02b9d11..de0af0b 100644 --- a/scripts/functions-bundler.ts +++ b/scripts/functions-bundler.ts @@ -275,7 +275,10 @@ async function main(): Promise { const externalPackages = parseExternalPackages( process.env.BT_FUNCTIONS_PUSH_EXTERNAL_PACKAGES, ); - const external = buildExternalPackagePatterns(externalPackages); + const selfContained = process.env.BT_FUNCTIONS_PUSH_SELF_CONTAINED === "1"; + const external = selfContained + ? ["fsevents", "chokidar"] + : buildExternalPackagePatterns(externalPackages); const tsconfig = loadTsconfigPath(); const outputDir = path.dirname(outputFile); diff --git a/scripts/functions-runner.py b/scripts/functions-runner.py index aa7bd05..a140a19 100644 --- a/scripts/functions-runner.py +++ b/scripts/functions-runner.py @@ -360,7 +360,7 @@ def collect_evaluator_entries(evals_registry: Any, source_file: str) -> list[dic "location": { "type": "sandbox", "sandbox_spec": {"provider": "lambda"}, - "entrypoints": [source_file], + "entrypoints": [os.path.relpath(source_file)], "eval_name": eval_name, "evaluator_definition": evaluator_definition, }, diff --git a/scripts/functions-runner.ts b/scripts/functions-runner.ts index 1a5b9e8..cd49251 100644 --- a/scripts/functions-runner.ts +++ b/scripts/functions-runner.ts @@ -567,7 +567,7 @@ function collectEvaluatorEntries( location: { type: "sandbox", sandbox_spec: { provider: "lambda" }, - entrypoints: [sourceFilePath], + entrypoints: [path.relative(process.cwd(), sourceFilePath)], eval_name: evalName, evaluator_definition: evaluatorDefinition as JsonValue, } as JsonValue, diff --git a/src/functions/push.rs b/src/functions/push.rs index a83e58f..5d4de74 100644 --- a/src/functions/push.rs +++ b/src/functions/push.rs @@ -716,10 +716,14 @@ async fn push_file( let mut function_events: Vec = Vec::new(); + let has_sandbox_entries = code_entries + .iter() + .any(|(code, _)| code.function_type.as_deref() == Some("sandbox")); + if !code_entries.is_empty() { let (upload_bytes, content_encoding) = match selected_language { SourceLanguage::JsLike => { - let bundle_bytes = build_js_bundle(source_path, args)?; + let bundle_bytes = build_js_bundle(source_path, args, has_sandbox_entries)?; let gzipped = gzip_bytes(&bundle_bytes).map_err(|err| FileFailure { reason: HardFailureReason::BundleUploadFailed, message: format!("failed to gzip {}: {err}", source_path.display()), @@ -922,6 +926,7 @@ async fn push_file( fn build_js_bundle( source_path: &Path, args: &PushArgs, + self_contained: bool, ) -> std::result::Result, FileFailure> { let build_dir = TempBuildDir::create("bt-functions-js-bundle").map_err(|err| FileFailure { reason: HardFailureReason::BundleUploadFailed, @@ -954,6 +959,9 @@ fn build_js_bundle( args.external_packages.join(","), ); } + if self_contained { + command.env("BT_FUNCTIONS_PUSH_SELF_CONTAINED", "1"); + } let output = command.output().map_err(|err| FileFailure { reason: HardFailureReason::RunnerSpawnFailed, diff --git a/tests/functions.rs b/tests/functions.rs index c9e6a17..fa3de7c 100644 --- a/tests/functions.rs +++ b/tests/functions.rs @@ -1091,6 +1091,22 @@ fn functions_python_runner_emits_valid_manifest_with_bundle() { "from contextlib import nullcontext\n\ndef _set_lazy_load(_enabled):\n return nullcontext()\n", ) .expect("write lazy_load.py"); + std::fs::write( + stub_root.join("braintrust").join("framework.py"), + concat!( + "from contextlib import nullcontext\n", + "\n", + "def _set_lazy_load(_enabled):\n", + " return nullcontext()\n", + "\n", + "class _EvalFile:\n", + " def __init__(self):\n", + " self.evaluators = {}\n", + "\n", + "_evals = _EvalFile()\n", + ), + ) + .expect("write framework.py"); let sample_path = tmp.path().join("sample_tool.py"); std::fs::write( From 0aafaf19c8e978fdd4578fcce5615f484f0806f8 Mon Sep 17 00:00:00 2001 From: Nate Selvidge Date: Fri, 13 Mar 2026 22:20:12 +0000 Subject: [PATCH 3/5] fixes --- .gitignore | 1 + scripts/eval-runner.py | 15 +++++++++++++++ scripts/functions-runner.py | 11 ++++++++--- scripts/python_runner_common.py | 12 ++++++++++++ src/eval.rs | 26 ++++++++------------------ src/functions/push.rs | 8 ++++++++ 6 files changed, 52 insertions(+), 21 deletions(-) diff --git a/.gitignore b/.gitignore index da0bacf..a0fe928 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ tests/evals/js/eval-bun/test-data.txt __pycache__ bt-sync +*.env \ No newline at end of file diff --git a/scripts/eval-runner.py b/scripts/eval-runner.py index 8742375..e9d3ce0 100755 --- a/scripts/eval-runner.py +++ b/scripts/eval-runner.py @@ -435,6 +435,21 @@ def load_evaluators(files: list[str]) -> tuple[list[EvaluatorInstance], dict[str cwd = os.getcwd() if cwd not in sys.path: sys.path.insert(0, cwd) + + # Add the project root inferred from input files to sys.path so that + # sibling-package imports work when files live outside CWD (e.g. + # sandbox bundles extracted to a temp directory). Walk up from each + # file's directory looking for a register.py (bundle marker) or the + # filesystem root, whichever comes first. + for f in files: + d = os.path.dirname(os.path.abspath(f)) + while d and d != os.path.dirname(d): + if os.path.isfile(os.path.join(d, "register.py")): + if d not in sys.path: + sys.path.insert(0, d) + break + d = os.path.dirname(d) + unique_files: set[str] = set() for file_path in files: for candidate in collect_files(file_path): diff --git a/scripts/functions-runner.py b/scripts/functions-runner.py index a140a19..fa93c75 100644 --- a/scripts/functions-runner.py +++ b/scripts/functions-runner.py @@ -29,9 +29,9 @@ def to_json_value(value: Any) -> Any: return [to_json_value(item) for item in value] if isinstance(value, dict): return {str(key): to_json_value(val) for key, val in value.items()} - if hasattr(value, "model_dump"): + if hasattr(value, "model_dump") and not isinstance(value, type): return to_json_value(value.model_dump()) - if hasattr(value, "dict"): + if hasattr(value, "dict") and not isinstance(value, type): return to_json_value(value.dict()) if hasattr(value, "__dict__"): result: dict[str, Any] = {} @@ -451,8 +451,13 @@ async def process_file(file_path: str) -> dict[str, Any]: continue seen_sources.add(init_source) bundled_sources.append(init_source) + # Compute entry_module as a CWD-relative dotted path so that the + # archive root inferred by push.rs walks back to CWD, matching + # the Python SDK behavior and allowing sibling-package imports. + rel_path = os.path.relpath(abs_path, cwd) + archive_module = re.sub(r"\.py$", "", rel_path).replace("-", "_").replace(os.sep, ".") file_manifest["python_bundle"] = { - "entry_module": module_name, + "entry_module": archive_module, "sources": bundled_sources, } diff --git a/scripts/python_runner_common.py b/scripts/python_runner_common.py index 4a738f9..1d83141 100644 --- a/scripts/python_runner_common.py +++ b/scripts/python_runner_common.py @@ -61,6 +61,12 @@ def purge_local_modules(cwd: str, preserve_modules: set[str] | None = None) -> N candidate_abs = os.path.abspath(candidate) if not os.path.isfile(candidate_abs): continue + # Skip installed packages inside virtualenvs under cwd (e.g. .venv/lib/.../site-packages). + if os.sep + "site-packages" + os.sep in candidate_abs: + continue + # Skip bt runner scripts materialised under .bt/. + if os.sep + ".bt" + os.sep in candidate_abs: + continue try: common = os.path.commonpath([candidate_abs, cwd_abs]) except ValueError: @@ -84,6 +90,12 @@ def collect_python_sources(cwd: str, input_file: str) -> list[str]: continue if not candidate_abs.endswith(".py"): continue + # Skip installed packages inside virtualenvs under cwd (e.g. .venv/lib/.../site-packages). + if os.sep + "site-packages" + os.sep in candidate_abs: + continue + # Skip bt runner scripts materialised under .bt/. + if os.sep + ".bt" + os.sep in candidate_abs: + continue try: common = os.path.commonpath([candidate_abs, cwd]) except ValueError: diff --git a/src/eval.rs b/src/eval.rs index d108c61..6bc5d40 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -1207,12 +1207,6 @@ fn serialize_sse_event(event: &str, data: &str) -> String { format!("event: {event}\ndata: {data}\n\n") } -fn is_eval_progress_payload(progress: &SseProgressEventData) -> bool { - serde_json::from_str::(&progress.data) - .map(|payload| payload.kind_type == "eval_progress") - .unwrap_or(false) -} - fn encode_eval_event_for_http(event: &EvalEvent) -> Option { match event { EvalEvent::Processing(payload) => serde_json::to_string(payload) @@ -1224,15 +1218,9 @@ fn encode_eval_event_for_http(event: &EvalEvent) -> Option { EvalEvent::Summary(summary) => serde_json::to_string(summary) .ok() .map(|data| serialize_sse_event("summary", &data)), - EvalEvent::Progress(progress) => { - if is_eval_progress_payload(progress) { - None - } else { - serde_json::to_string(progress) - .ok() - .map(|data| serialize_sse_event("progress", &data)) - } - } + EvalEvent::Progress(progress) => serde_json::to_string(progress) + .ok() + .map(|data| serialize_sse_event("progress", &data)), EvalEvent::Dependencies { .. } => None, EvalEvent::Done => Some(serialize_sse_event("done", "")), EvalEvent::Error { @@ -2188,7 +2176,7 @@ fn build_python_command( .or_else(|| std::env::var("BT_EVAL_PYTHON_RUNNER").ok()) .or_else(|| std::env::var("BT_EVAL_PYTHON").ok()); - let command = if let Some(explicit) = runner_override { + let mut command = if let Some(explicit) = runner_override { let mut command = Command::new(explicit); command.arg(runner).args(files); command @@ -4022,7 +4010,7 @@ mod tests { } #[test] - fn encode_eval_event_for_http_filters_internal_eval_progress() { + fn encode_eval_event_for_http_forwards_eval_progress() { let event = EvalEvent::Progress(SseProgressEventData { id: "id-1".to_string(), object_type: "task".to_string(), @@ -4034,7 +4022,9 @@ mod tests { data: r#"{"type":"eval_progress","kind":"start","total":1}"#.to_string(), }); - assert!(encode_eval_event_for_http(&event).is_none()); + let encoded = encode_eval_event_for_http(&event).expect("eval_progress should be forwarded"); + assert!(encoded.contains("event: progress")); + assert!(encoded.contains("eval_progress")); } #[test] diff --git a/src/functions/push.rs b/src/functions/push.rs index 5d4de74..0ca6109 100644 --- a/src/functions/push.rs +++ b/src/functions/push.rs @@ -1125,6 +1125,14 @@ fn collect_classified_files(inputs: &[PathBuf]) -> Result { let mut explicit_js_like = 0usize; let mut explicit_python = 0usize; + // Always include CWD so that Python files importing from sibling + // packages (e.g. `from src.agents import ...`) are accepted. + if let Ok(cwd) = std::env::current_dir() { + if let Ok(canonical_cwd) = cwd.canonicalize() { + allowed_roots.insert(canonical_cwd); + } + } + for input in inputs { let path = if input.is_absolute() { input.clone() From 9628bbc49f6889acda041fdfd869a51dd22893dc Mon Sep 17 00:00:00 2001 From: Nate Selvidge Date: Mon, 16 Mar 2026 16:11:49 +0000 Subject: [PATCH 4/5] WIP --- scripts/functions-runner.py | 10 +++++++++- src/eval.rs | 24 +++++++++++++++++------- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/scripts/functions-runner.py b/scripts/functions-runner.py index fa93c75..0291a50 100644 --- a/scripts/functions-runner.py +++ b/scripts/functions-runner.py @@ -340,7 +340,15 @@ def collect_evaluator_entries(evals_registry: Any, source_file: str) -> list[dic }, } else: - serialized = to_json_value(raw_params) + # Use the braintrust SDK's parameters_to_json_schema when + # available so that Pydantic model classes are converted to + # proper staticParametersSchema entries (type: "data" with a + # JSON Schema) that the UI can parse. + try: + from braintrust.parameters import parameters_to_json_schema + serialized = parameters_to_json_schema(raw_params) + except Exception: + serialized = to_json_value(raw_params) if serialized is not None: evaluator_definition["parameters"] = serialized diff --git a/src/eval.rs b/src/eval.rs index 6bc5d40..bf3afd5 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -1218,9 +1218,21 @@ fn encode_eval_event_for_http(event: &EvalEvent) -> Option { EvalEvent::Summary(summary) => serde_json::to_string(summary) .ok() .map(|data| serialize_sse_event("summary", &data)), - EvalEvent::Progress(progress) => serde_json::to_string(progress) - .ok() - .map(|data| serialize_sse_event("progress", &data)), + EvalEvent::Progress(progress) => { + // Filter out internal eval_progress events (start/increment/stop) + // which are used for CLI progress bars but crash the UI stream + // parser. Only forward external progress events (e.g. json_delta). + if serde_json::from_str::(&progress.data) + .map(|p| p.kind_type == "eval_progress") + .unwrap_or(false) + { + None + } else { + serde_json::to_string(progress) + .ok() + .map(|data| serialize_sse_event("progress", &data)) + } + } EvalEvent::Dependencies { .. } => None, EvalEvent::Done => Some(serialize_sse_event("done", "")), EvalEvent::Error { @@ -4010,7 +4022,7 @@ mod tests { } #[test] - fn encode_eval_event_for_http_forwards_eval_progress() { + fn encode_eval_event_for_http_filters_internal_eval_progress() { let event = EvalEvent::Progress(SseProgressEventData { id: "id-1".to_string(), object_type: "task".to_string(), @@ -4022,9 +4034,7 @@ mod tests { data: r#"{"type":"eval_progress","kind":"start","total":1}"#.to_string(), }); - let encoded = encode_eval_event_for_http(&event).expect("eval_progress should be forwarded"); - assert!(encoded.contains("event: progress")); - assert!(encoded.contains("eval_progress")); + assert!(encode_eval_event_for_http(&event).is_none()); } #[test] From 1ff2094a12a5b7372349929665c2566b34776908 Mon Sep 17 00:00:00 2001 From: Nate Selvidge Date: Wed, 18 Mar 2026 21:34:08 +0000 Subject: [PATCH 5/5] fix CI --- src/eval.rs | 2 +- src/source_language.rs | 1 + tests/eval_dev_server.rs | 16 ++++++++-------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/eval.rs b/src/eval.rs index bf3afd5..bb67433 100644 --- a/src/eval.rs +++ b/src/eval.rs @@ -2188,7 +2188,7 @@ fn build_python_command( .or_else(|| std::env::var("BT_EVAL_PYTHON_RUNNER").ok()) .or_else(|| std::env::var("BT_EVAL_PYTHON").ok()); - let mut command = if let Some(explicit) = runner_override { + let command = if let Some(explicit) = runner_override { let mut command = Command::new(explicit); command.arg(runner).args(files); command diff --git a/src/source_language.rs b/src/source_language.rs index 8a1b71f..1bb82bc 100644 --- a/src/source_language.rs +++ b/src/source_language.rs @@ -7,6 +7,7 @@ pub enum SourceLanguage { #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum JsExtensionProfile { FunctionsPush, + #[allow(dead_code)] Eval, } diff --git a/tests/eval_dev_server.rs b/tests/eval_dev_server.rs index 3921b92..88d6936 100644 --- a/tests/eval_dev_server.rs +++ b/tests/eval_dev_server.rs @@ -123,10 +123,10 @@ fn parse_sse_events(body: &str) -> Vec { let mut current_data = Vec::::new(); for line in body.lines() { - if line.starts_with("event: ") { - current_event = line["event: ".len()..].to_string(); - } else if line.starts_with("data: ") { - current_data.push(line["data: ".len()..].to_string()); + if let Some(event) = line.strip_prefix("event: ") { + current_event = event.to_string(); + } else if let Some(data) = line.strip_prefix("data: ") { + current_data.push(data.to_string()); } else if line.is_empty() && !current_event.is_empty() { events.push(SseEvent { event: std::mem::take(&mut current_event), @@ -518,10 +518,10 @@ fn streaming_eval_post( Ok(l) => l, Err(_) => break, }; - if line.starts_with("event: ") { - current_event = line["event: ".len()..].to_string(); - } else if line.starts_with("data: ") { - current_data.push(line["data: ".len()..].to_string()); + if let Some(event) = line.strip_prefix("event: ") { + current_event = event.to_string(); + } else if let Some(data) = line.strip_prefix("data: ") { + current_data.push(data.to_string()); } else if line.is_empty() && !current_event.is_empty() { let event = SseEvent { event: std::mem::take(&mut current_event),