From 80eb21027fb85b6f97e71068025272a804e74e53 Mon Sep 17 00:00:00 2001 From: Aswin Karumbunathan Date: Thu, 12 Mar 2026 12:06:59 -0700 Subject: [PATCH 1/5] Add support for classifier "scorers" --- js/src/framework.ts | 59 +++++++++++++++++++++++++++++++++++++-------- js/util/score.ts | 9 +++++++ 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/js/src/framework.ts b/js/src/framework.ts index 637e688d9..78cee4bc8 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -227,6 +227,12 @@ export interface Evaluator< */ scores: EvalScorer[]; + /** + * Optional span types per scorer. When provided, same length as `scores`. + * Use `"classifier"` for classifier spans; otherwise `"score"`. + */ + scorerSpanTypes?: ("score" | "classifier")[]; + /** * A set of parameters that will be passed to the evaluator. * Can be: @@ -1154,6 +1160,7 @@ async function runEvaluatorInternal( output, trace, }; + const scorerSpanTypes = evaluator.scorerSpanTypes ?? []; const scoreResults = await Promise.all( evaluator.scores.map(async (score, score_idx) => { try { @@ -1192,7 +1199,14 @@ async function runEvaluatorInternal( ]; const getOtherFields = (s: Score) => { - const { metadata: _metadata, name: _name, ...rest } = s; + const { + metadata: _metadata, + name: _name, + classification: _classification, + id: _id, + label: _label, + ...rest + } = s; return rest; }; @@ -1212,31 +1226,36 @@ async function runEvaluatorInternal( ? getOtherFields(results[0]) : results.reduce( (prev, s) => - mergeDicts(prev, { [s.name]: getOtherFields(s) }), + mergeDicts(prev, { + [s.name]: getOtherFields(s), + }), {}, ); - const scores = results.reduce( - (prev, s) => mergeDicts(prev, { [s.name]: s.score }), + const scoresRecord = results.reduce( + (prev, s) => + mergeDicts(prev, { [s.name]: s.score }), {}, ); span.log({ output: resultOutput, metadata: resultMetadata, - scores: scores, + scores: scoresRecord, }); return results; }; - // Exclude trace from logged input since it contains internal state - // that shouldn't be serialized (spansFlushPromise, spansFlushed, etc.) const { trace: _trace, ...scoringArgsForLogging } = scoringArgs; + const spanType = + scorerSpanTypes[score_idx] === "classifier" + ? SpanTypeAttribute.CLASSIFIER + : SpanTypeAttribute.SCORE; const results = await rootSpan.traced(runScorer, { name: scorerNames[score_idx], spanAttributes: { - type: SpanTypeAttribute.SCORE, + type: spanType, purpose: "scorer", }, propagatedEvent: makeScorerPropagatedEvent( @@ -1250,21 +1269,41 @@ async function runEvaluatorInternal( } }), ); - // Resolve each promise on its own so that we can separate the passing - // from the failing ones. const failingScorersAndResults: { name: string; error: unknown }[] = []; + const classifications: Record< + string, + { id: string; label: string }[] + > = {}; scoreResults.forEach((results, i) => { const name = scorerNames[i]; if (results.kind === "score") { (results.value || []).forEach((result) => { scores[result.name] = result.score; + if (result.id != null) { + classifications[result.name] = [ + { id: result.id, label: result.label ?? result.id }, + ]; + } else if (result.classification != null) { + classifications[result.name] = [ + { + id: result.classification, + label: result.classification, + }, + ]; + } }); } else { failingScorersAndResults.push({ name, error: results.value }); } }); + if (Object.keys(classifications).length > 0) { + rootSpan.log({ classifications } as Parameters< + typeof rootSpan.log + >[0]); + } + unhandledScores = null; if (failingScorersAndResults.length) { const scorerErrors = Object.fromEntries( diff --git a/js/util/score.ts b/js/util/score.ts index 758902344..373a1c6de 100644 --- a/js/util/score.ts +++ b/js/util/score.ts @@ -1,7 +1,16 @@ +/** + * A classification result: either a plain string label, or an object with a + * stable `id` and an optional human-readable `label` (defaults to `id`). + * When set, the value is recorded in the `classifications` column keyed by + * scorer name instead of (or in addition to) `score`. + */ +export type Classification = string | { id: string; label?: string }; + export interface Score { name: string; score: number | null; metadata?: Record; + classification?: Classification; // DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller. /** * @deprecated From 76cc66dc4016f76d6fed5a6ab0331340b52b297f Mon Sep 17 00:00:00 2001 From: Aswin Karumbunathan Date: Thu, 12 Mar 2026 16:05:57 -0700 Subject: [PATCH 2/5] take 2: with a separate Classification type --- js/src/exports.ts | 1 + js/src/framework.test.ts | 23 ++ js/src/framework.ts | 434 ++++++++++++++++++++++++-------------- js/src/parameters.test.ts | 7 + js/util/index.ts | 2 +- js/util/score.ts | 17 +- 6 files changed, 322 insertions(+), 162 deletions(-) diff --git a/js/src/exports.ts b/js/src/exports.ts index 6eb2b5c1f..352d5a80e 100644 --- a/js/src/exports.ts +++ b/js/src/exports.ts @@ -185,6 +185,7 @@ export type { EvalResult, EvalScorerArgs, EvalScorer, + EvalClassifier, EvaluatorDef, EvaluatorFile, ReporterBody, diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts index bd9381342..f431a728f 100644 --- a/js/src/framework.test.ts +++ b/js/src/framework.test.ts @@ -47,6 +47,7 @@ test("meta (write) is passed to task", async () => { return input * 2; }, scores: [], + classifications: [], }, new NoopProgressReporter(), [], @@ -85,6 +86,7 @@ test("metadata (read/write) is passed to task", async () => { return input * 2; }, scores: [], + classifications: [], }, new NoopProgressReporter(), [], @@ -125,6 +127,7 @@ test("expected (read/write) is passed to task", async () => { return input * 2; }, scores: [], + classifications: [], }, new NoopProgressReporter(), [], @@ -175,6 +178,7 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), + classifications: [], }, new NoopProgressReporter(), [], @@ -202,6 +206,7 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), + classifications: [], errorScoreHandler: defaultErrorScoreHandler, }, new NoopProgressReporter(), @@ -232,6 +237,7 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`, i === 0), ), + classifications: [], errorScoreHandler: defaultErrorScoreHandler, }, new NoopProgressReporter(), @@ -266,6 +272,7 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), + classifications: [], errorScoreHandler: () => undefined, }, new NoopProgressReporter(), @@ -292,6 +299,7 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), + classifications: [], errorScoreHandler: () => ({ error_score: 1 }), }, new NoopProgressReporter(), @@ -345,6 +353,7 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], + classifications: [], timeout: 10, maxConcurrency: 1, }, @@ -394,6 +403,7 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], + classifications: [], signal: abortController.signal, maxConcurrency: 1, }, @@ -431,6 +441,7 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], + classifications: [], }, new NoopProgressReporter(), [], @@ -458,6 +469,7 @@ test("trialIndex is passed to task", async () => { return input * 2; }, scores: [], + classifications: [], trialCount: 3, }, new NoopProgressReporter(), @@ -500,6 +512,7 @@ test("trialIndex with multiple inputs", async () => { return input * 2; }, scores: [], + classifications: [], trialCount: 2, }, new NoopProgressReporter(), @@ -546,6 +559,7 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn }), () => ({ name: "simple_scorer", score: 0.8 }), ], + classifications: [], }, { noSendLogs: true, returnResults: true }, ); @@ -592,6 +606,7 @@ test("Eval with returnResults: false produces empty results but valid summary", () => ({ name: "length_score", score: 0.75 }), () => ({ name: "quality_score", score: 0.9 }), ], + classifications: [], }, { noSendLogs: true, returnResults: false }, ); @@ -629,6 +644,7 @@ test("Eval with returnResults: true collects all results", async () => { score: args.output === args.expected ? 1 : 0, }), ], + classifications: [], }, { noSendLogs: true, returnResults: true }, ); @@ -672,6 +688,7 @@ test("tags can be appended and logged to root span", async () => { return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], + classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -721,6 +738,7 @@ test.each([ return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], + classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -761,6 +779,7 @@ test("tags are persisted with a failing scorer", async () => { throw new Error("test error"); }, ], + classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -794,6 +813,7 @@ test("tags remain empty when not set", async () => { return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], + classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -830,6 +850,7 @@ test("scorer spans have purpose='scorer' attribute", async () => { score: args.output === args.expected ? 1 : 0, }), ], + classifications: [], }, new NoopProgressReporter(), [], @@ -1471,6 +1492,7 @@ test("Eval with enableCache: false does not use span cache", async () => { data: [{ input: 1, expected: 2 }], task: (input) => input * 2, scores: [], + classifications: [], state, }, { noSendLogs: true, enableCache: false }, @@ -1496,6 +1518,7 @@ test("Eval with enableCache: true (default) uses span cache", async () => { data: [{ input: 1, expected: 2 }], task: (input) => input * 2, scores: [], + classifications: [], state, }, { noSendLogs: true }, // enableCache defaults to true diff --git a/js/src/framework.ts b/js/src/framework.ts index 78cee4bc8..ff1851783 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -1,6 +1,7 @@ import { makeScorerPropagatedEvent, mergeDicts, + Classification, Score, SpanComponentsV3, SpanTypeAttribute, @@ -186,6 +187,17 @@ export type EvalScorer< args: EvalScorerArgs, ) => OneOrMoreScores | Promise; +export type OneOrMoreClassifications = Classification | Classification[] | null; + +export type EvalClassifier< + Input, + Output, + Expected, + Metadata extends BaseMetadata = DefaultMetadataType, +> = ( + args: EvalScorerArgs, +) => OneOrMoreClassifications | Promise; + export type EvalResult< Input, Output, @@ -223,15 +235,16 @@ export interface Evaluator< task: EvalTask; /** - * A set of functions that take an input, output, and expected value and return a score. + * A set of functions that take an input, output, and expected value and return a {@link Score}. */ scores: EvalScorer[]; /** - * Optional span types per scorer. When provided, same length as `scores`. - * Use `"classifier"` for classifier spans; otherwise `"score"`. + * A set of functions that take an input, output, and expected value and return a + * classification. Each function must return a {@link Classification} with a required + * `classification` field. Results are recorded under the `classifications` column. */ - scorerSpanTypes?: ("score" | "classifier")[]; + classifications: EvalClassifier[]; /** * A set of parameters that will be passed to the evaluator. @@ -870,6 +883,102 @@ export function scorerName( return scorer.name || `scorer_${scorer_idx}`; } +export function classifierName( + classifier: EvalClassifier, + classifier_idx: number, +) { + return classifier.name || `classifier_${classifier_idx}`; +} + +function buildSpanMetadata( + results: Array<{ name: string; metadata?: Record }>, +) { + return results.length === 1 + ? results[0].metadata + : results.reduce( + (prev, s) => mergeDicts(prev, { [s.name]: s.metadata }), + {}, + ); +} + +function buildSpanScores( + results: Array<{ + name: string; + score: number | null; + metadata?: Record; + }>, +) { + const scoresRecord = results.reduce( + (prev, s) => mergeDicts(prev, { [s.name]: s.score }), + {}, + ); + return { resultMetadata: buildSpanMetadata(results), scoresRecord }; +} + +async function runInScorerSpan( + rootSpan: Span, + spanName: string, + spanType: SpanTypeAttribute, + propagatedEvent: ReturnType, + eventInput: unknown, + fn: (span: Span) => Promise, +): Promise< + { kind: "score"; value: T[] | null } | { kind: "error"; value: unknown } +> { + try { + const value = await rootSpan.traced(fn, { + name: spanName, + spanAttributes: { type: spanType, purpose: "scorer" }, + propagatedEvent, + event: { input: eventInput }, + }); + return { kind: "score", value }; + } catch (e) { + return { kind: "error", value: e }; + } +} + +function collectScoringResults( + runResults: Array< + { kind: "score"; value: T[] | null } | { kind: "error"; value: unknown } + >, + names: string[], + onResult: (result: T) => void, +): { name: string; error: unknown }[] { + const failing: { name: string; error: unknown }[] = []; + runResults.forEach((r, i) => { + if (r.kind === "score") { + (r.value ?? []).forEach(onResult); + } else { + failing.push({ name: names[i], error: r.value }); + } + }); + return failing; +} + +function logScoringFailures( + kind: string, + failures: { name: string; error: unknown }[], + metadata: Record, + rootSpan: Span, + state: BraintrustState | undefined, +): string[] { + if (!failures.length) return []; + const errorMap = Object.fromEntries( + failures.map(({ name, error }) => [ + name, + error instanceof Error ? error.stack : `${error}`, + ]), + ); + metadata[`${kind}_errors`] = errorMap; + rootSpan.log({ metadata: { [`${kind}_errors`]: errorMap } }); + debugLogger.forState(state).warn( + `Found exceptions for the following ${kind}s: ${Object.keys(errorMap).join(", ")}`, + failures.map((f) => f.error), + ); + return Object.keys(errorMap); +} + export async function runEvaluator( experiment: Experiment | null, // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -1096,6 +1205,7 @@ async function runEvaluatorInternal( let tags: string[] = [...(datum.tags ?? [])]; const scores: Record = {}; const scorerNames = evaluator.scores.map(scorerName); + const classifierNames = evaluator.classifications.map(classifierName); let unhandledScores: string[] | null = scorerNames; try { const meta = (o: Record) => @@ -1160,143 +1270,162 @@ async function runEvaluatorInternal( output, trace, }; - const scorerSpanTypes = evaluator.scorerSpanTypes ?? []; - const scoreResults = await Promise.all( - evaluator.scores.map(async (score, score_idx) => { - try { - const runScorer = async (span: Span) => { - const scoreResult = score(scoringArgs); - const scoreValue = - scoreResult instanceof Promise - ? await scoreResult - : scoreResult; - - if (scoreValue === null) { - return null; - } - - if (Array.isArray(scoreValue)) { - for (const s of scoreValue) { - if (!(typeof s === "object" && !isEmpty(s))) { - throw new Error( - `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify( - s, - )}`, - ); + const { trace: _trace, ...scoringArgsForLogging } = scoringArgs; + const propagatedEvent = makeScorerPropagatedEvent( + await rootSpan.export(), + ); + + const getOtherFields = (s: Score) => { + const { metadata: _metadata, name: _name, ...rest } = s; + return rest; + }; + + const [scoreResults, classificationResults] = await Promise.all([ + Promise.all( + evaluator.scores.map((score, score_idx) => + runInScorerSpan( + rootSpan, + scorerNames[score_idx], + SpanTypeAttribute.SCORE, + propagatedEvent, + scoringArgsForLogging, + async (span) => { + const scoreValue = await Promise.resolve( + score(scoringArgs), + ); + if (scoreValue === null) return null; + if (Array.isArray(scoreValue)) { + for (const s of scoreValue) { + if (!(typeof s === "object" && !isEmpty(s))) { + throw new Error( + `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify(s)}`, + ); + } } } - } - - const results = Array.isArray(scoreValue) - ? scoreValue - : typeof scoreValue === "object" && !isEmpty(scoreValue) - ? [scoreValue] - : [ - { - name: scorerNames[score_idx], - score: scoreValue, - }, - ]; - - const getOtherFields = (s: Score) => { - const { - metadata: _metadata, - name: _name, - classification: _classification, - id: _id, - label: _label, - ...rest - } = s; - return rest; - }; - - const resultMetadata = - results.length === 1 - ? results[0].metadata - : results.reduce( - (prev, s) => - mergeDicts(prev, { - [s.name]: s.metadata, - }), - {}, - ); - - const resultOutput = - results.length === 1 - ? getOtherFields(results[0]) - : results.reduce( - (prev, s) => - mergeDicts(prev, { - [s.name]: getOtherFields(s), - }), - {}, - ); - - const scoresRecord = results.reduce( - (prev, s) => - mergeDicts(prev, { [s.name]: s.score }), - {}, - ); - - span.log({ - output: resultOutput, - metadata: resultMetadata, - scores: scoresRecord, - }); - return results; - }; - - const { trace: _trace, ...scoringArgsForLogging } = - scoringArgs; - const spanType = - scorerSpanTypes[score_idx] === "classifier" - ? SpanTypeAttribute.CLASSIFIER - : SpanTypeAttribute.SCORE; - const results = await rootSpan.traced(runScorer, { - name: scorerNames[score_idx], - spanAttributes: { - type: spanType, - purpose: "scorer", + const results: Score[] = Array.isArray(scoreValue) + ? scoreValue + : typeof scoreValue === "object" && !isEmpty(scoreValue) + ? [scoreValue] + : [ + { + name: scorerNames[score_idx], + score: scoreValue, + }, + ]; + const { resultMetadata, scoresRecord } = + buildSpanScores(results); + const resultOutput = + results.length === 1 + ? getOtherFields(results[0]) + : results.reduce( + (prev, s) => + mergeDicts(prev, { + [s.name]: getOtherFields(s), + }), + {}, + ); + span.log({ + output: resultOutput, + metadata: resultMetadata, + scores: scoresRecord, + }); + return results; }, - propagatedEvent: makeScorerPropagatedEvent( - await rootSpan.export(), - ), - event: { input: scoringArgsForLogging }, - }); - return { kind: "score", value: results } as const; - } catch (e) { - return { kind: "error", value: e } as const; - } - }), + ), + ), + ), + Promise.all( + evaluator.classifications.map((classifier, idx) => + runInScorerSpan( + rootSpan, + classifierNames[idx], + SpanTypeAttribute.CLASSIFIER, + propagatedEvent, + scoringArgsForLogging, + async (span) => { + const classifierValue = await Promise.resolve( + classifier(scoringArgs), + ); + if (classifierValue === null) return null; + const rawResults = Array.isArray(classifierValue) + ? classifierValue + : [classifierValue]; + // Normalize: if the result is a raw classification value + // (string or { id, label? }) rather than a full + // Classification object, wrap it using the classifier name. + const toClassification = (r: unknown): Classification => { + if ( + r !== null && + typeof r === "object" && + "classification" in r && + "name" in r + ) { + return r as Classification; + } + return { + name: classifierNames[idx], + classification: r as + | string + | { id: string; label?: string }, + }; + }; + const results = rawResults.map(toClassification); + const toIdLabel = ( + c: string | { id: string; label?: string } | undefined, + ) => + c == null + ? null + : typeof c === "string" + ? { id: c, label: c } + : { id: c.id, label: c.label ?? c.id }; + const resultOutput = + results.length === 1 + ? toIdLabel(results[0].classification) + : results.reduce( + (prev, r) => + mergeDicts(prev, { + [r.name]: toIdLabel(r.classification), + }), + {}, + ); + span.log({ + output: resultOutput, + metadata: buildSpanMetadata(results), + }); + return results; + }, + ), + ), + ), + ]); + + const failingScorers = collectScoringResults( + scoreResults, + scorerNames, + (result) => { + scores[result.name] = result.score; + }, ); - const failingScorersAndResults: { name: string; error: unknown }[] = - []; + const classifications: Record< string, { id: string; label: string }[] > = {}; - scoreResults.forEach((results, i) => { - const name = scorerNames[i]; - if (results.kind === "score") { - (results.value || []).forEach((result) => { - scores[result.name] = result.score; - if (result.id != null) { - classifications[result.name] = [ - { id: result.id, label: result.label ?? result.id }, - ]; - } else if (result.classification != null) { - classifications[result.name] = [ - { - id: result.classification, - label: result.classification, - }, - ]; - } - }); - } else { - failingScorersAndResults.push({ name, error: results.value }); - } - }); + const failingClassifiers = collectScoringResults( + classificationResults, + classifierNames, + (result) => { + const c = result.classification; + if (typeof c === "string") { + classifications[result.name] = [{ id: c, label: c }]; + } else { + classifications[result.name] = [ + { id: c.id, label: c.label ?? c.id }, + ]; + } + }, + ); if (Object.keys(classifications).length > 0) { rootSpan.log({ classifications } as Parameters< @@ -1304,28 +1433,23 @@ async function runEvaluatorInternal( >[0]); } - unhandledScores = null; - if (failingScorersAndResults.length) { - const scorerErrors = Object.fromEntries( - failingScorersAndResults.map(({ name, error }) => [ - name, - error instanceof Error ? error.stack : `${error}`, - ]), - ); - metadata["scorer_errors"] = scorerErrors; - rootSpan.log({ - metadata: { scorer_errors: scorerErrors }, - }); - const names = Object.keys(scorerErrors).join(", "); - const errors = failingScorersAndResults.map((item) => item.error); - unhandledScores = Object.keys(scorerErrors); - debugLogger - .forState(evaluator.state) - .warn( - `Found exceptions for the following scorers: ${names}`, - errors, - ); - } + const failedScorerNames = logScoringFailures( + "scorer", + failingScorers, + metadata, + rootSpan, + evaluator.state, + ); + unhandledScores = failedScorerNames.length + ? failedScorerNames + : null; + logScoringFailures( + "classifier", + failingClassifiers, + metadata, + rootSpan, + evaluator.state, + ); } catch (e) { logSpanError(rootSpan, e); error = e; diff --git a/js/src/parameters.test.ts b/js/src/parameters.test.ts index dbba8ea49..99d29950d 100644 --- a/js/src/parameters.test.ts +++ b/js/src/parameters.test.ts @@ -26,6 +26,7 @@ test("parameters are passed to task", async () => { return output; }, scores: [], + classifications: [], parameters: { prefix: z.string().default("start:"), suffix: z.string().default(":end"), @@ -59,6 +60,7 @@ test("prompt parameter is passed correctly", async () => { return input; }, scores: [], + classifications: [], parameters: { main: { type: "prompt", @@ -99,6 +101,7 @@ test("custom parameter values override defaults", async () => { return output; }, scores: [], + classifications: [], parameters: { prefix: z.string().default("start:"), suffix: z.string().default(":end"), @@ -131,6 +134,7 @@ test("array parameter is handled correctly", async () => { return input; }, scores: [], + classifications: [], parameters: { items: z.array(z.string()).default(["item1", "item2"]), }, @@ -161,6 +165,7 @@ test("object parameter is handled correctly", async () => { return input; }, scores: [], + classifications: [], parameters: { config: z .object({ @@ -196,6 +201,7 @@ test("model parameter defaults to configured value", async () => { return input; }, scores: [], + classifications: [], parameters: { model: { type: "model", @@ -224,6 +230,7 @@ test("model parameter is required when default is missing", async () => { data: [{ input: "test" }], task: async (input: string) => input, scores: [], + classifications: [], parameters: { model: { type: "model", diff --git a/js/util/index.ts b/js/util/index.ts index 25a76cc03..9746567bc 100644 --- a/js/util/index.ts +++ b/js/util/index.ts @@ -55,7 +55,7 @@ export { ensureNewDatasetRecord, } from "./object"; -export type { Score, Scorer, ScorerArgs } from "./score"; +export type { Classification, Score, Scorer, ScorerArgs } from "./score"; export { constructJsonArray, deterministicReplacer } from "./json_util"; diff --git a/js/util/score.ts b/js/util/score.ts index 373a1c6de..b8ce404d2 100644 --- a/js/util/score.ts +++ b/js/util/score.ts @@ -1,16 +1,21 @@ /** - * A classification result: either a plain string label, or an object with a - * stable `id` and an optional human-readable `label` (defaults to `id`). - * When set, the value is recorded in the `classifications` column keyed by - * scorer name instead of (or in addition to) `score`. + * The result returned by a classifier function. Unlike `Score`, `classification` + * is required and the span will be recorded as a classifier span. */ -export type Classification = string | { id: string; label?: string }; +export interface Classification { + name: string; + /** + * The classification value: either a plain string label, or an object with a + * stable `id` and an optional human-readable `label` (defaults to `id`). + */ + classification: string | { id: string; label?: string }; + metadata?: Record; +} export interface Score { name: string; score: number | null; metadata?: Record; - classification?: Classification; // DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller. /** * @deprecated From bbaf36254ce2ccce46a46e031712c62385acab19 Mon Sep 17 00:00:00 2001 From: Aswin Karumbunathan Date: Mon, 16 Mar 2026 11:42:58 -0700 Subject: [PATCH 3/5] Address review feedback --- js/src/framework.test.ts | 202 ++++++++++++++++++++++++++++----------- js/src/framework.ts | 131 +++++++++++++------------ js/util/index.ts | 8 +- js/util/object.ts | 1 + js/util/score.ts | 22 +++-- 5 files changed, 241 insertions(+), 123 deletions(-) diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts index f431a728f..de57c2a3c 100644 --- a/js/src/framework.test.ts +++ b/js/src/framework.test.ts @@ -47,7 +47,6 @@ test("meta (write) is passed to task", async () => { return input * 2; }, scores: [], - classifications: [], }, new NoopProgressReporter(), [], @@ -86,7 +85,6 @@ test("metadata (read/write) is passed to task", async () => { return input * 2; }, scores: [], - classifications: [], }, new NoopProgressReporter(), [], @@ -127,7 +125,6 @@ test("expected (read/write) is passed to task", async () => { return input * 2; }, scores: [], - classifications: [], }, new NoopProgressReporter(), [], @@ -178,12 +175,10 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), - classifications: [], }, new NoopProgressReporter(), [], undefined, - true, ); expect(out.results.every((r) => Object.keys(r.scores).length === 0)).toBe( @@ -206,13 +201,11 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), - classifications: [], errorScoreHandler: defaultErrorScoreHandler, }, new NoopProgressReporter(), [], undefined, - true, ); expect( @@ -237,13 +230,11 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`, i === 0), ), - classifications: [], errorScoreHandler: defaultErrorScoreHandler, }, new NoopProgressReporter(), [], undefined, - true, ); expect( @@ -272,13 +263,11 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), - classifications: [], errorScoreHandler: () => undefined, }, new NoopProgressReporter(), [], undefined, - true, ); expect( @@ -299,13 +288,11 @@ describe("runEvaluator", () => { scores: Array.from({ length: 3 }, (_, i) => makeTestScorer(`scorer_${i}`), ), - classifications: [], errorScoreHandler: () => ({ error_score: 1 }), }, new NoopProgressReporter(), [], undefined, - true, ); expect( @@ -353,7 +340,6 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], - classifications: [], timeout: 10, maxConcurrency: 1, }, @@ -403,7 +389,6 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], - classifications: [], signal: abortController.signal, maxConcurrency: 1, }, @@ -441,7 +426,6 @@ describe("runEvaluator", () => { return input * 2; }, scores: [], - classifications: [], }, new NoopProgressReporter(), [], @@ -469,7 +453,6 @@ test("trialIndex is passed to task", async () => { return input * 2; }, scores: [], - classifications: [], trialCount: 3, }, new NoopProgressReporter(), @@ -489,7 +472,7 @@ test("trialIndex is passed to task", async () => { // All results should be correct results.forEach((result) => { expect(result.input).toBe(1); - expect(result.expected).toBe(2); + expect("expected" in result ? result.expected : undefined).toBe(2); expect(result.output).toBe(2); expect(result.error).toBeUndefined(); }); @@ -512,7 +495,6 @@ test("trialIndex with multiple inputs", async () => { return input * 2; }, scores: [], - classifications: [], trialCount: 2, }, new NoopProgressReporter(), @@ -559,7 +541,6 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn }), () => ({ name: "simple_scorer", score: 0.8 }), ], - classifications: [], }, { noSendLogs: true, returnResults: true }, ); @@ -589,9 +570,8 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn test("Eval with returnResults: false produces empty results but valid summary", async () => { const result = await Eval( - "test-no-results", + "test-no-results-project", { - projectName: "test-no-results-project", data: [ { input: "hello", expected: "hello world" }, { input: "test", expected: "test world" }, @@ -606,7 +586,6 @@ test("Eval with returnResults: false produces empty results but valid summary", () => ({ name: "length_score", score: 0.75 }), () => ({ name: "quality_score", score: 0.9 }), ], - classifications: [], }, { noSendLogs: true, returnResults: false }, ); @@ -630,9 +609,8 @@ test("Eval with returnResults: false produces empty results but valid summary", test("Eval with returnResults: true collects all results", async () => { const result = await Eval( - "test-with-results", + "test-with-results-project", { - projectName: "test-with-results-project", data: [ { input: "hello", expected: "hello world" }, { input: "test", expected: "test world" }, @@ -644,7 +622,6 @@ test("Eval with returnResults: true collects all results", async () => { score: args.output === args.expected ? 1 : 0, }), ], - classifications: [], }, { noSendLogs: true, returnResults: true }, ); @@ -684,11 +661,10 @@ test("tags can be appended and logged to root span", async () => { evalName: "js-tags-append", data: [{ input: "hello", expected: "hello world", tags: initialTags }], task: (input, hooks) => { - for (const t of appendedTags) hooks.tags.push(t); + for (const t of appendedTags) hooks.tags!.push(t); return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], - classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -738,7 +714,6 @@ test.each([ return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], - classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -779,7 +754,6 @@ test("tags are persisted with a failing scorer", async () => { throw new Error("test error"); }, ], - classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -813,7 +787,6 @@ test("tags remain empty when not set", async () => { return input; }, scores: [() => ({ name: "simple_scorer", score: 0.8 })], - classifications: [], summarizeScores: false, }, new NoopProgressReporter(), @@ -845,12 +818,11 @@ test("scorer spans have purpose='scorer' attribute", async () => { data: [{ input: "hello", expected: "hello" }], task: async (input: string) => input, scores: [ - (args: { input: string; output: string; expected: string }) => ({ + (args: { output: string; expected?: string }) => ({ name: "simple_scorer", score: args.output === args.expected ? 1 : 0, }), ], - classifications: [], }, new NoopProgressReporter(), [], @@ -993,11 +965,12 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], + // eslint-disable-next-line @typescript-eslint/no-explicit-any { name: "test-prompt", slug: "test-prompt", metadata, - }, + } as any, ); const mockProjectMap = { @@ -1022,10 +995,8 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], - { - name: "test-prompt", - slug: "test-prompt", - }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + { name: "test-prompt", slug: "test-prompt" } as any, ); const mockProjectMap = { @@ -1048,11 +1019,12 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], + // eslint-disable-next-line @typescript-eslint/no-explicit-any { name: "test-prompt", slug: "test-prompt", environments: ["production"], - }, + } as any, ); const mockProjectMap = { @@ -1075,11 +1047,12 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], + // eslint-disable-next-line @typescript-eslint/no-explicit-any { name: "test-prompt", slug: "test-prompt", environments: ["staging", "production"], - }, + } as any, ); const mockProjectMap = { @@ -1105,10 +1078,8 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], - { - name: "test-prompt", - slug: "test-prompt", - }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + { name: "test-prompt", slug: "test-prompt" } as any, ); const mockProjectMap = { @@ -1151,11 +1122,8 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], - { - name: "test-prompt", - slug: "test-prompt", - tags, - }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + { name: "test-prompt", slug: "test-prompt", tags } as any, ); const mockProjectMap = { @@ -1180,10 +1148,8 @@ describe("framework2 metadata support", () => { options: { model: "gpt-4" }, }, [], - { - name: "test-prompt", - slug: "test-prompt", - }, + // eslint-disable-next-line @typescript-eslint/no-explicit-any + { name: "test-prompt", slug: "test-prompt" } as any, ); const mockProjectMap = { @@ -1492,7 +1458,6 @@ test("Eval with enableCache: false does not use span cache", async () => { data: [{ input: 1, expected: 2 }], task: (input) => input * 2, scores: [], - classifications: [], state, }, { noSendLogs: true, enableCache: false }, @@ -1518,7 +1483,6 @@ test("Eval with enableCache: true (default) uses span cache", async () => { data: [{ input: 1, expected: 2 }], task: (input) => input * 2, scores: [], - classifications: [], state, }, { noSendLogs: true }, // enableCache defaults to true @@ -1527,3 +1491,131 @@ test("Eval with enableCache: true (default) uses span cache", async () => { expect(startSpy).toHaveBeenCalled(); expect(stopSpy).toHaveBeenCalled(); }); + +test("classifier-only evaluator populates classifications field", async () => { + const result = await Eval( + "test-classifier-only", + { + data: [{ input: "hello", expected: "greeting" }], + task: (input) => input, + scores: [], + classifications: [ + () => ({ + name: "category", + id: "greeting", + label: "Greeting", + confidence: 0.91, + metadata: { source: "unit-test" }, + }), + ], + }, + { noSendLogs: true, returnResults: true }, + ); + + expect(result.results).toHaveLength(1); + const r = result.results[0]; + expect(r.classifications?.category).toEqual([ + { + id: "greeting", + label: "Greeting", + confidence: 0.91, + metadata: { source: "unit-test" }, + }, + ]); +}); + +test("scorer-only evaluator populates scores field", async () => { + const result = await Eval( + "test-scorer-only", + { + data: [{ input: "hello", expected: "hello" }], + task: (input) => input, + scores: [ + (args) => ({ + name: "exact_match", + score: args.output === args.expected ? 1 : 0, + }), + ], + }, + { noSendLogs: true, returnResults: true }, + ); + + expect(result.results).toHaveLength(1); + expect(result.results[0].scores.exact_match).toBe(1); + expect(result.results[0].classifications).toBeUndefined(); +}); + +test("multiple classifiers returning the same name append items correctly", async () => { + const result = await Eval( + "test-classifier-append", + { + data: [{ input: "hello" }], + task: (input) => input, + scores: [], + classifications: [ + () => [ + { name: "category", id: "greeting", label: "Greeting" }, + { name: "category", id: "informal", label: "Informal" }, + ], + ], + }, + { noSendLogs: true, returnResults: true }, + ); + + expect(result.results).toHaveLength(1); + expect(result.results[0].classifications?.category).toHaveLength(2); + expect(result.results[0].classifications?.category[0]).toEqual({ + id: "greeting", + label: "Greeting", + }); + expect(result.results[0].classifications?.category[1]).toEqual({ + id: "informal", + label: "Informal", + }); +}); + +test("mixed evaluator populates both scores and classifications", async () => { + const result = await Eval( + "test-score-and-classify", + { + data: [{ input: "hello", expected: "hello" }], + task: (input) => input, + scores: [ + (args) => ({ + name: "exact_match", + score: args.output === args.expected ? 1 : 0, + }), + ], + classifications: [ + () => ({ name: "category", id: "greeting", label: "Greeting" }), + ], + }, + { noSendLogs: true, returnResults: true }, + ); + + expect(result.results).toHaveLength(1); + expect(result.results[0].scores.exact_match).toBe(1); + expect(result.results[0].classifications?.category).toEqual([ + { id: "greeting", label: "Greeting" }, + ]); +}); + +test("malformed classifier output fails clearly", async () => { + const result = await Eval( + "test-invalid-classifier-output", + { + data: [{ input: "hello" }], + task: (input) => input, + scores: [], + classifications: [() => ({}) as never], + }, + { noSendLogs: true, returnResults: true }, + ); + + expect(result.results).toHaveLength(1); + expect((result.results[0] as any).metadata?.classifier_errors).toMatchObject({ + classifier_0: expect.stringMatching( + /must return classifications with a non-empty string name/, + ), + }); +}); diff --git a/js/src/framework.ts b/js/src/framework.ts index ff1851783..c57fc521c 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -2,6 +2,7 @@ import { makeScorerPropagatedEvent, mergeDicts, Classification, + ClassificationItem, Score, SpanComponentsV3, SpanTypeAttribute, @@ -205,9 +206,10 @@ export type EvalResult< Metadata extends BaseMetadata = DefaultMetadataType, > = EvalCase & { output: Output; - scores: Record; error: unknown; origin?: ObjectReference; + scores: Record; + classifications?: Record; }; type ErrorScoreHandler = (args: { @@ -241,10 +243,9 @@ export interface Evaluator< /** * A set of functions that take an input, output, and expected value and return a - * classification. Each function must return a {@link Classification} with a required - * `classification` field. Results are recorded under the `classifications` column. + * {@link Classification}. Results are recorded under the `classifications` column. */ - classifications: EvalClassifier[]; + classifications?: EvalClassifier[]; /** * A set of parameters that will be passed to the evaluator. @@ -883,7 +884,7 @@ export function scorerName( return scorer.name || `scorer_${scorer_idx}`; } -export function classifierName( +function classifierName( classifier: EvalClassifier, classifier_idx: number, ) { @@ -956,6 +957,37 @@ function collectScoringResults( return failing; } +function validateClassificationResult( + value: unknown, + scorerName: string, +): Classification { + if (!(typeof value === "object" && value !== null && !isEmpty(value))) { + throw new Error( + `When returning structured classifier results, each classification must be a non-empty object. Got: ${JSON.stringify(value)}`, + ); + } + if (!("name" in value) || typeof value.name !== "string" || !value.name) { + throw new Error( + `Classifier ${scorerName} must return classifications with a non-empty string name. Got: ${JSON.stringify(value)}`, + ); + } + if (!("id" in value) || typeof value.id !== "string" || !value.id) { + throw new Error( + `Classifier ${scorerName} must return classifications with a non-empty string id. Got: ${JSON.stringify(value)}`, + ); + } + return value as Classification; +} + +function toClassificationItem(c: Classification): ClassificationItem { + return { + id: c.id, + label: c.label ?? c.id, + ...(c.confidence !== undefined ? { confidence: c.confidence } : {}), + ...(c.metadata !== undefined ? { metadata: c.metadata } : {}), + }; +} + function logScoringFailures( kind: string, failures: { name: string; error: unknown }[], @@ -1204,8 +1236,11 @@ async function runEvaluatorInternal( let error: unknown | undefined = undefined; let tags: string[] = [...(datum.tags ?? [])]; const scores: Record = {}; + const classifications: Record = {}; const scorerNames = evaluator.scores.map(scorerName); - const classifierNames = evaluator.classifications.map(classifierName); + const classifierNames = (evaluator.classifications ?? []).map( + classifierName, + ); let unhandledScores: string[] | null = scorerNames; try { const meta = (o: Record) => @@ -1336,7 +1371,7 @@ async function runEvaluatorInternal( ), ), Promise.all( - evaluator.classifications.map((classifier, idx) => + (evaluator.classifications ?? []).map((classifier, idx) => runInScorerSpan( rootSpan, classifierNames[idx], @@ -1348,52 +1383,31 @@ async function runEvaluatorInternal( classifier(scoringArgs), ); if (classifierValue === null) return null; - const rawResults = Array.isArray(classifierValue) - ? classifierValue - : [classifierValue]; - // Normalize: if the result is a raw classification value - // (string or { id, label? }) rather than a full - // Classification object, wrap it using the classifier name. - const toClassification = (r: unknown): Classification => { - if ( - r !== null && - typeof r === "object" && - "classification" in r && - "name" in r - ) { - return r as Classification; - } - return { - name: classifierNames[idx], - classification: r as - | string - | { id: string; label?: string }, - }; - }; - const results = rawResults.map(toClassification); - const toIdLabel = ( - c: string | { id: string; label?: string } | undefined, - ) => - c == null - ? null - : typeof c === "string" - ? { id: c, label: c } - : { id: c.id, label: c.label ?? c.id }; + const rawResults = ( + Array.isArray(classifierValue) + ? classifierValue + : [classifierValue] + ).map((result) => + validateClassificationResult( + result, + classifierNames[idx], + ), + ); const resultOutput = - results.length === 1 - ? toIdLabel(results[0].classification) - : results.reduce( + rawResults.length === 1 + ? toClassificationItem(rawResults[0]) + : rawResults.reduce( (prev, r) => mergeDicts(prev, { - [r.name]: toIdLabel(r.classification), + [r.name]: toClassificationItem(r), }), {}, ); span.log({ output: resultOutput, - metadata: buildSpanMetadata(results), + metadata: buildSpanMetadata(rawResults), }); - return results; + return rawResults; }, ), ), @@ -1408,29 +1422,20 @@ async function runEvaluatorInternal( }, ); - const classifications: Record< - string, - { id: string; label: string }[] - > = {}; const failingClassifiers = collectScoringResults( classificationResults, classifierNames, (result) => { - const c = result.classification; - if (typeof c === "string") { - classifications[result.name] = [{ id: c, label: c }]; - } else { - classifications[result.name] = [ - { id: c.id, label: c.label ?? c.id }, - ]; + const item = toClassificationItem(result); + if (!classifications[result.name]) { + classifications[result.name] = []; } + classifications[result.name].push(item); }, ); if (Object.keys(classifications).length > 0) { - rootSpan.log({ classifications } as Parameters< - typeof rootSpan.log - >[0]); + rootSpan.log({ classifications }); } const failedScorerNames = logScoringFailures( @@ -1473,15 +1478,21 @@ async function runEvaluatorInternal( } if (collectResults) { - collectedResults.push({ + const baseResult = { input: datum.input, ...("expected" in datum ? { expected: datum.expected } : {}), output, tags: tags.length ? tags : undefined, metadata, - scores: mergedScores, error, origin: baseEvent.event?.origin, + }; + collectedResults.push({ + ...baseResult, + scores: mergedScores, + ...(Object.keys(classifications).length > 0 + ? { classifications } + : {}), }); } }; diff --git a/js/util/index.ts b/js/util/index.ts index 9746567bc..52b082cc1 100644 --- a/js/util/index.ts +++ b/js/util/index.ts @@ -55,7 +55,13 @@ export { ensureNewDatasetRecord, } from "./object"; -export type { Classification, Score, Scorer, ScorerArgs } from "./score"; +export type { + Classification, + ClassificationItem, + Score, + Scorer, + ScorerArgs, +} from "./score"; export { constructJsonArray, deterministicReplacer } from "./json_util"; diff --git a/js/util/object.ts b/js/util/object.ts index 735f52960..fea8735d6 100644 --- a/js/util/object.ts +++ b/js/util/object.ts @@ -21,6 +21,7 @@ export type OtherExperimentLogFields = { error: unknown; tags: string[]; scores: Record; + classifications?: Record; metadata: Record; metrics: Record; datasetRecordId: string; diff --git a/js/util/score.ts b/js/util/score.ts index b8ce404d2..c02365dde 100644 --- a/js/util/score.ts +++ b/js/util/score.ts @@ -1,14 +1,22 @@ /** - * The result returned by a classifier function. Unlike `Score`, `classification` - * is required and the span will be recorded as a classifier span. + * The result returned by a classifier function. Unlike `Score`, `id` is + * required and the span will be recorded as a classifier span. */ export interface Classification { name: string; - /** - * The classification value: either a plain string label, or an object with a - * stable `id` and an optional human-readable `label` (defaults to `id`). - */ - classification: string | { id: string; label?: string }; + id: string; + label?: string; + confidence?: number | null; + metadata?: Record; +} + +/** + * The serialized form of a classification stored in the `classifications` log record. + */ +export interface ClassificationItem { + id: string; + label: string; + confidence?: number | null; metadata?: Record; } From 0eaf765a96dbe4b2af28d6eb3bbed1bdee10033b Mon Sep 17 00:00:00 2001 From: Aswin Karumbunathan Date: Tue, 17 Mar 2026 09:56:32 -0700 Subject: [PATCH 4/5] Updates to match https://github.com/braintrustdata/braintrust-spec/pull/2/changes --- js/dev/server.ts | 4 +-- js/src/framework.test.ts | 11 +++---- js/src/framework.ts | 65 ++++++++++++++++++++++++++++----------- js/src/parameters.test.ts | 14 ++++----- 4 files changed, 60 insertions(+), 34 deletions(-) diff --git a/js/dev/server.ts b/js/dev/server.ts index aee357ad5..4e4daaf4d 100644 --- a/js/dev/server.ts +++ b/js/dev/server.ts @@ -117,7 +117,7 @@ export function runDevServer( evalDefs[name] = { parameters, - scores: evaluator.scores.map((score, idx) => ({ + scores: (evaluator.scores ?? []).map((score, idx) => ({ name: scorerName(score, idx), })), }; @@ -209,7 +209,7 @@ export function runDevServer( { ...evaluator, data: evalData.data, - scores: evaluator.scores.concat( + scores: (evaluator.scores ?? []).concat( scores?.map((score) => makeScorer( state, diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts index de57c2a3c..ddda9d267 100644 --- a/js/src/framework.test.ts +++ b/js/src/framework.test.ts @@ -1498,8 +1498,7 @@ test("classifier-only evaluator populates classifications field", async () => { { data: [{ input: "hello", expected: "greeting" }], task: (input) => input, - scores: [], - classifications: [ + classifiers: [ () => ({ name: "category", id: "greeting", @@ -1551,8 +1550,7 @@ test("multiple classifiers returning the same name append items correctly", asyn { data: [{ input: "hello" }], task: (input) => input, - scores: [], - classifications: [ + classifiers: [ () => [ { name: "category", id: "greeting", label: "Greeting" }, { name: "category", id: "informal", label: "Informal" }, @@ -1586,7 +1584,7 @@ test("mixed evaluator populates both scores and classifications", async () => { score: args.output === args.expected ? 1 : 0, }), ], - classifications: [ + classifiers: [ () => ({ name: "category", id: "greeting", label: "Greeting" }), ], }, @@ -1606,8 +1604,7 @@ test("malformed classifier output fails clearly", async () => { { data: [{ input: "hello" }], task: (input) => input, - scores: [], - classifications: [() => ({}) as never], + classifiers: [() => ({}) as never], }, { noSendLogs: true, returnResults: true }, ); diff --git a/js/src/framework.ts b/js/src/framework.ts index c57fc521c..62e4e33ed 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -219,13 +219,13 @@ type ErrorScoreHandler = (args: { unhandledScores: string[]; }) => Record | undefined | void; -export interface Evaluator< +type EvaluatorBase< Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType, Parameters extends EvalParameters = EvalParameters, -> { +> = { /** * A function that returns a list of inputs, expected outputs, and metadata. */ @@ -236,17 +236,6 @@ export interface Evaluator< */ task: EvalTask; - /** - * A set of functions that take an input, output, and expected value and return a {@link Score}. - */ - scores: EvalScorer[]; - - /** - * A set of functions that take an input, output, and expected value and return a - * {@link Classification}. Results are recorded under the `classifications` column. - */ - classifications?: EvalClassifier[]; - /** * A set of parameters that will be passed to the evaluator. * Can be: @@ -364,7 +353,42 @@ export interface Evaluator< * Flushes spans before calling scoring functions */ flushBeforeScoring?: boolean; -} +}; + +/** + * Defines an evaluator. At least one of `scores` or `classifiers` must be provided. + */ +export type Evaluator< + Input, + Output, + Expected, + Metadata extends BaseMetadata = DefaultMetadataType, + Parameters extends EvalParameters = EvalParameters, +> = EvaluatorBase & + ( + | { + /** + * A set of functions that take an input, output, and expected value and return a {@link Score}. + */ + scores: EvalScorer[]; + /** + * A set of functions that take an input, output, and expected value and return a + * {@link Classification}. Results are recorded under the `classifications` column. + */ + classifiers?: EvalClassifier[]; + } + | { + /** + * A set of functions that take an input, output, and expected value and return a {@link Score}. + */ + scores?: EvalScorer[]; + /** + * A set of functions that take an input, output, and expected value and return a + * {@link Classification}. Results are recorded under the `classifications` column. + */ + classifiers: EvalClassifier[]; + } + ); export class EvalResultWithSummary< Input, @@ -1023,6 +1047,11 @@ export async function runEvaluator( enableCache = true, // eslint-disable-next-line @typescript-eslint/no-explicit-any ): Promise> { + if (!evaluator.scores && !evaluator.classifiers) { + throw new Error( + "Evaluator must include at least one of `scores` or `classifiers`", + ); + } return await runEvaluatorInternal( experiment, evaluator, @@ -1237,8 +1266,8 @@ async function runEvaluatorInternal( let tags: string[] = [...(datum.tags ?? [])]; const scores: Record = {}; const classifications: Record = {}; - const scorerNames = evaluator.scores.map(scorerName); - const classifierNames = (evaluator.classifications ?? []).map( + const scorerNames = (evaluator.scores ?? []).map(scorerName); + const classifierNames = (evaluator.classifiers ?? []).map( classifierName, ); let unhandledScores: string[] | null = scorerNames; @@ -1317,7 +1346,7 @@ async function runEvaluatorInternal( const [scoreResults, classificationResults] = await Promise.all([ Promise.all( - evaluator.scores.map((score, score_idx) => + (evaluator.scores ?? []).map((score, score_idx) => runInScorerSpan( rootSpan, scorerNames[score_idx], @@ -1371,7 +1400,7 @@ async function runEvaluatorInternal( ), ), Promise.all( - (evaluator.classifications ?? []).map((classifier, idx) => + (evaluator.classifiers ?? []).map((classifier, idx) => runInScorerSpan( rootSpan, classifierNames[idx], diff --git a/js/src/parameters.test.ts b/js/src/parameters.test.ts index 99d29950d..d5b7b7e4b 100644 --- a/js/src/parameters.test.ts +++ b/js/src/parameters.test.ts @@ -26,7 +26,7 @@ test("parameters are passed to task", async () => { return output; }, scores: [], - classifications: [], + classifiers: [], parameters: { prefix: z.string().default("start:"), suffix: z.string().default(":end"), @@ -60,7 +60,7 @@ test("prompt parameter is passed correctly", async () => { return input; }, scores: [], - classifications: [], + classifiers: [], parameters: { main: { type: "prompt", @@ -101,7 +101,7 @@ test("custom parameter values override defaults", async () => { return output; }, scores: [], - classifications: [], + classifiers: [], parameters: { prefix: z.string().default("start:"), suffix: z.string().default(":end"), @@ -134,7 +134,7 @@ test("array parameter is handled correctly", async () => { return input; }, scores: [], - classifications: [], + classifiers: [], parameters: { items: z.array(z.string()).default(["item1", "item2"]), }, @@ -165,7 +165,7 @@ test("object parameter is handled correctly", async () => { return input; }, scores: [], - classifications: [], + classifiers: [], parameters: { config: z .object({ @@ -201,7 +201,7 @@ test("model parameter defaults to configured value", async () => { return input; }, scores: [], - classifications: [], + classifiers: [], parameters: { model: { type: "model", @@ -230,7 +230,7 @@ test("model parameter is required when default is missing", async () => { data: [{ input: "test" }], task: async (input: string) => input, scores: [], - classifications: [], + classifiers: [], parameters: { model: { type: "model", From 1421ef71cfd68ce2d9adcc03f58943cb92890483 Mon Sep 17 00:00:00 2001 From: Aswin Karumbunathan Date: Tue, 17 Mar 2026 10:58:05 -0700 Subject: [PATCH 5/5] remove confidence, fix CI we're not using confidence yet, so leave it out for now, we can always add it in later --- js/src/cli/functions/infer-source.ts | 2 +- js/src/cli/functions/upload.ts | 38 +++++++++--------- js/src/framework.test.ts | 2 - js/src/framework.ts | 59 ++++++++++------------------ js/util/score.ts | 2 - 5 files changed, 41 insertions(+), 62 deletions(-) diff --git a/js/src/cli/functions/infer-source.ts b/js/src/cli/functions/infer-source.ts index 179b5c50d..0a759b422 100644 --- a/js/src/cli/functions/infer-source.ts +++ b/js/src/cli/functions/infer-source.ts @@ -85,7 +85,7 @@ export async function findCodeDefinition({ fn = location.position.type === "task" ? evaluator.task - : evaluator.scores[location.position.index]; + : (evaluator.scores ?? [])[location.position.index]; } } else if (location.type === "function") { fn = outFileModule.functions[location.index].handler; diff --git a/js/src/cli/functions/upload.ts b/js/src/cli/functions/upload.ts index ce67f44d9..1745b5329 100644 --- a/js/src/cli/functions/upload.ts +++ b/js/src/cli/functions/upload.ts @@ -180,23 +180,25 @@ export async function uploadHandleBundles({ function_type: "task", origin, }, - ...evaluator.evaluator.scores.map((score, i): BundledFunctionSpec => { - const name = scorerName(score, i); - return { - ...baseInfo, - // There is a very small chance that someone names a function with the same convention, but - // let's assume it's low enough that it doesn't matter. - ...formatNameAndSlug(["eval", namePrefix, "scorer", name]), - description: `Score ${name} for eval ${namePrefix}`, - location: { - type: "experiment", - eval_name: evaluator.evaluator.evalName, - position: { type: "scorer", index: i }, - }, - function_type: "scorer", - origin, - }; - }), + ...(evaluator.evaluator.scores ?? []).map( + (score, i): BundledFunctionSpec => { + const name = scorerName(score, i); + return { + ...baseInfo, + // There is a very small chance that someone names a function with the same convention, but + // let's assume it's low enough that it doesn't matter. + ...formatNameAndSlug(["eval", namePrefix, "scorer", name]), + description: `Score ${name} for eval ${namePrefix}`, + location: { + type: "experiment", + eval_name: evaluator.evaluator.evalName, + position: { type: "scorer", index: i }, + }, + function_type: "scorer", + origin, + }; + }, + ), ]; bundleSpecs.push(...fileSpecs); @@ -219,7 +221,7 @@ export async function uploadHandleBundles({ serializeRemoteEvalParametersContainer(resolvedParameters), } : {}), - scores: evaluator.evaluator.scores.map((score, i) => ({ + scores: (evaluator.evaluator.scores ?? []).map((score, i) => ({ name: scorerName(score, i), })), }; diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts index ddda9d267..45989fcdf 100644 --- a/js/src/framework.test.ts +++ b/js/src/framework.test.ts @@ -1503,7 +1503,6 @@ test("classifier-only evaluator populates classifications field", async () => { name: "category", id: "greeting", label: "Greeting", - confidence: 0.91, metadata: { source: "unit-test" }, }), ], @@ -1517,7 +1516,6 @@ test("classifier-only evaluator populates classifications field", async () => { { id: "greeting", label: "Greeting", - confidence: 0.91, metadata: { source: "unit-test" }, }, ]); diff --git a/js/src/framework.ts b/js/src/framework.ts index 62e4e33ed..c2bad700d 100644 --- a/js/src/framework.ts +++ b/js/src/framework.ts @@ -219,13 +219,17 @@ type ErrorScoreHandler = (args: { unhandledScores: string[]; }) => Record | undefined | void; -type EvaluatorBase< +/** + * Defines an evaluator. At least one of `scores` or `classifiers` must be provided; + * a runtime error is raised if neither is present. + */ +export interface Evaluator< Input, Output, Expected, Metadata extends BaseMetadata = DefaultMetadataType, Parameters extends EvalParameters = EvalParameters, -> = { +> { /** * A function that returns a list of inputs, expected outputs, and metadata. */ @@ -236,6 +240,19 @@ type EvaluatorBase< */ task: EvalTask; + /** + * A set of functions that take an input, output, and expected value and return a {@link Score}. + * At least one of `scores` or `classifiers` must be provided. + */ + scores?: EvalScorer[]; + + /** + * A set of functions that take an input, output, and expected value and return a + * {@link Classification}. Results are recorded under the `classifications` column. + * At least one of `scores` or `classifiers` must be provided. + */ + classifiers?: EvalClassifier[]; + /** * A set of parameters that will be passed to the evaluator. * Can be: @@ -353,42 +370,7 @@ type EvaluatorBase< * Flushes spans before calling scoring functions */ flushBeforeScoring?: boolean; -}; - -/** - * Defines an evaluator. At least one of `scores` or `classifiers` must be provided. - */ -export type Evaluator< - Input, - Output, - Expected, - Metadata extends BaseMetadata = DefaultMetadataType, - Parameters extends EvalParameters = EvalParameters, -> = EvaluatorBase & - ( - | { - /** - * A set of functions that take an input, output, and expected value and return a {@link Score}. - */ - scores: EvalScorer[]; - /** - * A set of functions that take an input, output, and expected value and return a - * {@link Classification}. Results are recorded under the `classifications` column. - */ - classifiers?: EvalClassifier[]; - } - | { - /** - * A set of functions that take an input, output, and expected value and return a {@link Score}. - */ - scores?: EvalScorer[]; - /** - * A set of functions that take an input, output, and expected value and return a - * {@link Classification}. Results are recorded under the `classifications` column. - */ - classifiers: EvalClassifier[]; - } - ); +} export class EvalResultWithSummary< Input, @@ -1007,7 +989,6 @@ function toClassificationItem(c: Classification): ClassificationItem { return { id: c.id, label: c.label ?? c.id, - ...(c.confidence !== undefined ? { confidence: c.confidence } : {}), ...(c.metadata !== undefined ? { metadata: c.metadata } : {}), }; } diff --git a/js/util/score.ts b/js/util/score.ts index c02365dde..08daebeef 100644 --- a/js/util/score.ts +++ b/js/util/score.ts @@ -6,7 +6,6 @@ export interface Classification { name: string; id: string; label?: string; - confidence?: number | null; metadata?: Record; } @@ -16,7 +15,6 @@ export interface Classification { export interface ClassificationItem { id: string; label: string; - confidence?: number | null; metadata?: Record; }