From 80eb21027fb85b6f97e71068025272a804e74e53 Mon Sep 17 00:00:00 2001
From: Aswin Karumbunathan <aswin@braintrustdata.com>
Date: Thu, 12 Mar 2026 12:06:59 -0700
Subject: [PATCH 1/5] Add support for classifier "scorers"

---
 js/src/framework.ts | 59 +++++++++++++++++++++++++++++++++++++--------
 js/util/score.ts    |  9 +++++++
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/js/src/framework.ts b/js/src/framework.ts
index 637e688d9..78cee4bc8 100644
--- a/js/src/framework.ts
+++ b/js/src/framework.ts
@@ -227,6 +227,12 @@ export interface Evaluator<
    */
   scores: EvalScorer<Input, Output, Expected, Metadata>[];
 
+  /**
+   * Optional span types per scorer. When provided, same length as `scores`.
+   * Use `"classifier"` for classifier spans; otherwise `"score"`.
+   */
+  scorerSpanTypes?: ("score" | "classifier")[];
+
   /**
    * A set of parameters that will be passed to the evaluator.
    * Can be:
@@ -1154,6 +1160,7 @@ async function runEvaluatorInternal(
               output,
               trace,
             };
+            const scorerSpanTypes = evaluator.scorerSpanTypes ?? [];
             const scoreResults = await Promise.all(
               evaluator.scores.map(async (score, score_idx) => {
                 try {
@@ -1192,7 +1199,14 @@ async function runEvaluatorInternal(
                           ];
 
                     const getOtherFields = (s: Score) => {
-                      const { metadata: _metadata, name: _name, ...rest } = s;
+                      const {
+                        metadata: _metadata,
+                        name: _name,
+                        classification: _classification,
+                        id: _id,
+                        label: _label,
+                        ...rest
+                      } = s;
                       return rest;
                     };
 
@@ -1212,31 +1226,36 @@ async function runEvaluatorInternal(
                         ? getOtherFields(results[0])
                         : results.reduce(
                             (prev, s) =>
-                              mergeDicts(prev, { [s.name]: getOtherFields(s) }),
+                              mergeDicts(prev, {
+                                [s.name]: getOtherFields(s),
+                              }),
                             {},
                           );
 
-                    const scores = results.reduce(
-                      (prev, s) => mergeDicts(prev, { [s.name]: s.score }),
+                    const scoresRecord = results.reduce(
+                      (prev, s) =>
+                        mergeDicts(prev, { [s.name]: s.score }),
                       {},
                     );
 
                     span.log({
                       output: resultOutput,
                       metadata: resultMetadata,
-                      scores: scores,
+                      scores: scoresRecord,
                     });
                     return results;
                   };
 
-                  // Exclude trace from logged input since it contains internal state
-                  // that shouldn't be serialized (spansFlushPromise, spansFlushed, etc.)
                   const { trace: _trace, ...scoringArgsForLogging } =
                     scoringArgs;
+                  const spanType =
+                    scorerSpanTypes[score_idx] === "classifier"
+                      ? SpanTypeAttribute.CLASSIFIER
+                      : SpanTypeAttribute.SCORE;
                   const results = await rootSpan.traced(runScorer, {
                     name: scorerNames[score_idx],
                     spanAttributes: {
-                      type: SpanTypeAttribute.SCORE,
+                      type: spanType,
                       purpose: "scorer",
                     },
                     propagatedEvent: makeScorerPropagatedEvent(
@@ -1250,21 +1269,41 @@ async function runEvaluatorInternal(
                 }
               }),
             );
-            // Resolve each promise on its own so that we can separate the passing
-            // from the failing ones.
             const failingScorersAndResults: { name: string; error: unknown }[] =
               [];
+            const classifications: Record<
+              string,
+              { id: string; label: string }[]
+            > = {};
             scoreResults.forEach((results, i) => {
               const name = scorerNames[i];
               if (results.kind === "score") {
                 (results.value || []).forEach((result) => {
                   scores[result.name] = result.score;
+                  if (result.id != null) {
+                    classifications[result.name] = [
+                      { id: result.id, label: result.label ?? result.id },
+                    ];
+                  } else if (result.classification != null) {
+                    classifications[result.name] = [
+                      {
+                        id: result.classification,
+                        label: result.classification,
+                      },
+                    ];
+                  }
                 });
               } else {
                 failingScorersAndResults.push({ name, error: results.value });
               }
             });
 
+            if (Object.keys(classifications).length > 0) {
+              rootSpan.log({ classifications } as Parameters<
+                typeof rootSpan.log
+              >[0]);
+            }
+
             unhandledScores = null;
             if (failingScorersAndResults.length) {
               const scorerErrors = Object.fromEntries(
diff --git a/js/util/score.ts b/js/util/score.ts
index 758902344..373a1c6de 100644
--- a/js/util/score.ts
+++ b/js/util/score.ts
@@ -1,7 +1,16 @@
+/**
+ * A classification result: either a plain string label, or an object with a
+ * stable `id` and an optional human-readable `label` (defaults to `id`).
+ * When set, the value is recorded in the `classifications` column keyed by
+ * scorer name instead of (or in addition to) `score`.
+ */
+export type Classification = string | { id: string; label?: string };
+
 export interface Score {
   name: string;
   score: number | null;
   metadata?: Record<string, unknown>;
+  classification?: Classification;
   // DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller.
   /**
    * @deprecated

From 76cc66dc4016f76d6fed5a6ab0331340b52b297f Mon Sep 17 00:00:00 2001
From: Aswin Karumbunathan <aswin@braintrustdata.com>
Date: Thu, 12 Mar 2026 16:05:57 -0700
Subject: [PATCH 2/5] take 2: with a separate Classification type

---
 js/src/exports.ts         |   1 +
 js/src/framework.test.ts  |  23 ++
 js/src/framework.ts       | 434 ++++++++++++++++++++++++--------------
 js/src/parameters.test.ts |   7 +
 js/util/index.ts          |   2 +-
 js/util/score.ts          |  17 +-
 6 files changed, 322 insertions(+), 162 deletions(-)

diff --git a/js/src/exports.ts b/js/src/exports.ts
index 6eb2b5c1f..352d5a80e 100644
--- a/js/src/exports.ts
+++ b/js/src/exports.ts
@@ -185,6 +185,7 @@ export type {
   EvalResult,
   EvalScorerArgs,
   EvalScorer,
+  EvalClassifier,
   EvaluatorDef,
   EvaluatorFile,
   ReporterBody,
diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts
index bd9381342..f431a728f 100644
--- a/js/src/framework.test.ts
+++ b/js/src/framework.test.ts
@@ -47,6 +47,7 @@ test("meta (write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
+      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -85,6 +86,7 @@ test("metadata (read/write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
+      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -125,6 +127,7 @@ test("expected (read/write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
+      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -175,6 +178,7 @@ describe("runEvaluator", () => {
           scores: Array.from({ length: 3 }, (_, i) =>
             makeTestScorer(`scorer_${i}`),
           ),
+          classifications: [],
         },
         new NoopProgressReporter(),
         [],
@@ -202,6 +206,7 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
+              classifications: [],
               errorScoreHandler: defaultErrorScoreHandler,
             },
             new NoopProgressReporter(),
@@ -232,6 +237,7 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`, i === 0),
               ),
+              classifications: [],
               errorScoreHandler: defaultErrorScoreHandler,
             },
             new NoopProgressReporter(),
@@ -266,6 +272,7 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
+              classifications: [],
               errorScoreHandler: () => undefined,
             },
             new NoopProgressReporter(),
@@ -292,6 +299,7 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
+              classifications: [],
               errorScoreHandler: () => ({ error_score: 1 }),
             },
             new NoopProgressReporter(),
@@ -345,6 +353,7 @@ describe("runEvaluator", () => {
               return input * 2;
             },
             scores: [],
+            classifications: [],
             timeout: 10,
             maxConcurrency: 1,
           },
@@ -394,6 +403,7 @@ describe("runEvaluator", () => {
               return input * 2;
             },
             scores: [],
+            classifications: [],
             signal: abortController.signal,
             maxConcurrency: 1,
           },
@@ -431,6 +441,7 @@ describe("runEvaluator", () => {
             return input * 2;
           },
           scores: [],
+          classifications: [],
         },
         new NoopProgressReporter(),
         [],
@@ -458,6 +469,7 @@ test("trialIndex is passed to task", async () => {
         return input * 2;
       },
       scores: [],
+      classifications: [],
       trialCount: 3,
     },
     new NoopProgressReporter(),
@@ -500,6 +512,7 @@ test("trialIndex with multiple inputs", async () => {
         return input * 2;
       },
       scores: [],
+      classifications: [],
       trialCount: 2,
     },
     new NoopProgressReporter(),
@@ -546,6 +559,7 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn
         }),
         () => ({ name: "simple_scorer", score: 0.8 }),
       ],
+      classifications: [],
     },
     { noSendLogs: true, returnResults: true },
   );
@@ -592,6 +606,7 @@ test("Eval with returnResults: false produces empty results but valid summary",
         () => ({ name: "length_score", score: 0.75 }),
         () => ({ name: "quality_score", score: 0.9 }),
       ],
+      classifications: [],
     },
     { noSendLogs: true, returnResults: false },
   );
@@ -629,6 +644,7 @@ test("Eval with returnResults: true collects all results", async () => {
           score: args.output === args.expected ? 1 : 0,
         }),
       ],
+      classifications: [],
     },
     { noSendLogs: true, returnResults: true },
   );
@@ -672,6 +688,7 @@ test("tags can be appended and logged to root span", async () => {
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
+      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -721,6 +738,7 @@ test.each([
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
+      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -761,6 +779,7 @@ test("tags are persisted with a failing scorer", async () => {
           throw new Error("test error");
         },
       ],
+      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -794,6 +813,7 @@ test("tags remain empty when not set", async () => {
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
+      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -830,6 +850,7 @@ test("scorer spans have purpose='scorer' attribute", async () => {
           score: args.output === args.expected ? 1 : 0,
         }),
       ],
+      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -1471,6 +1492,7 @@ test("Eval with enableCache: false does not use span cache", async () => {
       data: [{ input: 1, expected: 2 }],
       task: (input) => input * 2,
       scores: [],
+      classifications: [],
       state,
     },
     { noSendLogs: true, enableCache: false },
@@ -1496,6 +1518,7 @@ test("Eval with enableCache: true (default) uses span cache", async () => {
       data: [{ input: 1, expected: 2 }],
       task: (input) => input * 2,
       scores: [],
+      classifications: [],
       state,
     },
     { noSendLogs: true }, // enableCache defaults to true
diff --git a/js/src/framework.ts b/js/src/framework.ts
index 78cee4bc8..ff1851783 100644
--- a/js/src/framework.ts
+++ b/js/src/framework.ts
@@ -1,6 +1,7 @@
 import {
   makeScorerPropagatedEvent,
   mergeDicts,
+  Classification,
   Score,
   SpanComponentsV3,
   SpanTypeAttribute,
@@ -186,6 +187,26 @@ export type EvalScorer<
   args: EvalScorerArgs<Input, Output, Expected, Metadata>,
 ) => OneOrMoreScores | Promise<OneOrMoreScores>;
 
+/**
+ * What a classifier may return: a single {@link Classification}, an array of
+ * them, or `null` when it has nothing to report.
+ */
+export type OneOrMoreClassifications = Classification | Classification[] | null;
+
+/**
+ * A classifier function. It receives the same arguments as an
+ * {@link EvalScorer} and returns its classification(s) either directly or as
+ * a promise.
+ */
+export type EvalClassifier<
+  Input,
+  Output,
+  Expected,
+  Metadata extends BaseMetadata = DefaultMetadataType,
+> = (
+  args: EvalScorerArgs<Input, Output, Expected, Metadata>,
+) => OneOrMoreClassifications | Promise<OneOrMoreClassifications>;
+
 export type EvalResult<
   Input,
   Output,
@@ -223,15 +235,16 @@ export interface Evaluator<
   task: EvalTask<Input, Output, Expected, Metadata, Parameters>;
 
   /**
-   * A set of functions that take an input, output, and expected value and return a score.
+   * A set of functions that take an input, output, and expected value and return a {@link Score}.
    */
   scores: EvalScorer<Input, Output, Expected, Metadata>[];
 
   /**
-   * Optional span types per scorer. When provided, same length as `scores`.
-   * Use `"classifier"` for classifier spans; otherwise `"score"`.
+   * A set of functions that take an input, output, and expected value and return a
+   * {@link Classification}, an array of them, or `null` to record nothing. Results are
+   * logged under the `classifications` column, keyed by each classification's `name`.
    */
-  scorerSpanTypes?: ("score" | "classifier")[];
+  classifications: EvalClassifier<Input, Output, Expected, Metadata>[];
 
   /**
    * A set of parameters that will be passed to the evaluator.
@@ -870,6 +883,149 @@ export function scorerName(
   return scorer.name || `scorer_${scorer_idx}`;
 }
 
+/**
+ * Returns the display name for a classifier: the function's own `name`, or
+ * `classifier_<index>` when that name is empty (e.g. an anonymous function).
+ */
+export function classifierName(
+  classifier: EvalClassifier<any, any, any, any>,
+  classifier_idx: number,
+) {
+  return classifier.name || `classifier_${classifier_idx}`;
+}
+
+/**
+ * Builds the `metadata` payload logged on a scorer/classifier span. A single
+ * result contributes its own metadata unchanged; otherwise the metadata of
+ * every result is merged into one object keyed by result name.
+ */
+function buildSpanMetadata(
+  results: Array<{ name: string; metadata?: Record<string, unknown> }>,
+) {
+  return results.length === 1
+    ? results[0].metadata
+    : results.reduce(
+        (prev, s) => mergeDicts(prev, { [s.name]: s.metadata }),
+        {},
+      );
+}
+
+/**
+ * Builds the `scores` record (result name -> score) for a scorer span,
+ * together with the span metadata produced by {@link buildSpanMetadata}.
+ */
+function buildSpanScores(
+  results: Array<{
+    name: string;
+    score: number | null;
+    metadata?: Record<string, unknown>;
+  }>,
+) {
+  const scoresRecord = results.reduce(
+    (prev, s) => mergeDicts(prev, { [s.name]: s.score }),
+    {},
+  );
+  return { resultMetadata: buildSpanMetadata(results), scoresRecord };
+}
+
+/**
+ * Runs `fn` through `rootSpan.traced` and converts the outcome into a tagged
+ * result instead of letting an error propagate, so callers can run many
+ * scorers/classifiers together and separate successes from failures.
+ *
+ * @param rootSpan Span whose `traced` method wraps the call.
+ * @param spanName Passed to `traced` as the span `name`.
+ * @param spanType Passed as the span's `type` attribute; `purpose` is always
+ *   `"scorer"`.
+ * @param propagatedEvent Passed to `traced` as `propagatedEvent`.
+ * @param eventInput Logged as the `input` of the span's event.
+ * @param fn Work to run; receives the traced span.
+ * @returns `{ kind: "score", value }` on success (the tag is `"score"` for
+ *   classifiers as well), or `{ kind: "error", value }` holding whatever was
+ *   thrown.
+ */
+async function runInScorerSpan<T>(
+  rootSpan: Span,
+  spanName: string,
+  spanType: SpanTypeAttribute,
+  propagatedEvent: ReturnType<typeof makeScorerPropagatedEvent>,
+  eventInput: unknown,
+  fn: (span: Span) => Promise<T[] | null>,
+): Promise<
+  { kind: "score"; value: T[] | null } | { kind: "error"; value: unknown }
+> {
+  try {
+    const value = await rootSpan.traced(fn, {
+      name: spanName,
+      spanAttributes: { type: spanType, purpose: "scorer" },
+      propagatedEvent,
+      event: { input: eventInput },
+    });
+    return { kind: "score", value };
+  } catch (e) {
+    return { kind: "error", value: e };
+  }
+}
+
+/**
+ * Walks the outcomes produced by {@link runInScorerSpan}: every successful
+ * result is handed to `onResult`, and every failure is paired with the name at
+ * the same index in `names`.
+ *
+ * @returns The failures, in the order they appear in `runResults`.
+ */
+function collectScoringResults<T extends { name: string }>(
+  runResults: Array<
+    { kind: "score"; value: T[] | null } | { kind: "error"; value: unknown }
+  >,
+  names: string[],
+  onResult: (result: T) => void,
+): { name: string; error: unknown }[] {
+  const failing: { name: string; error: unknown }[] = [];
+  runResults.forEach((r, i) => {
+    if (r.kind === "score") {
+      // A `null` value is treated as "no results".
+      // Call `onResult` explicitly so it never sees forEach's index/array args.
+      (r.value ?? []).forEach((result) => onResult(result));
+    } else {
+      failing.push({ name: names[i], error: r.value });
+    }
+  });
+  return failing;
+}
+
+/**
+ * Records scorer/classifier failures: stores a name -> error-text map under
+ * `<kind>_errors` on `metadata` (mutated in place) and on the root span, and
+ * emits a warning through the debug logger. Does nothing when `failures` is
+ * empty.
+ *
+ * @param kind Label used in the metadata key and warning, e.g. `"scorer"`.
+ * @returns The names of the failing functions (empty when there are none).
+ */
+function logScoringFailures(
+  kind: string,
+  failures: { name: string; error: unknown }[],
+  metadata: Record<string, unknown>,
+  rootSpan: Span,
+  state: BraintrustState | undefined,
+): string[] {
+  if (!failures.length) return [];
+  // `Error.stack` is optional; fall back to the string form so a failure is
+  // never recorded as `undefined`.
+  const describeError = (error: unknown): string =>
+    (error instanceof Error ? error.stack : undefined) ?? `${error}`;
+  const errorMap = Object.fromEntries(
+    failures.map(({ name, error }) => [name, describeError(error)]),
+  );
+  metadata[`${kind}_errors`] = errorMap;
+  rootSpan.log({ metadata: { [`${kind}_errors`]: errorMap } });
+  debugLogger.forState(state).warn(
+    `Found exceptions for the following ${kind}s: ${Object.keys(errorMap).join(", ")}`,
+    failures.map((f) => f.error),
+  );
+  return Object.keys(errorMap);
+}
+
 export async function runEvaluator(
   experiment: Experiment | null,
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
@@ -1096,6 +1205,7 @@ async function runEvaluatorInternal(
           let tags: string[] = [...(datum.tags ?? [])];
           const scores: Record<string, number | null> = {};
           const scorerNames = evaluator.scores.map(scorerName);
+          const classifierNames = evaluator.classifications.map(classifierName);
           let unhandledScores: string[] | null = scorerNames;
           try {
             const meta = (o: Record<string, unknown>) =>
@@ -1160,143 +1270,162 @@ async function runEvaluatorInternal(
               output,
               trace,
             };
-            const scorerSpanTypes = evaluator.scorerSpanTypes ?? [];
-            const scoreResults = await Promise.all(
-              evaluator.scores.map(async (score, score_idx) => {
-                try {
-                  const runScorer = async (span: Span) => {
-                    const scoreResult = score(scoringArgs);
-                    const scoreValue =
-                      scoreResult instanceof Promise
-                        ? await scoreResult
-                        : scoreResult;
-
-                    if (scoreValue === null) {
-                      return null;
-                    }
-
-                    if (Array.isArray(scoreValue)) {
-                      for (const s of scoreValue) {
-                        if (!(typeof s === "object" && !isEmpty(s))) {
-                          throw new Error(
-                            `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify(
-                              s,
-                            )}`,
-                          );
+            const { trace: _trace, ...scoringArgsForLogging } = scoringArgs;
+            const propagatedEvent = makeScorerPropagatedEvent(
+              await rootSpan.export(),
+            );
+
+            const getOtherFields = (s: Score) => {
+              const { metadata: _metadata, name: _name, ...rest } = s;
+              return rest;
+            };
+
+            const [scoreResults, classificationResults] = await Promise.all([
+              Promise.all(
+                evaluator.scores.map((score, score_idx) =>
+                  runInScorerSpan(
+                    rootSpan,
+                    scorerNames[score_idx],
+                    SpanTypeAttribute.SCORE,
+                    propagatedEvent,
+                    scoringArgsForLogging,
+                    async (span) => {
+                      const scoreValue = await Promise.resolve(
+                        score(scoringArgs),
+                      );
+                      if (scoreValue === null) return null;
+                      if (Array.isArray(scoreValue)) {
+                        for (const s of scoreValue) {
+                          if (!(typeof s === "object" && !isEmpty(s))) {
+                            throw new Error(
+                              `When returning an array of scores, each score must be a non-empty object. Got: ${JSON.stringify(s)}`,
+                            );
+                          }
                         }
                       }
-                    }
-
-                    const results = Array.isArray(scoreValue)
-                      ? scoreValue
-                      : typeof scoreValue === "object" && !isEmpty(scoreValue)
-                        ? [scoreValue]
-                        : [
-                            {
-                              name: scorerNames[score_idx],
-                              score: scoreValue,
-                            },
-                          ];
-
-                    const getOtherFields = (s: Score) => {
-                      const {
-                        metadata: _metadata,
-                        name: _name,
-                        classification: _classification,
-                        id: _id,
-                        label: _label,
-                        ...rest
-                      } = s;
-                      return rest;
-                    };
-
-                    const resultMetadata =
-                      results.length === 1
-                        ? results[0].metadata
-                        : results.reduce(
-                            (prev, s) =>
-                              mergeDicts(prev, {
-                                [s.name]: s.metadata,
-                              }),
-                            {},
-                          );
-
-                    const resultOutput =
-                      results.length === 1
-                        ? getOtherFields(results[0])
-                        : results.reduce(
-                            (prev, s) =>
-                              mergeDicts(prev, {
-                                [s.name]: getOtherFields(s),
-                              }),
-                            {},
-                          );
-
-                    const scoresRecord = results.reduce(
-                      (prev, s) =>
-                        mergeDicts(prev, { [s.name]: s.score }),
-                      {},
-                    );
-
-                    span.log({
-                      output: resultOutput,
-                      metadata: resultMetadata,
-                      scores: scoresRecord,
-                    });
-                    return results;
-                  };
-
-                  const { trace: _trace, ...scoringArgsForLogging } =
-                    scoringArgs;
-                  const spanType =
-                    scorerSpanTypes[score_idx] === "classifier"
-                      ? SpanTypeAttribute.CLASSIFIER
-                      : SpanTypeAttribute.SCORE;
-                  const results = await rootSpan.traced(runScorer, {
-                    name: scorerNames[score_idx],
-                    spanAttributes: {
-                      type: spanType,
-                      purpose: "scorer",
+                      const results: Score[] = Array.isArray(scoreValue)
+                        ? scoreValue
+                        : typeof scoreValue === "object" && !isEmpty(scoreValue)
+                          ? [scoreValue]
+                          : [
+                              {
+                                name: scorerNames[score_idx],
+                                score: scoreValue,
+                              },
+                            ];
+                      const { resultMetadata, scoresRecord } =
+                        buildSpanScores(results);
+                      const resultOutput =
+                        results.length === 1
+                          ? getOtherFields(results[0])
+                          : results.reduce(
+                              (prev, s) =>
+                                mergeDicts(prev, {
+                                  [s.name]: getOtherFields(s),
+                                }),
+                              {},
+                            );
+                      span.log({
+                        output: resultOutput,
+                        metadata: resultMetadata,
+                        scores: scoresRecord,
+                      });
+                      return results;
                     },
-                    propagatedEvent: makeScorerPropagatedEvent(
-                      await rootSpan.export(),
-                    ),
-                    event: { input: scoringArgsForLogging },
-                  });
-                  return { kind: "score", value: results } as const;
-                } catch (e) {
-                  return { kind: "error", value: e } as const;
-                }
-              }),
+                  ),
+                ),
+              ),
+              Promise.all(
+                evaluator.classifications.map((classifier, idx) =>
+                  runInScorerSpan(
+                    rootSpan,
+                    classifierNames[idx],
+                    SpanTypeAttribute.CLASSIFIER,
+                    propagatedEvent,
+                    scoringArgsForLogging,
+                    async (span) => {
+                      const classifierValue = await Promise.resolve(
+                        classifier(scoringArgs),
+                      );
+                      if (classifierValue === null) return null;
+                      const rawResults = Array.isArray(classifierValue)
+                        ? classifierValue
+                        : [classifierValue];
+                      // Normalize: if the result is a raw classification value
+                      // (string or { id, label? }) rather than a full
+                      // Classification object, wrap it using the classifier name.
+                      const toClassification = (r: unknown): Classification => {
+                        if (
+                          r !== null &&
+                          typeof r === "object" &&
+                          "classification" in r &&
+                          "name" in r
+                        ) {
+                          return r as Classification;
+                        }
+                        return {
+                          name: classifierNames[idx],
+                          classification: r as
+                            | string
+                            | { id: string; label?: string },
+                        };
+                      };
+                      const results = rawResults.map(toClassification);
+                      const toIdLabel = (
+                        c: string | { id: string; label?: string } | undefined,
+                      ) =>
+                        c == null
+                          ? null
+                          : typeof c === "string"
+                            ? { id: c, label: c }
+                            : { id: c.id, label: c.label ?? c.id };
+                      const resultOutput =
+                        results.length === 1
+                          ? toIdLabel(results[0].classification)
+                          : results.reduce(
+                              (prev, r) =>
+                                mergeDicts(prev, {
+                                  [r.name]: toIdLabel(r.classification),
+                                }),
+                              {},
+                            );
+                      span.log({
+                        output: resultOutput,
+                        metadata: buildSpanMetadata(results),
+                      });
+                      return results;
+                    },
+                  ),
+                ),
+              ),
+            ]);
+
+            const failingScorers = collectScoringResults(
+              scoreResults,
+              scorerNames,
+              (result) => {
+                scores[result.name] = result.score;
+              },
             );
-            const failingScorersAndResults: { name: string; error: unknown }[] =
-              [];
+
             const classifications: Record<
               string,
               { id: string; label: string }[]
             > = {};
-            scoreResults.forEach((results, i) => {
-              const name = scorerNames[i];
-              if (results.kind === "score") {
-                (results.value || []).forEach((result) => {
-                  scores[result.name] = result.score;
-                  if (result.id != null) {
-                    classifications[result.name] = [
-                      { id: result.id, label: result.label ?? result.id },
-                    ];
-                  } else if (result.classification != null) {
-                    classifications[result.name] = [
-                      {
-                        id: result.classification,
-                        label: result.classification,
-                      },
-                    ];
-                  }
-                });
-              } else {
-                failingScorersAndResults.push({ name, error: results.value });
-              }
-            });
+            const failingClassifiers = collectScoringResults(
+              classificationResults,
+              classifierNames,
+              (result) => {
+                const c = result.classification;
+                if (typeof c === "string") {
+                  classifications[result.name] = [{ id: c, label: c }];
+                } else {
+                  classifications[result.name] = [
+                    { id: c.id, label: c.label ?? c.id },
+                  ];
+                }
+              },
+            );
 
             if (Object.keys(classifications).length > 0) {
               rootSpan.log({ classifications } as Parameters<
@@ -1304,28 +1433,23 @@ async function runEvaluatorInternal(
               >[0]);
             }
 
-            unhandledScores = null;
-            if (failingScorersAndResults.length) {
-              const scorerErrors = Object.fromEntries(
-                failingScorersAndResults.map(({ name, error }) => [
-                  name,
-                  error instanceof Error ? error.stack : `${error}`,
-                ]),
-              );
-              metadata["scorer_errors"] = scorerErrors;
-              rootSpan.log({
-                metadata: { scorer_errors: scorerErrors },
-              });
-              const names = Object.keys(scorerErrors).join(", ");
-              const errors = failingScorersAndResults.map((item) => item.error);
-              unhandledScores = Object.keys(scorerErrors);
-              debugLogger
-                .forState(evaluator.state)
-                .warn(
-                  `Found exceptions for the following scorers: ${names}`,
-                  errors,
-                );
-            }
+            const failedScorerNames = logScoringFailures(
+              "scorer",
+              failingScorers,
+              metadata,
+              rootSpan,
+              evaluator.state,
+            );
+            unhandledScores = failedScorerNames.length
+              ? failedScorerNames
+              : null;
+            logScoringFailures(
+              "classifier",
+              failingClassifiers,
+              metadata,
+              rootSpan,
+              evaluator.state,
+            );
           } catch (e) {
             logSpanError(rootSpan, e);
             error = e;
diff --git a/js/src/parameters.test.ts b/js/src/parameters.test.ts
index dbba8ea49..99d29950d 100644
--- a/js/src/parameters.test.ts
+++ b/js/src/parameters.test.ts
@@ -26,6 +26,7 @@ test("parameters are passed to task", async () => {
         return output;
       },
       scores: [],
+      classifications: [],
       parameters: {
         prefix: z.string().default("start:"),
         suffix: z.string().default(":end"),
@@ -59,6 +60,7 @@ test("prompt parameter is passed correctly", async () => {
         return input;
       },
       scores: [],
+      classifications: [],
       parameters: {
         main: {
           type: "prompt",
@@ -99,6 +101,7 @@ test("custom parameter values override defaults", async () => {
         return output;
       },
       scores: [],
+      classifications: [],
       parameters: {
         prefix: z.string().default("start:"),
         suffix: z.string().default(":end"),
@@ -131,6 +134,7 @@ test("array parameter is handled correctly", async () => {
         return input;
       },
       scores: [],
+      classifications: [],
       parameters: {
         items: z.array(z.string()).default(["item1", "item2"]),
       },
@@ -161,6 +165,7 @@ test("object parameter is handled correctly", async () => {
         return input;
       },
       scores: [],
+      classifications: [],
       parameters: {
         config: z
           .object({
@@ -196,6 +201,7 @@ test("model parameter defaults to configured value", async () => {
         return input;
       },
       scores: [],
+      classifications: [],
       parameters: {
         model: {
           type: "model",
@@ -224,6 +230,7 @@ test("model parameter is required when default is missing", async () => {
         data: [{ input: "test" }],
         task: async (input: string) => input,
         scores: [],
+        classifications: [],
         parameters: {
           model: {
             type: "model",
diff --git a/js/util/index.ts b/js/util/index.ts
index 25a76cc03..9746567bc 100644
--- a/js/util/index.ts
+++ b/js/util/index.ts
@@ -55,7 +55,7 @@ export {
   ensureNewDatasetRecord,
 } from "./object";
 
-export type { Score, Scorer, ScorerArgs } from "./score";
+export type { Classification, Score, Scorer, ScorerArgs } from "./score";
 
 export { constructJsonArray, deterministicReplacer } from "./json_util";
 
diff --git a/js/util/score.ts b/js/util/score.ts
index 373a1c6de..b8ce404d2 100644
--- a/js/util/score.ts
+++ b/js/util/score.ts
@@ -1,16 +1,21 @@
 /**
- * A classification result: either a plain string label, or an object with a
- * stable `id` and an optional human-readable `label` (defaults to `id`).
- * When set, the value is recorded in the `classifications` column keyed by
- * scorer name instead of (or in addition to) `score`.
+ * The result returned by a classifier function. Unlike `Score`, `classification`
+ * is required and the span will be recorded as a classifier span.
  */
-export type Classification = string | { id: string; label?: string };
+export interface Classification {
+  name: string;
+  /**
+   * The classification value: either a plain string label, or an object with a
+   * stable `id` and an optional human-readable `label` (defaults to `id`).
+   */
+  classification: string | { id: string; label?: string };
+  metadata?: Record<string, unknown>;
+}
 
 export interface Score {
   name: string;
   score: number | null;
   metadata?: Record<string, unknown>;
-  classification?: Classification;
   // DEPRECATION_NOTICE: this field is deprecated, as errors are propagated up to the caller.
   /**
    * @deprecated

From bbaf36254ce2ccce46a46e031712c62385acab19 Mon Sep 17 00:00:00 2001
From: Aswin Karumbunathan <aswin@braintrustdata.com>
Date: Mon, 16 Mar 2026 11:42:58 -0700
Subject: [PATCH 3/5] Address review feedback

---
 js/src/framework.test.ts | 202 ++++++++++++++++++++++++++++-----------
 js/src/framework.ts      | 131 +++++++++++++------------
 js/util/index.ts         |   8 +-
 js/util/object.ts        |   1 +
 js/util/score.ts         |  22 +++--
 5 files changed, 241 insertions(+), 123 deletions(-)

diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts
index f431a728f..de57c2a3c 100644
--- a/js/src/framework.test.ts
+++ b/js/src/framework.test.ts
@@ -47,7 +47,6 @@ test("meta (write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
-      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -86,7 +85,6 @@ test("metadata (read/write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
-      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -127,7 +125,6 @@ test("expected (read/write) is passed to task", async () => {
         return input * 2;
       },
       scores: [],
-      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -178,12 +175,10 @@ describe("runEvaluator", () => {
           scores: Array.from({ length: 3 }, (_, i) =>
             makeTestScorer(`scorer_${i}`),
           ),
-          classifications: [],
         },
         new NoopProgressReporter(),
         [],
         undefined,
-        true,
       );
 
       expect(out.results.every((r) => Object.keys(r.scores).length === 0)).toBe(
@@ -206,13 +201,11 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
-              classifications: [],
               errorScoreHandler: defaultErrorScoreHandler,
             },
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -237,13 +230,11 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`, i === 0),
               ),
-              classifications: [],
               errorScoreHandler: defaultErrorScoreHandler,
             },
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -272,13 +263,11 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
-              classifications: [],
               errorScoreHandler: () => undefined,
             },
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -299,13 +288,11 @@ describe("runEvaluator", () => {
               scores: Array.from({ length: 3 }, (_, i) =>
                 makeTestScorer(`scorer_${i}`),
               ),
-              classifications: [],
               errorScoreHandler: () => ({ error_score: 1 }),
             },
             new NoopProgressReporter(),
             [],
             undefined,
-            true,
           );
 
           expect(
@@ -353,7 +340,6 @@ describe("runEvaluator", () => {
               return input * 2;
             },
             scores: [],
-            classifications: [],
             timeout: 10,
             maxConcurrency: 1,
           },
@@ -403,7 +389,6 @@ describe("runEvaluator", () => {
               return input * 2;
             },
             scores: [],
-            classifications: [],
             signal: abortController.signal,
             maxConcurrency: 1,
           },
@@ -441,7 +426,6 @@ describe("runEvaluator", () => {
             return input * 2;
           },
           scores: [],
-          classifications: [],
         },
         new NoopProgressReporter(),
         [],
@@ -469,7 +453,6 @@ test("trialIndex is passed to task", async () => {
         return input * 2;
       },
       scores: [],
-      classifications: [],
       trialCount: 3,
     },
     new NoopProgressReporter(),
@@ -489,7 +472,7 @@ test("trialIndex is passed to task", async () => {
   // All results should be correct
   results.forEach((result) => {
     expect(result.input).toBe(1);
-    expect(result.expected).toBe(2);
+    expect("expected" in result ? result.expected : undefined).toBe(2);
     expect(result.output).toBe(2);
     expect(result.error).toBeUndefined();
   });
@@ -512,7 +495,6 @@ test("trialIndex with multiple inputs", async () => {
         return input * 2;
       },
       scores: [],
-      classifications: [],
       trialCount: 2,
     },
     new NoopProgressReporter(),
@@ -559,7 +541,6 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn
         }),
         () => ({ name: "simple_scorer", score: 0.8 }),
       ],
-      classifications: [],
     },
     { noSendLogs: true, returnResults: true },
   );
@@ -589,9 +570,8 @@ test("Eval with noSendLogs: true runs locally without creating experiment", asyn
 
 test("Eval with returnResults: false produces empty results but valid summary", async () => {
   const result = await Eval(
-    "test-no-results",
+    "test-no-results-project",
     {
-      projectName: "test-no-results-project",
       data: [
         { input: "hello", expected: "hello world" },
         { input: "test", expected: "test world" },
@@ -606,7 +586,6 @@ test("Eval with returnResults: false produces empty results but valid summary",
         () => ({ name: "length_score", score: 0.75 }),
         () => ({ name: "quality_score", score: 0.9 }),
       ],
-      classifications: [],
     },
     { noSendLogs: true, returnResults: false },
   );
@@ -630,9 +609,8 @@ test("Eval with returnResults: false produces empty results but valid summary",
 
 test("Eval with returnResults: true collects all results", async () => {
   const result = await Eval(
-    "test-with-results",
+    "test-with-results-project",
     {
-      projectName: "test-with-results-project",
       data: [
         { input: "hello", expected: "hello world" },
         { input: "test", expected: "test world" },
@@ -644,7 +622,6 @@ test("Eval with returnResults: true collects all results", async () => {
           score: args.output === args.expected ? 1 : 0,
         }),
       ],
-      classifications: [],
     },
     { noSendLogs: true, returnResults: true },
   );
@@ -684,11 +661,10 @@ test("tags can be appended and logged to root span", async () => {
       evalName: "js-tags-append",
       data: [{ input: "hello", expected: "hello world", tags: initialTags }],
       task: (input, hooks) => {
-        for (const t of appendedTags) hooks.tags.push(t);
+        for (const t of appendedTags) hooks.tags!.push(t);
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
-      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -738,7 +714,6 @@ test.each([
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
-      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -779,7 +754,6 @@ test("tags are persisted with a failing scorer", async () => {
           throw new Error("test error");
         },
       ],
-      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -813,7 +787,6 @@ test("tags remain empty when not set", async () => {
         return input;
       },
       scores: [() => ({ name: "simple_scorer", score: 0.8 })],
-      classifications: [],
       summarizeScores: false,
     },
     new NoopProgressReporter(),
@@ -845,12 +818,11 @@ test("scorer spans have purpose='scorer' attribute", async () => {
       data: [{ input: "hello", expected: "hello" }],
       task: async (input: string) => input,
       scores: [
-        (args: { input: string; output: string; expected: string }) => ({
+        (args: { output: string; expected?: string }) => ({
           name: "simple_scorer",
           score: args.output === args.expected ? 1 : 0,
         }),
       ],
-      classifications: [],
     },
     new NoopProgressReporter(),
     [],
@@ -993,11 +965,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           metadata,
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1022,10 +995,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1048,11 +1019,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           environments: ["production"],
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1075,11 +1047,12 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
         {
           name: "test-prompt",
           slug: "test-prompt",
           environments: ["staging", "production"],
-        },
+          // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        } as any,
       );
 
       const mockProjectMap = {
@@ -1105,10 +1078,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1151,11 +1122,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-          tags,
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt", tags } as any,
       );
 
       const mockProjectMap = {
@@ -1180,10 +1148,8 @@ describe("framework2 metadata support", () => {
           options: { model: "gpt-4" },
         },
         [],
-        {
-          name: "test-prompt",
-          slug: "test-prompt",
-        },
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        { name: "test-prompt", slug: "test-prompt" } as any,
       );
 
       const mockProjectMap = {
@@ -1492,7 +1458,6 @@ test("Eval with enableCache: false does not use span cache", async () => {
       data: [{ input: 1, expected: 2 }],
       task: (input) => input * 2,
       scores: [],
-      classifications: [],
       state,
     },
     { noSendLogs: true, enableCache: false },
@@ -1518,7 +1483,6 @@ test("Eval with enableCache: true (default) uses span cache", async () => {
       data: [{ input: 1, expected: 2 }],
       task: (input) => input * 2,
       scores: [],
-      classifications: [],
       state,
     },
     { noSendLogs: true }, // enableCache defaults to true
@@ -1527,3 +1491,131 @@ test("Eval with enableCache: true (default) uses span cache", async () => {
   expect(startSpy).toHaveBeenCalled();
   expect(stopSpy).toHaveBeenCalled();
 });
+
+test("classifier-only evaluator populates classifications field", async () => {
+  const result = await Eval(
+    "test-classifier-only",
+    {
+      data: [{ input: "hello", expected: "greeting" }],
+      task: (input) => input,
+      scores: [],
+      classifications: [
+        () => ({
+          name: "category",
+          id: "greeting",
+          label: "Greeting",
+          confidence: 0.91,
+          metadata: { source: "unit-test" },
+        }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  const r = result.results[0];
+  expect(r.classifications?.category).toEqual([
+    {
+      id: "greeting",
+      label: "Greeting",
+      confidence: 0.91,
+      metadata: { source: "unit-test" },
+    },
+  ]);
+});
+
+test("scorer-only evaluator populates scores field", async () => {
+  const result = await Eval(
+    "test-scorer-only",
+    {
+      data: [{ input: "hello", expected: "hello" }],
+      task: (input) => input,
+      scores: [
+        (args) => ({
+          name: "exact_match",
+          score: args.output === args.expected ? 1 : 0,
+        }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].scores.exact_match).toBe(1);
+  expect(result.results[0].classifications).toBeUndefined();
+});
+
+test("multiple classifiers returning the same name append items correctly", async () => {
+  const result = await Eval(
+    "test-classifier-append",
+    {
+      data: [{ input: "hello" }],
+      task: (input) => input,
+      scores: [],
+      classifications: [
+        () => [
+          { name: "category", id: "greeting", label: "Greeting" },
+          { name: "category", id: "informal", label: "Informal" },
+        ],
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].classifications?.category).toHaveLength(2);
+  expect(result.results[0].classifications?.category[0]).toEqual({
+    id: "greeting",
+    label: "Greeting",
+  });
+  expect(result.results[0].classifications?.category[1]).toEqual({
+    id: "informal",
+    label: "Informal",
+  });
+});
+
+test("mixed evaluator populates both scores and classifications", async () => {
+  const result = await Eval(
+    "test-score-and-classify",
+    {
+      data: [{ input: "hello", expected: "hello" }],
+      task: (input) => input,
+      scores: [
+        (args) => ({
+          name: "exact_match",
+          score: args.output === args.expected ? 1 : 0,
+        }),
+      ],
+      classifications: [
+        () => ({ name: "category", id: "greeting", label: "Greeting" }),
+      ],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect(result.results[0].scores.exact_match).toBe(1);
+  expect(result.results[0].classifications?.category).toEqual([
+    { id: "greeting", label: "Greeting" },
+  ]);
+});
+
+test("malformed classifier output fails clearly", async () => {
+  const result = await Eval(
+    "test-invalid-classifier-output",
+    {
+      data: [{ input: "hello" }],
+      task: (input) => input,
+      scores: [],
+      classifications: [() => ({}) as never],
+    },
+    { noSendLogs: true, returnResults: true },
+  );
+
+  expect(result.results).toHaveLength(1);
+  expect((result.results[0] as any).metadata?.classifier_errors).toMatchObject({
+    classifier_0: expect.stringMatching(
+      /must return classifications with a non-empty string name/,
+    ),
+  });
+});
diff --git a/js/src/framework.ts b/js/src/framework.ts
index ff1851783..c57fc521c 100644
--- a/js/src/framework.ts
+++ b/js/src/framework.ts
@@ -2,6 +2,7 @@ import {
   makeScorerPropagatedEvent,
   mergeDicts,
   Classification,
+  ClassificationItem,
   Score,
   SpanComponentsV3,
   SpanTypeAttribute,
@@ -205,9 +206,10 @@ export type EvalResult<
   Metadata extends BaseMetadata = DefaultMetadataType,
 > = EvalCase<Input, Expected, Metadata> & {
   output: Output;
-  scores: Record<string, number | null>;
   error: unknown;
   origin?: ObjectReference;
+  scores: Record<string, number | null>;
+  classifications?: Record<string, ClassificationItem[]>;
 };
 
 type ErrorScoreHandler = (args: {
@@ -241,10 +243,9 @@ export interface Evaluator<
 
   /**
    * A set of functions that take an input, output, and expected value and return a
-   * classification. Each function must return a {@link Classification} with a required
-   * `classification` field. Results are recorded under the `classifications` column.
+   * {@link Classification}. Results are recorded under the `classifications` column.
    */
-  classifications: EvalClassifier<Input, Output, Expected, Metadata>[];
+  classifications?: EvalClassifier<Input, Output, Expected, Metadata>[];
 
   /**
    * A set of parameters that will be passed to the evaluator.
@@ -883,7 +884,7 @@ export function scorerName(
   return scorer.name || `scorer_${scorer_idx}`;
 }
 
-export function classifierName(
+function classifierName(
   classifier: EvalClassifier<any, any, any, any>,
   classifier_idx: number,
 ) {
@@ -956,6 +957,37 @@ function collectScoringResults<T extends { name: string }>(
   return failing;
 }
 
+function validateClassificationResult(
+  value: unknown,
+  scorerName: string,
+): Classification {
+  if (!(typeof value === "object" && value !== null && !isEmpty(value))) {
+    throw new Error(
+      `When returning structured classifier results, each classification must be a non-empty object. Got: ${JSON.stringify(value)}`,
+    );
+  }
+  if (!("name" in value) || typeof value.name !== "string" || !value.name) {
+    throw new Error(
+      `Classifier ${scorerName} must return classifications with a non-empty string name. Got: ${JSON.stringify(value)}`,
+    );
+  }
+  if (!("id" in value) || typeof value.id !== "string" || !value.id) {
+    throw new Error(
+      `Classifier ${scorerName} must return classifications with a non-empty string id. Got: ${JSON.stringify(value)}`,
+    );
+  }
+  return value as Classification;
+}
+
+function toClassificationItem(c: Classification): ClassificationItem {
+  return {
+    id: c.id,
+    label: c.label ?? c.id,
+    ...(c.confidence !== undefined ? { confidence: c.confidence } : {}),
+    ...(c.metadata !== undefined ? { metadata: c.metadata } : {}),
+  };
+}
+
 function logScoringFailures(
   kind: string,
   failures: { name: string; error: unknown }[],
@@ -1204,8 +1236,11 @@ async function runEvaluatorInternal(
           let error: unknown | undefined = undefined;
           let tags: string[] = [...(datum.tags ?? [])];
           const scores: Record<string, number | null> = {};
+          const classifications: Record<string, ClassificationItem[]> = {};
           const scorerNames = evaluator.scores.map(scorerName);
-          const classifierNames = evaluator.classifications.map(classifierName);
+          const classifierNames = (evaluator.classifications ?? []).map(
+            classifierName,
+          );
           let unhandledScores: string[] | null = scorerNames;
           try {
             const meta = (o: Record<string, unknown>) =>
@@ -1336,7 +1371,7 @@ async function runEvaluatorInternal(
                 ),
               ),
               Promise.all(
-                evaluator.classifications.map((classifier, idx) =>
+                (evaluator.classifications ?? []).map((classifier, idx) =>
                   runInScorerSpan(
                     rootSpan,
                     classifierNames[idx],
@@ -1348,52 +1383,31 @@ async function runEvaluatorInternal(
                         classifier(scoringArgs),
                       );
                       if (classifierValue === null) return null;
-                      const rawResults = Array.isArray(classifierValue)
-                        ? classifierValue
-                        : [classifierValue];
-                      // Normalize: if the result is a raw classification value
-                      // (string or { id, label? }) rather than a full
-                      // Classification object, wrap it using the classifier name.
-                      const toClassification = (r: unknown): Classification => {
-                        if (
-                          r !== null &&
-                          typeof r === "object" &&
-                          "classification" in r &&
-                          "name" in r
-                        ) {
-                          return r as Classification;
-                        }
-                        return {
-                          name: classifierNames[idx],
-                          classification: r as
-                            | string
-                            | { id: string; label?: string },
-                        };
-                      };
-                      const results = rawResults.map(toClassification);
-                      const toIdLabel = (
-                        c: string | { id: string; label?: string } | undefined,
-                      ) =>
-                        c == null
-                          ? null
-                          : typeof c === "string"
-                            ? { id: c, label: c }
-                            : { id: c.id, label: c.label ?? c.id };
+                      const rawResults = (
+                        Array.isArray(classifierValue)
+                          ? classifierValue
+                          : [classifierValue]
+                      ).map((result) =>
+                        validateClassificationResult(
+                          result,
+                          classifierNames[idx],
+                        ),
+                      );
                       const resultOutput =
-                        results.length === 1
-                          ? toIdLabel(results[0].classification)
-                          : results.reduce(
+                        rawResults.length === 1
+                          ? toClassificationItem(rawResults[0])
+                          : rawResults.reduce(
                               (prev, r) =>
                                 mergeDicts(prev, {
-                                  [r.name]: toIdLabel(r.classification),
+                                  [r.name]: toClassificationItem(r),
                                 }),
                               {},
                             );
                       span.log({
                         output: resultOutput,
-                        metadata: buildSpanMetadata(results),
+                        metadata: buildSpanMetadata(rawResults),
                       });
-                      return results;
+                      return rawResults;
                     },
                   ),
                 ),
@@ -1408,29 +1422,20 @@ async function runEvaluatorInternal(
               },
             );
 
-            const classifications: Record<
-              string,
-              { id: string; label: string }[]
-            > = {};
             const failingClassifiers = collectScoringResults(
               classificationResults,
               classifierNames,
               (result) => {
-                const c = result.classification;
-                if (typeof c === "string") {
-                  classifications[result.name] = [{ id: c, label: c }];
-                } else {
-                  classifications[result.name] = [
-                    { id: c.id, label: c.label ?? c.id },
-                  ];
+                const item = toClassificationItem(result);
+                if (!classifications[result.name]) {
+                  classifications[result.name] = [];
                 }
+                classifications[result.name].push(item);
               },
             );
 
             if (Object.keys(classifications).length > 0) {
-              rootSpan.log({ classifications } as Parameters<
-                typeof rootSpan.log
-              >[0]);
+              rootSpan.log({ classifications });
             }
 
             const failedScorerNames = logScoringFailures(
@@ -1473,15 +1478,21 @@ async function runEvaluatorInternal(
           }
 
           if (collectResults) {
-            collectedResults.push({
+            const baseResult = {
               input: datum.input,
               ...("expected" in datum ? { expected: datum.expected } : {}),
               output,
               tags: tags.length ? tags : undefined,
               metadata,
-              scores: mergedScores,
               error,
               origin: baseEvent.event?.origin,
+            };
+            collectedResults.push({
+              ...baseResult,
+              scores: mergedScores,
+              ...(Object.keys(classifications).length > 0
+                ? { classifications }
+                : {}),
             });
           }
         };
diff --git a/js/util/index.ts b/js/util/index.ts
index 9746567bc..52b082cc1 100644
--- a/js/util/index.ts
+++ b/js/util/index.ts
@@ -55,7 +55,13 @@ export {
   ensureNewDatasetRecord,
 } from "./object";
 
-export type { Classification, Score, Scorer, ScorerArgs } from "./score";
+export type {
+  Classification,
+  ClassificationItem,
+  Score,
+  Scorer,
+  ScorerArgs,
+} from "./score";
 
 export { constructJsonArray, deterministicReplacer } from "./json_util";
 
diff --git a/js/util/object.ts b/js/util/object.ts
index 735f52960..fea8735d6 100644
--- a/js/util/object.ts
+++ b/js/util/object.ts
@@ -21,6 +21,7 @@ export type OtherExperimentLogFields = {
   error: unknown;
   tags: string[];
   scores: Record<string, number | null>;
+  classifications?: Record<string, { id: string; label: string }[]>;
   metadata: Record<string, unknown>;
   metrics: Record<string, unknown>;
   datasetRecordId: string;
diff --git a/js/util/score.ts b/js/util/score.ts
index b8ce404d2..c02365dde 100644
--- a/js/util/score.ts
+++ b/js/util/score.ts
@@ -1,14 +1,22 @@
 /**
- * The result returned by a classifier function. Unlike `Score`, `classification`
- * is required and the span will be recorded as a classifier span.
+ * The result returned by a classifier function. Unlike a `Score`, it must
+ * include an `id`, and its span is recorded as a classifier span.
  */
 export interface Classification {
   name: string;
-  /**
-   * The classification value: either a plain string label, or an object with a
-   * stable `id` and an optional human-readable `label` (defaults to `id`).
-   */
-  classification: string | { id: string; label?: string };
+  id: string;
+  label?: string;
+  confidence?: number | null;
+  metadata?: Record<string, unknown>;
+}
+
+/**
+ * The serialized form of a classification stored in the `classifications` log record.
+ */
+export interface ClassificationItem {
+  id: string;
+  label: string;
+  confidence?: number | null;
   metadata?: Record<string, unknown>;
 }
 

From 0eaf765a96dbe4b2af28d6eb3bbed1bdee10033b Mon Sep 17 00:00:00 2001
From: Aswin Karumbunathan <aswin@braintrustdata.com>
Date: Tue, 17 Mar 2026 09:56:32 -0700
Subject: [PATCH 4/5] Updates to match
 https://github.com/braintrustdata/braintrust-spec/pull/2/changes

---
 js/dev/server.ts          |  4 +--
 js/src/framework.test.ts  | 11 +++----
 js/src/framework.ts       | 65 ++++++++++++++++++++++++++++-----------
 js/src/parameters.test.ts | 14 ++++-----
 4 files changed, 60 insertions(+), 34 deletions(-)

diff --git a/js/dev/server.ts b/js/dev/server.ts
index aee357ad5..4e4daaf4d 100644
--- a/js/dev/server.ts
+++ b/js/dev/server.ts
@@ -117,7 +117,7 @@ export function runDevServer(
 
         evalDefs[name] = {
           parameters,
-          scores: evaluator.scores.map((score, idx) => ({
+          scores: (evaluator.scores ?? []).map((score, idx) => ({
             name: scorerName(score, idx),
           })),
         };
@@ -209,7 +209,7 @@ export function runDevServer(
           {
             ...evaluator,
             data: evalData.data,
-            scores: evaluator.scores.concat(
+            scores: (evaluator.scores ?? []).concat(
               scores?.map((score) =>
                 makeScorer(
                   state,
diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts
index de57c2a3c..ddda9d267 100644
--- a/js/src/framework.test.ts
+++ b/js/src/framework.test.ts
@@ -1498,8 +1498,7 @@ test("classifier-only evaluator populates classifications field", async () => {
     {
       data: [{ input: "hello", expected: "greeting" }],
       task: (input) => input,
-      scores: [],
-      classifications: [
+      classifiers: [
         () => ({
           name: "category",
           id: "greeting",
@@ -1551,8 +1550,7 @@ test("multiple classifiers returning the same name append items correctly", asyn
     {
       data: [{ input: "hello" }],
       task: (input) => input,
-      scores: [],
-      classifications: [
+      classifiers: [
         () => [
           { name: "category", id: "greeting", label: "Greeting" },
           { name: "category", id: "informal", label: "Informal" },
@@ -1586,7 +1584,7 @@ test("mixed evaluator populates both scores and classifications", async () => {
           score: args.output === args.expected ? 1 : 0,
         }),
       ],
-      classifications: [
+      classifiers: [
         () => ({ name: "category", id: "greeting", label: "Greeting" }),
       ],
     },
@@ -1606,8 +1604,7 @@ test("malformed classifier output fails clearly", async () => {
     {
       data: [{ input: "hello" }],
       task: (input) => input,
-      scores: [],
-      classifications: [() => ({}) as never],
+      classifiers: [() => ({}) as never],
     },
     { noSendLogs: true, returnResults: true },
   );
diff --git a/js/src/framework.ts b/js/src/framework.ts
index c57fc521c..62e4e33ed 100644
--- a/js/src/framework.ts
+++ b/js/src/framework.ts
@@ -219,13 +219,13 @@ type ErrorScoreHandler = (args: {
   unhandledScores: string[];
 }) => Record<string, number> | undefined | void;
 
-export interface Evaluator<
+type EvaluatorBase<
   Input,
   Output,
   Expected,
   Metadata extends BaseMetadata = DefaultMetadataType,
   Parameters extends EvalParameters = EvalParameters,
-> {
+> = {
   /**
    * A function that returns a list of inputs, expected outputs, and metadata.
    */
@@ -236,17 +236,6 @@ export interface Evaluator<
    */
   task: EvalTask<Input, Output, Expected, Metadata, Parameters>;
 
-  /**
-   * A set of functions that take an input, output, and expected value and return a {@link Score}.
-   */
-  scores: EvalScorer<Input, Output, Expected, Metadata>[];
-
-  /**
-   * A set of functions that take an input, output, and expected value and return a
-   * {@link Classification}. Results are recorded under the `classifications` column.
-   */
-  classifications?: EvalClassifier<Input, Output, Expected, Metadata>[];
-
   /**
    * A set of parameters that will be passed to the evaluator.
    * Can be:
@@ -364,7 +353,42 @@ export interface Evaluator<
    * Flushes spans before calling scoring functions
    */
   flushBeforeScoring?: boolean;
-}
+};
+
+/**
+ * Defines an evaluator. At least one of `scores` or `classifiers` must be provided.
+ */
+export type Evaluator<
+  Input,
+  Output,
+  Expected,
+  Metadata extends BaseMetadata = DefaultMetadataType,
+  Parameters extends EvalParameters = EvalParameters,
+> = EvaluatorBase<Input, Output, Expected, Metadata, Parameters> &
+  (
+    | {
+        /**
+         * A set of functions that take an input, output, and expected value and return a {@link Score}.
+         */
+        scores: EvalScorer<Input, Output, Expected, Metadata>[];
+        /**
+         * A set of functions that take an input, output, and expected value and return a
+         * {@link Classification}. Results are recorded under the `classifications` column.
+         */
+        classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
+      }
+    | {
+        /**
+         * A set of functions that take an input, output, and expected value and return a {@link Score}.
+         */
+        scores?: EvalScorer<Input, Output, Expected, Metadata>[];
+        /**
+         * A set of functions that take an input, output, and expected value and return a
+         * {@link Classification}. Results are recorded under the `classifications` column.
+         */
+        classifiers: EvalClassifier<Input, Output, Expected, Metadata>[];
+      }
+  );
 
 export class EvalResultWithSummary<
   Input,
@@ -1023,6 +1047,11 @@ export async function runEvaluator(
   enableCache = true,
   // eslint-disable-next-line @typescript-eslint/no-explicit-any
 ): Promise<EvalResultWithSummary<any, any, any, any>> {
+  if (!evaluator.scores && !evaluator.classifiers) {
+    throw new Error(
+      "Evaluator must include at least one of `scores` or `classifiers`",
+    );
+  }
   return await runEvaluatorInternal(
     experiment,
     evaluator,
@@ -1237,8 +1266,8 @@ async function runEvaluatorInternal(
           let tags: string[] = [...(datum.tags ?? [])];
           const scores: Record<string, number | null> = {};
           const classifications: Record<string, ClassificationItem[]> = {};
-          const scorerNames = evaluator.scores.map(scorerName);
-          const classifierNames = (evaluator.classifications ?? []).map(
+          const scorerNames = (evaluator.scores ?? []).map(scorerName);
+          const classifierNames = (evaluator.classifiers ?? []).map(
             classifierName,
           );
           let unhandledScores: string[] | null = scorerNames;
@@ -1317,7 +1346,7 @@ async function runEvaluatorInternal(
 
             const [scoreResults, classificationResults] = await Promise.all([
               Promise.all(
-                evaluator.scores.map((score, score_idx) =>
+                (evaluator.scores ?? []).map((score, score_idx) =>
                   runInScorerSpan(
                     rootSpan,
                     scorerNames[score_idx],
@@ -1371,7 +1400,7 @@ async function runEvaluatorInternal(
                 ),
               ),
               Promise.all(
-                (evaluator.classifications ?? []).map((classifier, idx) =>
+                (evaluator.classifiers ?? []).map((classifier, idx) =>
                   runInScorerSpan(
                     rootSpan,
                     classifierNames[idx],
diff --git a/js/src/parameters.test.ts b/js/src/parameters.test.ts
index 99d29950d..d5b7b7e4b 100644
--- a/js/src/parameters.test.ts
+++ b/js/src/parameters.test.ts
@@ -26,7 +26,7 @@ test("parameters are passed to task", async () => {
         return output;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         prefix: z.string().default("start:"),
         suffix: z.string().default(":end"),
@@ -60,7 +60,7 @@ test("prompt parameter is passed correctly", async () => {
         return input;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         main: {
           type: "prompt",
@@ -101,7 +101,7 @@ test("custom parameter values override defaults", async () => {
         return output;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         prefix: z.string().default("start:"),
         suffix: z.string().default(":end"),
@@ -134,7 +134,7 @@ test("array parameter is handled correctly", async () => {
         return input;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         items: z.array(z.string()).default(["item1", "item2"]),
       },
@@ -165,7 +165,7 @@ test("object parameter is handled correctly", async () => {
         return input;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         config: z
           .object({
@@ -201,7 +201,7 @@ test("model parameter defaults to configured value", async () => {
         return input;
       },
       scores: [],
-      classifications: [],
+      classifiers: [],
       parameters: {
         model: {
           type: "model",
@@ -230,7 +230,7 @@ test("model parameter is required when default is missing", async () => {
         data: [{ input: "test" }],
         task: async (input: string) => input,
         scores: [],
-        classifications: [],
+        classifiers: [],
         parameters: {
           model: {
             type: "model",

From 1421ef71cfd68ce2d9adcc03f58943cb92890483 Mon Sep 17 00:00:00 2001
From: Aswin Karumbunathan <aswin@braintrustdata.com>
Date: Tue, 17 Mar 2026 10:58:05 -0700
Subject: [PATCH 5/5] remove confidence, fix CI

We're not using confidence yet, so leave it out for now; we can always add it back in later.
---
 js/src/cli/functions/infer-source.ts |  2 +-
 js/src/cli/functions/upload.ts       | 38 +++++++++---------
 js/src/framework.test.ts             |  2 -
 js/src/framework.ts                  | 59 ++++++++++------------------
 js/util/score.ts                     |  2 -
 5 files changed, 41 insertions(+), 62 deletions(-)

diff --git a/js/src/cli/functions/infer-source.ts b/js/src/cli/functions/infer-source.ts
index 179b5c50d..0a759b422 100644
--- a/js/src/cli/functions/infer-source.ts
+++ b/js/src/cli/functions/infer-source.ts
@@ -85,7 +85,7 @@ export async function findCodeDefinition({
       fn =
         location.position.type === "task"
           ? evaluator.task
-          : evaluator.scores[location.position.index];
+          : (evaluator.scores ?? [])[location.position.index];
     }
   } else if (location.type === "function") {
     fn = outFileModule.functions[location.index].handler;
diff --git a/js/src/cli/functions/upload.ts b/js/src/cli/functions/upload.ts
index ce67f44d9..1745b5329 100644
--- a/js/src/cli/functions/upload.ts
+++ b/js/src/cli/functions/upload.ts
@@ -180,23 +180,25 @@ export async function uploadHandleBundles({
           function_type: "task",
           origin,
         },
-        ...evaluator.evaluator.scores.map((score, i): BundledFunctionSpec => {
-          const name = scorerName(score, i);
-          return {
-            ...baseInfo,
-            // There is a very small chance that someone names a function with the same convention, but
-            // let's assume it's low enough that it doesn't matter.
-            ...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
-            description: `Score ${name} for eval ${namePrefix}`,
-            location: {
-              type: "experiment",
-              eval_name: evaluator.evaluator.evalName,
-              position: { type: "scorer", index: i },
-            },
-            function_type: "scorer",
-            origin,
-          };
-        }),
+        ...(evaluator.evaluator.scores ?? []).map(
+          (score, i): BundledFunctionSpec => {
+            const name = scorerName(score, i);
+            return {
+              ...baseInfo,
+              // There is a very small chance that someone names a function with the same convention, but
+              // let's assume it's low enough that it doesn't matter.
+              ...formatNameAndSlug(["eval", namePrefix, "scorer", name]),
+              description: `Score ${name} for eval ${namePrefix}`,
+              location: {
+                type: "experiment",
+                eval_name: evaluator.evaluator.evalName,
+                position: { type: "scorer", index: i },
+              },
+              function_type: "scorer",
+              origin,
+            };
+          },
+        ),
       ];
 
       bundleSpecs.push(...fileSpecs);
@@ -219,7 +221,7 @@ export async function uploadHandleBundles({
                   serializeRemoteEvalParametersContainer(resolvedParameters),
               }
             : {}),
-          scores: evaluator.evaluator.scores.map((score, i) => ({
+          scores: (evaluator.evaluator.scores ?? []).map((score, i) => ({
             name: scorerName(score, i),
           })),
         };
diff --git a/js/src/framework.test.ts b/js/src/framework.test.ts
index ddda9d267..45989fcdf 100644
--- a/js/src/framework.test.ts
+++ b/js/src/framework.test.ts
@@ -1503,7 +1503,6 @@ test("classifier-only evaluator populates classifications field", async () => {
           name: "category",
           id: "greeting",
           label: "Greeting",
-          confidence: 0.91,
           metadata: { source: "unit-test" },
         }),
       ],
@@ -1517,7 +1516,6 @@ test("classifier-only evaluator populates classifications field", async () => {
     {
       id: "greeting",
       label: "Greeting",
-      confidence: 0.91,
       metadata: { source: "unit-test" },
     },
   ]);
diff --git a/js/src/framework.ts b/js/src/framework.ts
index 62e4e33ed..c2bad700d 100644
--- a/js/src/framework.ts
+++ b/js/src/framework.ts
@@ -219,13 +219,17 @@ type ErrorScoreHandler = (args: {
   unhandledScores: string[];
 }) => Record<string, number> | undefined | void;
 
-type EvaluatorBase<
+/**
+ * Defines an evaluator. At least one of `scores` or `classifiers` must be provided;
+ * a runtime error is raised if neither is present.
+ */
+export interface Evaluator<
   Input,
   Output,
   Expected,
   Metadata extends BaseMetadata = DefaultMetadataType,
   Parameters extends EvalParameters = EvalParameters,
-> = {
+> {
   /**
    * A function that returns a list of inputs, expected outputs, and metadata.
    */
@@ -236,6 +240,19 @@ type EvaluatorBase<
    */
   task: EvalTask<Input, Output, Expected, Metadata, Parameters>;
 
+  /**
+   * A set of functions that take an input, output, and expected value and return a {@link Score}.
+   * At least one of `scores` or `classifiers` must be provided.
+   */
+  scores?: EvalScorer<Input, Output, Expected, Metadata>[];
+
+  /**
+   * A set of functions that take an input, output, and expected value and return a
+   * {@link Classification}. Results are recorded under the `classifications` column.
+   * At least one of `scores` or `classifiers` must be provided.
+   */
+  classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
+
   /**
    * A set of parameters that will be passed to the evaluator.
    * Can be:
@@ -353,42 +370,7 @@ type EvaluatorBase<
    * Flushes spans before calling scoring functions
    */
   flushBeforeScoring?: boolean;
-};
-
-/**
- * Defines an evaluator. At least one of `scores` or `classifiers` must be provided.
- */
-export type Evaluator<
-  Input,
-  Output,
-  Expected,
-  Metadata extends BaseMetadata = DefaultMetadataType,
-  Parameters extends EvalParameters = EvalParameters,
-> = EvaluatorBase<Input, Output, Expected, Metadata, Parameters> &
-  (
-    | {
-        /**
-         * A set of functions that take an input, output, and expected value and return a {@link Score}.
-         */
-        scores: EvalScorer<Input, Output, Expected, Metadata>[];
-        /**
-         * A set of functions that take an input, output, and expected value and return a
-         * {@link Classification}. Results are recorded under the `classifications` column.
-         */
-        classifiers?: EvalClassifier<Input, Output, Expected, Metadata>[];
-      }
-    | {
-        /**
-         * A set of functions that take an input, output, and expected value and return a {@link Score}.
-         */
-        scores?: EvalScorer<Input, Output, Expected, Metadata>[];
-        /**
-         * A set of functions that take an input, output, and expected value and return a
-         * {@link Classification}. Results are recorded under the `classifications` column.
-         */
-        classifiers: EvalClassifier<Input, Output, Expected, Metadata>[];
-      }
-  );
+}
 
 export class EvalResultWithSummary<
   Input,
@@ -1007,7 +989,6 @@ function toClassificationItem(c: Classification): ClassificationItem {
   return {
     id: c.id,
     label: c.label ?? c.id,
-    ...(c.confidence !== undefined ? { confidence: c.confidence } : {}),
     ...(c.metadata !== undefined ? { metadata: c.metadata } : {}),
   };
 }
diff --git a/js/util/score.ts b/js/util/score.ts
index c02365dde..08daebeef 100644
--- a/js/util/score.ts
+++ b/js/util/score.ts
@@ -6,7 +6,6 @@ export interface Classification {
   name: string;
   id: string;
   label?: string;
-  confidence?: number | null;
   metadata?: Record<string, unknown>;
 }
 
@@ -16,7 +15,6 @@ export interface Classification {
 export interface ClassificationItem {
   id: string;
   label: string;
-  confidence?: number | null;
   metadata?: Record<string, unknown>;
 }