diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
new file mode 100644
index 000000000..aef7237ce
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -0,0 +1,435 @@
+/**
+ * EvalGen 2.0
+ *
+ * Ian Arawjo, Shreya Shankar, J.D. Zamfirescu, Helen Weixu Chen
+ *
+ * This file and its directory concern the front-end of the evaluation generator, EvalGen.
+ * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
+ *
+ * Specifically, the modal lets users:
+ *  - make and refine criteria to grade on (on the left)
+ *  - grade responses (on the right)
+ *  - while in the backend, an LLM is generating candidate assertions and selecting the ones most aligned with user grades
+ * As the user grades responses, they add/refine existing criteria.
+ * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
+ * This is because of **criteria drift,** a phenomenon identified while observing users in EvalGen 1.0 (unreleased).
+ *
+ * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
+ * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
+ */
+
+import React, { useCallback, useEffect, useMemo, useState } from "react";
+import {
+  EvalCriteria,
+  EvalFunctionSetReport,
+  EvalGenReport,
+} from "../backend/evalgen/typing";
+import { Dict, LLMResponse, RatingDict } from "../backend/typing";
+import useStore from "../store";
+import { escapeBraces } from "../backend/template";
+import StorageCache, { StringLookup } from "../backend/cache";
+import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
+import { Button, Flex, Modal, Stepper } from "@mantine/core";
+import WelcomeStep from "./WelcomeStep";
+import FeedbackStep from "./FeedbackStep";
+import PickCriteriaStep from "./PickCriteriaStep";
+import ReportCardStep from "./ReportCardStep";
+import GradingResponsesStep from "./GradeResponsesStep";
+import {
+  batchResponsesByUID,
+  deepcopy,
+  sampleRandomElements,
+} from "../backend/utils";
+import { getRatingKeyForResponse } from "../ResponseRatingToolbar";
+import EvaluationFunctionExecutor from "../backend/evalgen/executor";
+import { getAIFeaturesModels } from "../backend/ai";
+
+// Main wizard component props
+interface EvalGenWizardProps {
+  opened: boolean; // Forwarded to the Modal's `opened` prop
+  onClose: () => void; // Forwarded to the Modal's `onClose`; also invoked right after `onComplete`
+  onComplete: (result: EvalFunctionSetReport) => void; // Receives the report handed back by the report card step
+  responses: LLMResponse[]; // The LLM responses to operate over
+}
+
+/** Five-step modal wizard (welcome, feedback, criteria, grading, report card) for EvalGen.
+ *  Owns the criteria, per-criteria grades, and the executor that the steps share. */
+const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
+  opened, // Whether the modal is shown
+  onClose, // Called when the modal closes, and right after onComplete
+  onComplete, // Receives the final report from the report card step
+  responses, // The LLM responses to operate over
+}) => {
+  // The active screen (stage) of EvalGen
+  const [active, setActive] = useState(0);
+
+  // From global state
+  const apiKeys = useStore((state) => state.apiKeys);
+  const genAIFeaturesProvider = useStore((state) => state.aiFeaturesProvider);
+  const genAIModelNames = useMemo(() => {
+    const models = getAIFeaturesModels(genAIFeaturesProvider);
+    return {
+      large: models.large,
+      small: models.small,
+    };
+  }, [genAIFeaturesProvider]);
+
+  // Regroup input responses by batch UID, whenever `responses` changes
+  const batchedResponses = useMemo(
+    () => (responses ? batchResponsesByUID(responses) : []),
+    [responses],
+  );
+
+  // For updating the global human ratings state
+  const setState = useStore((store) => store.setState);
+  const updateGlobalRating = useCallback(
+    (uid: string, label: string, payload: RatingDict) => {
+      const key = getRatingKeyForResponse(uid, label);
+      const safe_payload = deepcopy(payload);
+      setState(key, safe_payload);
+      StorageCache.store(key, safe_payload);
+    },
+    [setState],
+  );
+
+  // Criteria the user defines across the stages
+  const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+  const [onNextCallback, setOnNextCallback] = useState(() => () => {});
+
+  // Per-criteria grades (indexed by uid of response, then uid of criteria)
+  const [perCriteriaGrades, setPerCriteriaGrades] = useState<
+    Dict<Dict<boolean | undefined>>
+  >({});
+  const setPerCriteriaGrade = (
+    responseUID: string,
+    criteriaUID: string,
+    newGrade: boolean | undefined,
+  ) => {
+    setPerCriteriaGrades((grades) => {
+      // Build a fresh per-response dict instead of mutating the previous state in place.
+      const updated = { ...grades[responseUID], [criteriaUID]: newGrade };
+      updateGlobalRating(responseUID, "perCriteriaGrades", updated);
+
+      // If the EvalGen executor is running, update the per-criteria grade for this sample:
+      executor?.setGradeForExample(responseUID, updated);
+
+      return { ...grades, [responseUID]: updated };
+    });
+  };
+  const numResponsesGraded = useMemo(() => {
+    let count = 0;
+    for (const uid in perCriteriaGrades) {
+      const gs = perCriteriaGrades[uid];
+      if (Object.values(gs).some((v) => v !== undefined && v !== null))
+        count += 1;
+    }
+    return count;
+  }, [perCriteriaGrades]);
+  const minNumToGrade = useMemo(() => {
+    return Math.min(10, Math.ceil(batchedResponses.length * 0.5));
+  }, [batchedResponses]);
+  const minNumToGradeToStartExecutor = useMemo(() => {
+    return Math.min(5, Math.ceil(batchedResponses.length * 0.25));
+  }, [batchedResponses]);
+
+  // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
+  // :: Used on screen 4 (when `active` === 3).
+  const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
+    null,
+  );
+  const [evalGenReport, setEvalGenReport] =
+    useState<EvalFunctionSetReport | null>(null);
+
+  // Logs and state from the EvalGen backend
+  const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
+  const [numCallsMade, setNumCallsMade] = useState({ strong: 0, weak: 0 });
+  const [execProgress, setExecProgress] = useState(0);
+
+  // The samples to pass the executor / grading responses features. This will be bounded
+  // by maxNumSamplesForExecutor, instead of the whole dataset.
+  const samplesForExecutor = useMemo(() => {
+    // The max number of samples (responses) to pass the executor. This controls how many requests will
+    // need to be sent off and how many evaluation function executions are performed.
+    // TODO: Give the user some control over this.
+    const maxNumSamplesForExecutor = 16;
+
+    // Sample from the batched responses (the list measured here and returned by the else-branch), if needed:
+    if (batchedResponses.length > maxNumSamplesForExecutor)
+      return sampleRandomElements(batchedResponses, maxNumSamplesForExecutor);
+    else return batchedResponses.slice();
+  }, [batchedResponses]);
+
+  // When the user is done per-criteria grading
+  const handleDonePerCriteriaGrading = useCallback(async () => {
+    // Await completion of all gen + execution of eval funcs
+    await executor?.waitForCompletion();
+
+    // Filtering eval funcs by grades and present results
+    const filteredFunctions =
+      (await executor?.filterEvaluationFunctions(0.25)) ?? null;
+    console.log("Filtered Functions: ", filteredFunctions);
+
+    // TODO: Return selected implementations to caller
+    console.warn(filteredFunctions);
+
+    setActive(4); // Move to the report card step
+    setEvalGenReport(filteredFunctions);
+  }, [executor]);
+
+  // Update executor whenever resps, grades, or criteria change
+  useEffect(() => {
+    if (
+      criteria.length === 0 ||
+      numResponsesGraded < minNumToGradeToStartExecutor
+    )
+      return;
+    if (!executor) {
+      const addLog = (message: string) => {
+        setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
+      };
+
+      const ex = new EvaluationFunctionExecutor(
+        genAIModelNames,
+        apiKeys,
+        getLikelyPromptTemplateAsContext(samplesForExecutor) ?? "",
+        samplesForExecutor,
+        criteria,
+        (strong, weak) => {
+          // Callback to update GPT call counts
+          setNumCallsMade((n_calls) => ({
+            // Return a new object: mutating the previous state double-counts when the updater reruns.
+            strong: n_calls.strong + strong,
+            weak: n_calls.weak + weak,
+          }));
+        },
+        addLog,
+        undefined, // don't pass any holistic grades at this stage
+        perCriteriaGrades,
+      );
+      setExecutor(ex);
+
+      // Start executor process
+      ex.start((progress) => {
+        setExecProgress(progress?.success ?? 0);
+      });
+    } else {
+      // Update criteria in executor
+      executor.updateCriteria(criteria);
+    }
+  }, [
+    criteria,
+    samplesForExecutor,
+    numResponsesGraded,
+    minNumToGradeToStartExecutor,
+  ]);
+
+  const handleNext = useCallback(() => {
+    setActive((current) => Math.min(4, current + 1));
+  }, []);
+
+  const handlePrevious = useCallback(() => {
+    setActive((current) => Math.max(0, current - 1));
+  }, []);
+
+  const handleComplete = (evalFuncReport: EvalFunctionSetReport) => {
+    // Return final data to the caller
+    onComplete(evalFuncReport);
+    onClose();
+  };
+
+  const getLikelyPromptTemplateAsContext = (resps: LLMResponse[]) => {
+    // Attempt to infer the prompt template used to generate the responses:
+    const prompts = new Set<string>();
+    for (const resp_obj of resps) {
+      const pt = resp_obj?.metavars?.__pt;
+      if (pt !== undefined) {
+        prompts.add(StringLookup.get(pt) as string);
+      }
+    }
+
+    if (prompts.size === 0) return null;
+
+    // Pick a prompt template at random to serve as context....
+    return escapeBraces(prompts.values().next().value ?? "");
+  };
+
+  const exportGradesAndNotes = useStore((store) => store.exportGradesAndNotes);
+  async function genCriteriaFromContext(responses: LLMResponse[]) {
+    // Get the context from the input responses
+    const inputPromptTemplate =
+      getLikelyPromptTemplateAsContext(batchedResponses);
+
+    if (inputPromptTemplate === null) {
+      console.error("No context found. Cannot proceed.");
+      return;
+    }
+
+    // Get the user feedback on the responses, if any, from the global state
+    const feedback = exportGradesAndNotes(responses);
+
+    // Attempt to generate criteria using an LLM
+    return await generateLLMEvaluationCriteria(
+      inputPromptTemplate,
+      genAIModelNames.large,
+      apiKeys,
+      undefined,
+      undefined,
+      feedback,
+    );
+  }
+
+  return (
+    <Modal
+      opened={opened}
+      onClose={onClose}
+      // title="EvalGen Wizard"
+      size="95%"
+      padding="md"
+      // keepMounted
+      // closeOnClickOutside={true}
+      styles={{
+        inner: {
+          padding: "5%", // This creates space around the modal (10% total)
+        },
+        header: {
+          padding: "0px",
+          backgroundColor: "transparent",
+          // borderBottom: "1px solid black",
+        },
+        content: {
+          height: "100%", // Fill the available space
+          maxHeight: "90vh", // Limit to 90% of viewport height
+          display: "flex",
+          flexDirection: "column",
+        },
+        body: {
+          flex: 1, // This makes the body expand to fill available space
+          overflow: "auto", // Add scrolling if content is too tall
+        },
+      }}
+    >
+      {active === 0 && <WelcomeStep setOnNextCallback={setOnNextCallback} />}
+
+      {active === 1 && (
+        <FeedbackStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          responses={batchedResponses}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 2 && (
+        <PickCriteriaStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          criteria={criteria}
+          setCriteria={setCriteria}
+          genCriteriaFromContext={() =>
+            genCriteriaFromContext(batchedResponses)
+          }
+          genAIModelNames={genAIModelNames}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 3 && (
+        <GradingResponsesStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          genAIModelNames={genAIModelNames}
+          numCallsMade={numCallsMade}
+          executor={executor}
+          logs={logs}
+          responses={samplesForExecutor} // This is deliberately not the entire list of responses, for now.
+          criteria={criteria}
+          setCriteria={setCriteria}
+          grades={perCriteriaGrades}
+          setPerCriteriaGrade={setPerCriteriaGrade}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 4 && (
+        <ReportCardStep
+          onPrevious={handlePrevious}
+          onFinish={handleComplete}
+          criteria={criteria}
+          setOnNextCallback={setOnNextCallback}
+          report={evalGenReport}
+        />
+      )}
+
+      {/* Sticky footer - button and steppers */}
+      <div
+        style={{
+          position: "fixed",
+          bottom: 106,
+          padding: "10px",
+          width: "95%",
+          pointerEvents: "none",
+        }}
+      >
+        <Flex justify="space-between">
+          <Button
+            variant="default"
+            onClick={handlePrevious}
+            disabled={active === 0}
+            style={{ pointerEvents: "all" }}
+          >
+            &lt; Back
+          </Button>
+
+          <Button
+            color={active === 3 ? "green" : "blue"}
+            onClick={active !== 3 ? handleNext : handleDonePerCriteriaGrading}
+            disabled={
+              active === 4 ||
+              (active === 3 && numResponsesGraded < minNumToGrade)
+            }
+            style={{ pointerEvents: "all" }}
+          >
+            {active === 3
+              ? numResponsesGraded >= minNumToGrade
+                ? "I think I'm done"
+                : `Grade at least ${minNumToGrade - numResponsesGraded} more`
+              : "Next >"}
+          </Button>
+        </Flex>
+      </div>
+      <div
+        style={{
+          position: "fixed",
+          bottom: 0,
+          background: "white",
+          padding: "10px",
+          borderTop: "1px solid #ddd",
+          width: "95%",
+        }}
+      >
+        <Stepper active={active} mb="xl">
+          <Stepper.Step label="Welcome" description="Get started">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Feedback" description="Rate some responses">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Criteria" description="Define eval criteria">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step
+            label="Grading and Generation"
+            description="Grade by criteria, while we generate implementations"
+          >
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Results" description="View alignment">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+        </Stepper>
+      </div>
+    </Modal>
+  );
+};
+
+export default EvalGenWizard;
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
new file mode 100644
index 000000000..719e50c86
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -0,0 +1,169 @@
+import React, { useCallback, useEffect, useMemo, useState } from "react";
+import { Dict, LLMResponse, RatingDict } from "../backend/typing";
+import {
+  Button,
+  Center,
+  Flex,
+  Stack,
+  Text,
+  Textarea,
+  Title,
+  Tooltip,
+} from "@mantine/core";
+import GradingView from "./GradingView";
+import { IconThumbDown, IconThumbUp } from "@tabler/icons-react";
+import { getRatingKeyForResponse } from "../ResponseRatingToolbar";
+import useStore from "../store";
+import { deepcopy } from "../backend/utils";
+import StorageCache from "../backend/cache";
+
+interface FeedbackStepProps {
+  onNext: () => void; // Accepted, but not referenced in FeedbackStep's body as written
+  onPrevious: () => void; // Accepted, but not referenced in FeedbackStep's body as written
+  responses: LLMResponse[]; // Responses the user steps through, one shown at a time
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>; // Accepted, but not referenced in the body as written
+}
+
+/** Wizard step: a thumbs up/down grade plus a free-text note for one response at a time. */
+const FeedbackStep: React.FC<FeedbackStepProps> = ({
+  onNext,
+  onPrevious,
+  responses,
+  setOnNextCallback,
+}) => {
+  const [shownResponse, setShownResponse] = useState<LLMResponse>();
+  const [shownResponseIdx, setShownResponseIdx] = useState(0);
+
+  // Global state
+  const storeState = useStore<Dict<RatingDict>>((store) => store.state);
+  const setStoreState = useStore((store) => store.setState);
+
+  // The cache keys storing the ratings for this response object
+  const grade = useMemo(() => {
+    if (!shownResponse) return null;
+    const key = getRatingKeyForResponse(shownResponse?.uid, "grade");
+    const g = storeState[key];
+    if (g) return g[0];
+    else return null;
+  }, [shownResponse, storeState]);
+  const annotation = useMemo(() => {
+    if (!shownResponse) return "";
+    const key = getRatingKeyForResponse(shownResponse?.uid, "note");
+    const a = storeState[key];
+    if (a) return a[0]?.toString();
+    else return "";
+  }, [shownResponse, storeState]);
+
+  // Set the rating in the global store, which *should* update the above.
+  const setRating = useCallback(
+    (
+      uid: string | undefined,
+      label: string,
+      payload: boolean | string | null,
+    ) => {
+      if (!uid) return;
+      const key = getRatingKeyForResponse(uid, label);
+      setStoreState(key, { 0: payload }); // TODO: This will erase any feedback given on n>1 responses in the input.
+      StorageCache.store(key, { 0: payload });
+    },
+    [setStoreState],
+  );
+  const setGrade = (val: boolean | null) =>
+    setRating(shownResponse?.uid, "grade", val);
+  const setAnnotation = (val: string) =>
+    setRating(shownResponse?.uid, "note", val);
+
+  useEffect(() => {
+    if (!responses || responses.length === 0) return;
+    setShownResponse(responses[0]); // We only show the first response if n>1 resps per prompt, for simplicity's sake
+    setShownResponseIdx(0);
+  }, [responses]);
+
+  const nextResponse = useCallback(() => {
+    if (responses.length === 0) return;
+    if (shownResponseIdx < responses.length - 1) {
+      setShownResponseIdx(shownResponseIdx + 1);
+      setShownResponse(responses[shownResponseIdx + 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  const prevResponse = useCallback(() => {
+    if (shownResponseIdx > 0) {
+      setShownResponseIdx(shownResponseIdx - 1);
+      setShownResponse(responses[shownResponseIdx - 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  return (
+    <Stack spacing="sm" mb={200}>
+      <Title order={3}>Provide Feedback on Some Model Outputs</Title>
+
+      <GradingView
+        shownResponse={shownResponse}
+        shownResponseIdx={shownResponseIdx}
+        // shownResponseIdx={shownResponseUniqueIdx}
+        responseCount={responses.length}
+        gotoNextResponse={nextResponse}
+        gotoPrevResponse={prevResponse}
+      />
+
+      <Flex justify="center" gap="50px">
+        <Tooltip label="This response is bad!" withinPortal withArrow>
+          <Button
+            color={grade === true ? "gray" : "red"}
+            variant={grade !== false ? "outline" : "filled"}
+            onClick={() => {
+              setGrade(grade !== false ? false : null);
+            }}
+          >
+            <IconThumbDown />
+            &nbsp;Bad!
+          </Button>
+        </Tooltip>
+        <Tooltip label="This response is good!" withinPortal withArrow>
+          <Button
+            color={grade === false ? "gray" : "green"}
+            variant={grade !== true ? "outline" : "filled"}
+            onClick={() => {
+              setGrade(grade !== true ? true : null);
+            }}
+          >
+            <IconThumbUp />
+            &nbsp;Good!
+          </Button>
+        </Tooltip>
+      </Flex>
+      <Center mb={100}>
+        <Stack spacing="xs" w="80%">
+          <Text>What&apos;s the reason for your grade? Explain why:</Text>
+          <Flex align="center" justify="space-around" gap="lg">
+            <Textarea
+              value={annotation}
+              onChange={(e) => setAnnotation(e.currentTarget.value)}
+              disabled={grade === null}
+              autoFocus
+              w="100%"
+              onKeyDown={(e) => {
+                if (e.key !== "Enter") return;
+                e.preventDefault();
+                // Same gate as "Submit and Next", so Enter cannot skip the explanation.
+                if (grade !== null && !(grade === false && !annotation))
+                  nextResponse();
+              }}
+            />
+            <Button
+              onClick={nextResponse}
+              color="dark"
+              disabled={grade === null || (grade === false && !annotation)}
+              h={54}
+            >
+              Submit and Next
+            </Button>
+          </Flex>
+        </Stack>
+      </Center>
+    </Stack>
+  );
+};
+
+export default FeedbackStep;
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
new file mode 100644
index 000000000..076b03755
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -0,0 +1,684 @@
+import React, { useCallback, useEffect, useState } from "react";
+import { EvalCriteria } from "../backend/evalgen/typing";
+import { Dict, LLMResponse } from "../backend/typing";
+import {
+  ActionIcon,
+  Button,
+  Center,
+  Flex,
+  Grid,
+  Group,
+  rem,
+  ScrollArea,
+  Skeleton,
+  Stack,
+  Text,
+  Textarea,
+  TextInput,
+  Title,
+  Tooltip,
+} from "@mantine/core";
+import GradingView from "./GradingView";
+import { useDisclosure } from "@mantine/hooks";
+import { v4 as uuid } from "uuid";
+import {
+  IconRobot,
+  IconTerminal2,
+  IconThumbDown,
+  IconThumbUp,
+  IconTrash,
+} from "@tabler/icons-react";
+import {
+  generateLLMEvaluationCriteria,
+  getPromptForGenEvalCriteriaFromDesc,
+} from "../backend/evalgen/utils";
+import useStore from "../store";
+import EvaluationFunctionExecutor from "../backend/evalgen/executor";
+
+/**
+ * Paired thumbs-up / thumbs-down toggle buttons, each with an optional count badge.
+ * - grade: the current grade (true = up, false = down, undefined = neutral).
+ * - onChangeGrade: called with the new grade when a thumb is clicked.
+ * - getGradeCount: returns the count shown in the badge beside each thumb (hidden when 0).
+ */
+const ThumbUpDownButtons = ({
+  grade,
+  onChangeGrade,
+  getGradeCount,
+}: {
+  grade: boolean | undefined;
+  onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (grade: boolean | undefined) => number;
+}) => {
+  const isUp = grade === true;
+  const isDown = grade === false;
+  const upvotes = getGradeCount(true);
+  const downvotes = getGradeCount(false);
+  // Clicking the active thumb clears the grade (undefined = neutral); otherwise it sets it.
+  const toggleTo = (target: boolean) => {
+    if (onChangeGrade) onChangeGrade(grade === target ? undefined : target);
+  };
+
+  return (
+    <>
+      <Button
+        color={isUp ? "green" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => toggleTo(true)}
+      >
+        <div className="gradeContainer">
+          <IconThumbUp size="20pt" fill={isUp ? "#aea" : "white"} />
+          {upvotes > 0 && <div className="gradeUpCount">{upvotes}</div>}
+        </div>
+      </Button>
+      <Button
+        color={isDown ? "red" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => toggleTo(false)}
+      >
+        <div className="gradeContainer">
+          <IconThumbDown size="20pt" fill={isDown ? "pink" : "white"} />
+          {downvotes > 0 && <div className="gradeDownCount">{downvotes}</div>}
+        </div>
+      </Button>
+    </>
+  );
+};
+
+interface CriteriaCardProps {
+  criterion: EvalCriteria; // The criterion shown and edited by this card
+  onChange: (changedCriteria: EvalCriteria) => void; // Called on title blur, eval-method toggle, and description edits
+  onDelete: () => void; // Called when the trash icon is clicked
+  initiallyOpen?: boolean; // Initial value for the card's useDisclosure state (defaults to false)
+  grade: boolean | undefined; // Forwarded to ThumbUpDownButtons
+  onChangeGrade: (newGrade: boolean | undefined) => void; // Forwarded to ThumbUpDownButtons
+  getGradeCount: (grade: boolean | undefined) => number; // Forwarded to ThumbUpDownButtons
+  getStateValue: (stateId: number) => number; // Only referenced by a commented-out element in CriteriaCard
+}
+
+/**
+ * One editable criterion row: grade thumbs, title, eval-method toggle, delete, description.
+ * Edits are reported through `onChange` with a fresh copy; the `criterion` prop is never mutated.
+ */
+const CriteriaCard: React.FC<CriteriaCardProps> = ({
+  criterion,
+  onChange,
+  onDelete,
+  initiallyOpen,
+  grade,
+  getGradeCount,
+  onChangeGrade,
+  getStateValue,
+}) => {
+  const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
+  const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
+
+  return (
+    <Stack spacing={0} ml={8}>
+      <Flex align="center">
+        <Group spacing="0px">
+          {/* Thumbs up/down buttons */}
+          <ThumbUpDownButtons
+            grade={grade}
+            onChangeGrade={onChangeGrade}
+            getGradeCount={getGradeCount}
+          />
+
+          {/* Title of the criteria; committed to the parent on blur */}
+          <TextInput
+            value={title}
+            onChange={(e) => setTitle(e.target.value)}
+            onBlur={(e) =>
+              onChange({ ...criterion, shortname: e.target.value })
+            }
+            placeholder="Criteria name"
+            variant="unstyled"
+            size="md"
+            ml="xs"
+            className="nodrag nowheel"
+            styles={{
+              input: {
+                padding: "0px",
+                height: "14pt",
+                minHeight: "0pt",
+                fontWeight: 500,
+              },
+            }}
+          />
+        </Group>
+
+        <Group spacing="4px" ml="auto">
+          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+          <Tooltip
+            label={
+              criterion.eval_method === "code"
+                ? "Change to an LLM evaluator"
+                : "Change to a code evaluator"
+            }
+            withinPortal
+            withArrow
+          >
+            <Text
+              color="#999"
+              size="sm"
+              mr="6px"
+              onClick={() => {
+                const eval_method =
+                  criterion.eval_method === "code" ? "expert" : "code";
+                onChange({ ...criterion, eval_method });
+              }}
+            >
+              {criterion.eval_method === "code" ? (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconTerminal2 size="14pt" />
+                  &nbsp;Python
+                </Flex>
+              ) : (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconRobot size="14pt" />
+                  &nbsp;LLM
+                </Flex>
+              )}
+            </Text>
+          </Tooltip>
+
+          {/* <Contributor getStateValue={getStateValue} /> */}
+
+          {/* Delete button (and any other criterion-specific changes in the future) */}
+          <ActionIcon variant="subtle" color="red" onClick={onDelete}>
+            <IconTrash style={{ width: rem(16), height: rem(16) }} />
+          </ActionIcon>
+        </Group>
+      </Flex>
+
+      <Textarea
+        value={criterion.criteria}
+        placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+        size="xs"
+        ml={38}
+        onChange={(e) => onChange({ ...criterion, criteria: e.target.value })}
+        onClickCapture={(e) => e.stopPropagation()}
+        styles={{
+          input: {
+            border: "none",
+            borderWidth: "0px",
+            margin: "0px",
+            color: "#444",
+            background: "transparent",
+            lineHeight: 1.1,
+            paddingTop: "4px !important",
+            paddingBottom: "4px !important",
+          },
+        }}
+        autosize
+        minRows={2}
+        maxRows={5}
+        fz="sm"
+        mb="xs"
+        c="dimmed"
+      />
+    </Stack>
+  );
+};
+
+interface GradingResponsesStepProps {
+  onNext: () => void;
+  onPrevious: () => void;
+  executor: EvaluationFunctionExecutor | null; // may be null (the wizard's executor state starts as null)
+  logs: { date: Date; message: string }[]; // timestamped log messages
+  genAIModelNames: { large: string; small: string }; // `large` is the model passed to generateLLMEvaluationCriteria
+  numCallsMade: { strong: number; weak: number };
+  responses: LLMResponse[]; // the responses shown for grading, one at a time
+  criteria: EvalCriteria[];
+  setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
+  grades: Dict<Dict<boolean | undefined>>; // per-criteria grades, indexed by response uid, then criteria uid
+  setPerCriteriaGrade: (
+    responseUID: string,
+    criteriaUID: string,
+    newGrade: boolean | undefined, // undefined = neutral (no grade)
+  ) => void;
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+}
+
+const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
+  onNext,
+  onPrevious,
+  executor,
+  logs,
+  genAIModelNames,
+  numCallsMade,
+  responses,
+  criteria,
+  setCriteria,
+  grades,
+  setPerCriteriaGrade,
+  setOnNextCallback,
+}) => {
+  const apiKeys = useStore((state) => state.apiKeys);
+  const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+    undefined,
+  );
+  const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+    [],
+  );
+  const [shownResponseIdx, setShownResponseIdx] = useState(0);
+
+  const [newCriteriaDesc, setNewCriteriaDesc] = useState("");
+
+  const getStateValue = (stateId: number) => {
+    return Math.floor(Math.random() * 30 + 6);
+  };
+  const getGradeCount = (criteriaUID: string, grade: boolean | undefined) => {
+    let count = 0;
+    for (const respUid in grades) {
+      count += grade === grades[respUid][criteriaUID] ? 1 : 0;
+    }
+    return count;
+  };
+
+  useEffect(() => {
+    if (!responses || responses.length === 0) return;
+    setShownResponse(responses[0]);
+    setShownResponseIdx(0);
+  }, [responses]);
+
+  const nextResponse = useCallback(() => {
+    if (responses.length === 0) return;
+    if (shownResponseIdx < responses.length - 1) {
+      setShownResponseIdx(shownResponseIdx + 1);
+      setShownResponse(responses[shownResponseIdx + 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  const prevResponse = useCallback(() => {
+    if (shownResponseIdx > 0) {
+      setShownResponseIdx(shownResponseIdx - 1);
+      setShownResponse(responses[shownResponseIdx - 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  // Add a criterion
+  const handleAddCriteria = (newCrit: EvalCriteria) => {
+    setCriteria((cs) => {
+      if (!newCrit.uid) newCrit.uid = uuid();
+      return [...cs, newCrit];
+    });
+  };
+
+  // Modify an existing criterion
+  const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
+    setCriteria((cs) => {
+      const idx = cs.findIndex((c) => c.uid === uid);
+      if (idx === -1) {
+        console.error("Could not find criteria with uid", uid);
+        return cs;
+      }
+      cs[idx] = newCrit;
+      return [...cs];
+    });
+  };
+
+  // Delete a criterion
+  const handleDeleteCriteria = (uid: string) => {
+    setCriteria((cs) => {
+      return cs.filter((c) => c.uid !== uid);
+    });
+  };
+
+  // Number of in-flight LLM criteria requests; one loading Skeleton is rendered per request.
+  const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+
+  /**
+   * Synthesize a new criterion according to the feedback given for the shown response.
+   * Sends the response text, the existing criteria, the holistic grade and the written
+   * feedback to the LLM; if it returns at least one criterion, the first one is appended
+   * to the criteria list under a fresh uid. Failures are logged to the console.
+   *
+   * @param response Text of the response the feedback refers to.
+   * @param feedback The user's written feedback about that response.
+   * @param grade The user's holistic grade for the response.
+   */
+  const synthNewCriteriaWithLLM = (
+    response: string,
+    feedback: string,
+    grade: "good" | "bad" | "unknown",
+  ) => {
+    // Add a loading Skeleton
+    setIsLoadingCriteria((num) => num + 1);
+    // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
+    const prettyCriteria = criteria
+      .map((crit) => {
+        return `${crit.shortname}: ${crit.criteria}`;
+      })
+      .join("\n");
+
+    generateLLMEvaluationCriteria(
+      "",
+      genAIModelNames.large,
+      apiKeys,
+      `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below. 
+  
+  TEXT OUTPUT: 
+  \`\`\`
+  ${response}
+  \`\`\`
+  
+  EXISTING CRITERIA:
+  \`\`\`
+  ${prettyCriteria}
+  \`\`\`
+  
+  GRADE (whether text was good or bad):
+  \`\`\`
+  ${grade}
+  \`\`\`
+  
+  FEEDBACK: 
+  \`\`\`
+  ${feedback}
+  \`\`\`
+  
+  If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
+      // The model is already chosen by the second argument; this slot is the system
+      // message (as in the other call sites), so a model name must not be passed here.
+      null, // system_msg
+    )
+      .then((evalCrits) => {
+        // Take only the first criterion, if the model returned a nonempty list
+        const firstCrit = evalCrits[0];
+        if (!firstCrit) return;
+        setCriteria((crit) => crit.concat([{ ...firstCrit, uid: uuid() }]));
+      })
+      .catch((err) => {
+        console.error(err);
+      })
+      // Remove the loading Skeleton whether the call succeeded or failed
+      .finally(() => setIsLoadingCriteria((num) => num - 1));
+  };
+
+  /**
+   * Expand a short user-written description into a full criterion via the LLM
+   * and append it to the criteria list under a fresh uid.
+   * A loading Skeleton is shown while the request is in flight; failures are logged.
+   *
+   * @param desc The user's description of the criterion to add.
+   */
+  const addCriteria = (desc: string) => {
+    // Add a loading Skeleton
+    setIsLoadingCriteria((num) => num + 1);
+    // Make async LLM call to expand criteria
+    generateLLMEvaluationCriteria(
+      "",
+      genAIModelNames.large,
+      apiKeys,
+      getPromptForGenEvalCriteriaFromDesc(desc), // prompt
+      null, // system_msg
+    )
+      .then((evalCrits) => {
+        // Take only the first suggested by the model, if any.
+        // Without this guard an empty result would add a criterion that has nothing but a uid.
+        const firstCrit = evalCrits[0];
+        if (!firstCrit) {
+          console.warn("No criteria were generated for description:", desc);
+          return;
+        }
+        setCriteria((crit) => crit.concat([{ ...firstCrit, uid: uuid() }]));
+      })
+      .catch((err) => {
+        console.error(err);
+      })
+      // Remove the loading Skeleton whether the call succeeded or failed
+      .finally(() => setIsLoadingCriteria((num) => num - 1));
+  };
+
+  return (
+    <Grid h="100%">
+      <Grid.Col span={8}>
+        <Stack justify="space-between">
+          {/* View showing the response the user is currently grading */}
+          <GradingView
+            shownResponse={shownResponse}
+            shownResponseIdx={shownResponseIdx}
+            responseCount={responses.length}
+            gotoNextResponse={nextResponse}
+            gotoPrevResponse={prevResponse}
+          />
+
+          <Flex direction="column">
+            <Flex justify="space-between" align="center">
+              <Text size="lg" weight={500} mb="sm">
+                LLM Activity
+              </Text>
+              {/* GPT Call Tally */}
+              <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+                Executed {numCallsMade.strong} {genAIModelNames.large} calls and{" "}
+                {numCallsMade.weak} {genAIModelNames.small} calls.
+              </Text>
+            </Flex>
+            <div
+              style={{
+                backgroundColor: "#f0f0f0",
+                color: "#333",
+                fontFamily: "monospace",
+                fontSize: "8pt",
+                padding: "12px",
+                lineHeight: "1.2",
+                width: "calc(100% - 10px)",
+                height: "200px",
+                overflowY: "auto",
+                borderRadius: "8px",
+                border: "1px solid #ddd",
+                marginRight: "10px", // Space on the right
+              }}
+              ref={(el) => {
+                if (el) {
+                  el.scrollTop = el.scrollHeight;
+                }
+              }}
+            >
+              {logs.map((log, index) => (
+                <div key={index}>
+                  <span style={{ color: "#4A90E2" }}>
+                    {log.date.toLocaleString()} -{" "}
+                  </span>
+                  <span>{log.message}</span>
+                </div>
+              ))}
+            </div>
+          </Flex>
+
+          {/* Progress bar */}
+          {/* <Flex justify="left" align="center" gap="md">
+                    <Stack w="100%" spacing={4}>
+                      <Text color="#aaa" size="sm">
+                        {bottomBar.progressLabel}
+                      </Text>
+                      <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
+                    </Stack>
+                  </Flex> */}
+        </Stack>
+      </Grid.Col>
+      <Grid.Col
+        span={4}
+        bg="#eee"
+        pt="16px"
+        h="100%"
+        style={{ boxShadow: "-10px 0px 20px #aaa" }}
+      >
+        <Center>
+          <Title order={3} ml={8} mt="sm" mb="md">
+            Per-criteria grading
+          </Title>
+        </Center>
+
+        <ScrollArea
+          h="75%"
+          offsetScrollbars
+          style={{ border: "1px solid #ccc" }}
+        >
+          <div
+            style={{
+              display: "flex",
+              flexDirection: "column",
+              marginBottom: "40px",
+            }}
+          >
+            <div style={{ flex: 2, overflowY: "auto" }}>
+              {criteria.map((e) => (
+                <CriteriaCard
+                  criterion={e}
+                  key={e.uid}
+                  onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
+                  onDelete={() => handleDeleteCriteria(e.uid)}
+                  grade={
+                    shownResponse && grades[shownResponse.uid]
+                      ? grades[shownResponse.uid][e.uid]
+                      : undefined
+                  }
+                  getGradeCount={(grade) => {
+                    return shownResponse
+                      ? getGradeCount(
+                          // shownResponse.uid,
+                          e.uid,
+                          grade,
+                        )
+                      : 0;
+                  }}
+                  onChangeGrade={(newGrade) => {
+                    if (shownResponse)
+                      setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
+                  }}
+                  initiallyOpen={true}
+                  getStateValue={(stateId) => getStateValue(stateId)}
+                />
+              ))}
+              {isLoadingCriteria > 0 ? (
+                Array.from(
+                  { length: isLoadingCriteria },
+                  (v: unknown, idx: number) => (
+                    <Skeleton key={idx} h={80} mb={4} />
+                  ),
+                )
+              ) : (
+                <></>
+              )}
+            </div>
+
+            <div className="criteriaButtons">
+              {/* <Popover withArrow>
+              <Popover.Target>
+              <Button
+                leftIcon={<IconPencil size={14} />}
+                variant="subtle"
+                color="gray"
+                // gradient={{ from: "blue", to: "green", deg: 90 }}
+                // onClick={() => {
+                //   handleAddCriteria({
+                //     shortname: "New Criteria",
+                //     criteria: "",
+                //     eval_method: "code",
+                //     priority: 0,
+                //     uid: uuid(),
+                //   });
+                // }}
+              >
+                Add a new criteria
+              </Button>
+              </Popover.Target>
+              <Popover.Dropdown>
+                <Flex justify="space-around" align="center" gap="md">
+                  <Textarea label="Describe the critera:">Hello</Textarea>
+                  <Button>Submit</Button>
+                </Flex>
+                
+              </Popover.Dropdown>
+            </Popover> */}
+
+              {/* <Button
+                leftIcon={<IconSparkles size={14} />}
+                variant="subtle"
+                color="gray"
+                // gradient={{ from: "blue", to: "green", deg: 90 }}
+                onClick={() => {
+                  generateCriteria(responses);
+                }}
+              >
+                Suggest Criteria
+              </Button> */}
+            </div>
+
+            {/* <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+            <Divider mt="lg" />
+            <Title mb="0px" order={4}>
+              Suggest New Criteria
+            </Title>
+            <Textarea
+              value={annotation}
+              onChange={(e) => setAnnotation(e.target.value)}
+              description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
+              mb="sm"
+            /> */}
+            {/* <Radio.Group
+              name="favoriteFramework"
+              label="Rate the response holistically:"
+              value={holisticGrade}
+              onChange={(v) => setHolisticGrade(v as "good" | "bad")}
+              withAsterisk
+              mb="md"
+            >
+              <Group mt="xs">
+                <Radio value="good" label="Good" />
+                <Radio value="bad" label="Bad" />
+                <span>
+                  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                </span>
+                <Button
+                  color="green"
+                  variant="filled"
+                  disabled={
+                    !holisticGrade ||
+                    annotation === undefined ||
+                    annotation.length === 0
+                  }
+                  onClick={() => {
+                    synthNewCriteriaWithLLM(
+                      shownResponse?.responses[0].toString() ?? "",
+                      annotation ?? "",
+                      holisticGrade ?? "unknown",
+                    );
+
+                    nextResponse();
+                  }}
+                >
+                  + Submit Feedback
+                </Button>
+              </Group>
+            </Radio.Group> */}
+            {/* </Stack> */}
+          </div>
+
+          <Textarea
+            value={newCriteriaDesc}
+            onChange={(e) => setNewCriteriaDesc(e.currentTarget.value)}
+            label="Add new criteria:"
+            placeholder="Describe the criteria to add."
+            ml="md"
+            mr="md"
+          ></Textarea>
+          <Group position="right" mr="md" mt="sm">
+            <Button
+              color="green"
+              variant="filled"
+              disabled={
+                newCriteriaDesc?.trim().length === 0 || isLoadingCriteria > 0
+              }
+              onClick={() => {
+                addCriteria(newCriteriaDesc);
+                setNewCriteriaDesc("");
+              }}
+            >
+              + Add criteria
+            </Button>
+          </Group>
+        </ScrollArea>
+      </Grid.Col>
+    </Grid>
+  );
+};
+
+export default GradingResponsesStep;
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
new file mode 100644
index 000000000..9f7aca3b0
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -0,0 +1,183 @@
+import React, { ReactNode, useMemo } from "react";
+import { LLMResponse } from "../backend/typing";
+import {
+  cleanMetavarsFilterFunc,
+  llmResponseDataToString,
+  transformDict,
+} from "../backend/utils";
+import { Box, Button, Center, Flex, Stack, Text, Tooltip } from "@mantine/core";
+import {
+  IconChevronLeft,
+  IconChevronRight,
+  IconSparkles,
+} from "@tabler/icons-react";
+import { StringLookup } from "../backend/cache";
+import { cleanEscapedBraces } from "../backend/template";
+
+// Extra-large, medium-weight text used for the header line of the grading view.
+const HeaderText = ({ children }: { children: ReactNode }) => (
+  <Text size="xl" fw={500} pl="sm" mb="lg">
+    {children}
+  </Text>
+);
+
+/** Props for {@link GradingView}, the panel showing the response currently being graded. */
+export interface GradingViewProps {
+  /** The response to display; when undefined, the response text, vars and prompt areas render empty. */
+  shownResponse: LLMResponse | undefined;
+  /** Zero-based index of the shown response; the header displays it as `shownResponseIdx + 1`. */
+  shownResponseIdx: number;
+  /** Total number of responses, displayed in the header ("... of N"). */
+  responseCount: number;
+  /** Called when the left chevron ("To previous response") button is clicked. */
+  gotoPrevResponse: () => void;
+  /** Called when the right chevron ("To next response") button is clicked. */
+  gotoNextResponse: () => void;
+}
+
+/**
+ * Shows a single response for grading: a header with its position in the list,
+ * the response text flanked by previous/next buttons, and below it the vars
+ * and the prompt belonging to that response.
+ */
+const GradingView: React.FC<GradingViewProps> = ({
+  shownResponse,
+  shownResponseIdx,
+  responseCount,
+  gotoPrevResponse,
+  gotoNextResponse,
+}) => {
+  // All derived values below are recomputed only when shownResponse changes.
+
+  // Text of the first response entry with escaped braces cleaned; "" when there is none
+  const responseText = useMemo(() => {
+    if (!shownResponse || !(shownResponse.responses?.length > 0)) return "";
+    const firstResponse = shownResponse.responses[0];
+    return cleanEscapedBraces(llmResponseDataToString(firstResponse));
+  }, [shownResponse]);
+
+  // The prompt as resolved through StringLookup; "" when the lookup yields nothing
+  const prompt = useMemo(() => {
+    const lookedUp = StringLookup.get(shownResponse?.prompt);
+    return lookedUp ?? "";
+  }, [shownResponse]);
+
+  // One row per var/metavar. Metavars are passed through cleanMetavarsFilterFunc
+  // and, being spread last, win over vars that share the same name.
+  const varsDivs = useMemo(() => {
+    const varsAndMetavars = shownResponse
+      ? {
+          ...StringLookup.concretizeDict(shownResponse.vars),
+          ...transformDict(
+            StringLookup.concretizeDict(shownResponse.metavars),
+            cleanMetavarsFilterFunc,
+          ),
+        }
+      : {};
+
+    return Object.entries(varsAndMetavars).map(([name, value]) => (
+      <div key={name} className="grade-resp-var-container">
+        <span className="response-var-name">{name}&nbsp;=&nbsp;</span>
+        <span className="response-var-value linebreaks">{value}</span>
+      </div>
+    ));
+  }, [shownResponse]);
+
+  return (
+    <Stack justify="space-between">
+      <Box>
+        {/* Header: which response (1-based) out of how many */}
+        <Flex justify="center">
+          <HeaderText>
+            {/* What do you think of this response? */}
+            What do you think of response #{shownResponseIdx + 1} of{" "}
+            {responseCount}?
+          </HeaderText>
+        </Flex>
+        {/* Response box in the middle, with < and > chevron buttons on either side for navigation */}
+        <Flex justify="center" align="center" mb="sm">
+          {/* Back to the previous response */}
+          <Tooltip label="To previous response" withArrow>
+            <Button
+              variant="filled"
+              color="dark"
+              onClick={gotoPrevResponse}
+              h={84}
+              p="10px 4px"
+              mr={4}
+            >
+              <IconChevronLeft />
+            </Button>
+          </Tooltip>
+
+          {/* Text of the response being graded */}
+          <div
+            className="response-box"
+            style={{
+              backgroundColor: "#eee",
+              width: "90%",
+              maxHeight: "340px",
+              overflowY: "scroll",
+              borderColor: "black",
+              borderStyle: "solid",
+            }}
+          >
+            <div className="response-item-llm-name-wrapper">
+              <div
+                className="small-response"
+                style={{ fontSize: "11pt", padding: "12pt" }}
+              >
+                {responseText}
+              </div>
+            </div>
+          </div>
+
+          {/* Forward to the next response */}
+          <Tooltip label="To next response" withArrow>
+            <Button
+              variant="filled"
+              color="dark"
+              onClick={gotoNextResponse}
+              h={84}
+              p="10px 4px"
+              ml={4}
+            >
+              <IconChevronRight />
+            </Button>
+          </Tooltip>
+        </Flex>
+        {/* Side by side: the vars (inputs) behind this response, and the concrete prompt */}
+        <Flex justify="center" mb="xl" gap="lg">
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "45%",
+              borderRadius: "12px",
+              borderWidth: "1px",
+              borderStyle: "solid",
+            }}
+          >
+            Vars
+            <hr />
+            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
+              {varsDivs}
+            </div>
+          </div>
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "45%",
+              borderRadius: "2px",
+            }}
+          >
+            Prompt
+            <hr />
+            <div
+              className="monofont linebreaks"
+              style={{
+                maxHeight: "160px",
+                overflowY: "scroll",
+                fontSize: "10pt",
+                lineHeight: "1.2",
+              }}
+            >
+              {prompt}
+            </div>
+          </div>
+        </Flex>
+      </Box>
+    </Stack>
+  );
+};
+
+export default GradingView;
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
new file mode 100644
index 000000000..cd4e908fe
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -0,0 +1,580 @@
+import React, { useMemo, useState } from "react";
+import { EvalCriteria, EvalFunctionReport } from "../backend/evalgen/typing";
+import {
+  Accordion,
+  Button,
+  Card,
+  Checkbox,
+  Code,
+  Divider,
+  Flex,
+  Group,
+  Popover,
+  RingProgress,
+  ScrollArea,
+  SimpleGrid,
+  Skeleton,
+  Stack,
+  Switch,
+  Text,
+  Textarea,
+  TextInput,
+  Title,
+  Tooltip,
+  useMantineTheme,
+} from "@mantine/core";
+import { useDisclosure } from "@mantine/hooks";
+import {
+  IconCode,
+  IconRepeat,
+  IconRobot,
+  IconSparkles,
+  IconTrash,
+} from "@tabler/icons-react";
+import useStore from "../store";
+import { accuracyToColor, cmatrixTextAnnotations } from "../backend/utils";
+import {
+  generateLLMEvaluationCriteria,
+  getPromptForGenEvalCriteriaFromDesc,
+} from "../backend/evalgen/utils";
+import { v4 as uuid } from "uuid";
+import Plot from "react-plotly.js";
+
+/** Props for the PickCriteriaStep component, where criteria are added, edited and removed. */
+interface PickCriteriaStepProps {
+  /** Called by the step's submit handler to move on. */
+  onNext: () => void;
+  /** NOTE(review): not read by the component in the code shown here (only referenced in commented-out JSX). */
+  onPrevious: () => void;
+  /** The current criteria; one CriteriaCard is rendered per entry. */
+  criteria: EvalCriteria[];
+  /** State setter used to add, edit and remove criteria. */
+  setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
+  /**
+   * Invoked by the "Suggest criteria" button. The resolved criteria are appended to the list;
+   * a resolved value of undefined adds nothing.
+   */
+  genCriteriaFromContext: () => Promise<EvalCriteria[] | undefined>;
+  /** NOTE(review): not destructured by the component in the code shown here — confirm whether it is still needed. */
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+  /** Model names; `large` is the one passed to generateLLMEvaluationCriteria when adding a criterion. */
+  genAIModelNames: { large: string; small: string };
+}
+
+/** Props for {@link CriteriaCard}, used both for editing a criterion and (in report mode) for showing its results. */
+export interface CriteriaCardProps {
+  /** Criterion title, shown in the card's single-line text input. */
+  title: string;
+  /** Criterion description, shown in the card's textarea. */
+  description: string;
+  /** Evaluation method; the value "code" initializes the Code/LLM toggle to "Code". */
+  evalMethod: string;
+  /** Called with the new text whenever the title input changes. */
+  onTitleChange?: (newTitle: string) => void;
+  /** Called with the new text whenever the description textarea changes. */
+  onDescriptionChange?: (newDesc: string) => void;
+  /** Called with "code" or "expert" whenever the Code/LLM switch is toggled. */
+  onEvalMethodChange?: (newEvalMethod: string) => void;
+  /** Click handler of the trash button (rendered only when not in report mode). */
+  onRemove?: () => void;
+  /**
+   * When truthy, the card hides the trash button and the Code/LLM switch and instead shows
+   * the method label with a code popover, the alignment ring (if a report is given) and the
+   * "Show Bad Implementations" accordion.
+   */
+  reportMode?: boolean;
+  /** Report for the chosen eval function; feeds the alignment ring (f1), the confusion matrix and the code popover. */
+  evalFuncReport?: EvalFunctionReport;
+  /** Called with the new checked state when the checkbox is toggled, but only if `evalFuncReport` is set. */
+  onCheck?: (newChecked: boolean) => void;
+  /** Reports whose eval function code is listed under "Show Bad Implementations". */
+  otherFuncs?: EvalFunctionReport[];
+}
+
+/**
+ * Card for a single evaluation criterion.
+ * Outside report mode it is an editor: title, description, a Code/LLM switch and a trash button.
+ * In report mode it instead shows the eval method with a popover of the chosen function's code,
+ * a ring with the function's alignment score (plus a confusion matrix on hover), and an
+ * accordion listing the code of the implementations that were not selected.
+ */
+export const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
+  title,
+  description,
+  evalMethod,
+  onTitleChange,
+  onDescriptionChange,
+  onEvalMethodChange,
+  onRemove,
+  reportMode,
+  evalFuncReport,
+  onCheck,
+  otherFuncs,
+}) {
+  const theme = useMantineTheme();
+  // Whether the card's checkbox is ticked (cards start out ticked)
+  const [checked, setChecked] = useState(true);
+  // Position of the Code/LLM switch, seeded once from the evalMethod prop
+  const [codeChecked, setCodeChecked] = useState(evalMethod === "code");
+
+  // Hover-controlled popovers, only used in report mode
+  const [openedCMatrix, { close: closeCMatrix, open: openCMatrix }] =
+    useDisclosure(false);
+  const [viewedCode, { close: closeViewedCode, open: openViewedCode }] =
+    useDisclosure(false);
+
+  // Confusion-matrix heatmap comparing predicted outcomes against the human grades
+  const cMatrixPlot = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    const predLabels = ["Pred.<br>fail", "Pred.<br>pass"];
+    const humanLabels = ["Human<br>pass", "Human<br>fail"];
+    // Rows follow humanLabels, columns follow predLabels
+    const cellCounts = [
+      [evalFuncReport.false_fail, evalFuncReport.true_pass],
+      [evalFuncReport.true_fail, evalFuncReport.false_pass],
+    ];
+    return (
+      <Plot
+        data={[
+          {
+            z: cellCounts,
+            x: predLabels,
+            y: humanLabels,
+            xgap: 2,
+            ygap: 2,
+            type: "heatmap",
+            colorscale: "Blues",
+            showscale: false,
+            showlegend: false,
+          },
+        ]}
+        layout={{
+          width: 160,
+          height: 160,
+          margin: { t: 10, b: 40, l: 50, r: 0 },
+          annotations: cmatrixTextAnnotations(predLabels, humanLabels, cellCounts),
+        }}
+      />
+    );
+  }, [evalFuncReport]);
+
+  // Percentage and color for the alignment ring, derived from the report's f1 (0 when missing)
+  const reportAccuracyRing = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    const f1 = evalFuncReport.f1 ?? 0;
+    return { percent: Math.floor(f1 * 100), color: accuracyToColor(f1) };
+  }, [evalFuncReport]);
+
+  // Update the local checkbox state, then tell the parent (only when a report is attached)
+  const setCheckedAndRealign = (newChecked: boolean) => {
+    setChecked(newChecked);
+    if (onCheck && evalFuncReport) onCheck(newChecked);
+  };
+
+  // Code listings of the implementations that were not selected; null when there are none
+  const unselectedImplementations = useMemo(() => {
+    if (otherFuncs === undefined || otherFuncs.length === 0) return null;
+    return otherFuncs.map((funcReport, i) => (
+      <div key={i}>
+        <Code style={{ whiteSpace: "pre-wrap" }}>
+          {funcReport.evalFunction.code}
+        </Code>
+        <Divider />
+      </div>
+    ));
+  }, [otherFuncs]);
+
+  return (
+    <Card
+      shadow="sm"
+      padding="sm"
+      pl="md"
+      pb="xl"
+      radius="md"
+      withBorder
+      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
+    >
+      <div
+        // onClick={() => setChecked(!checked)}
+        onKeyUp={(e) => e.preventDefault()}
+        className="checkcard"
+      >
+        {/* Checkbox deciding whether this criterion is used */}
+        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
+          <Checkbox
+            checked={checked}
+            onChange={() => setCheckedAndRealign(!checked)}
+            tabIndex={-1}
+            size="xs"
+            mr="sm"
+            mt="xs"
+            styles={{ input: { cursor: "pointer" } }}
+            aria-hidden
+          />
+        </Tooltip>
+
+        <div style={{ width: "100%" }}>
+          {/* Title, styled to look like plain text */}
+          <TextInput
+            value={title}
+            onChange={(e) => {
+              if (onTitleChange) onTitleChange(e.currentTarget.value);
+            }}
+            mb={7}
+            lh={1}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                padding: "0px",
+                background: "transparent",
+                fontWeight: 500,
+                fontSize: "12pt",
+                margin: "0px",
+                height: "auto",
+                minHeight: "auto",
+              },
+            }}
+          />
+
+          {/* Description, likewise borderless */}
+          <Textarea
+            value={description}
+            onChange={(e) => {
+              if (onDescriptionChange)
+                onDescriptionChange(e.currentTarget.value);
+            }}
+            onClickCapture={(e) => e.stopPropagation()}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                paddingTop: "0px !important",
+                paddingLeft: "0px",
+                margin: "0px",
+                color: "#444",
+                background: "transparent",
+                lineHeight: 1.1,
+              },
+            }}
+            autosize
+            minRows={2}
+            maxRows={5}
+            fz="sm"
+            mb="xs"
+            c="dimmed"
+          />
+
+          {/* Report mode: eval method label; hovering it reveals the chosen function's code */}
+          {reportMode && (
+            <Popover
+              opened={viewedCode}
+              // offset={{ crossAxis: -20 }}
+              withinPortal
+              position="bottom"
+              shadow="lg"
+              withArrow
+              width={400}
+            >
+              <Popover.Target>
+                <Text
+                  size="sm"
+                  color="gray"
+                  onMouseEnter={openViewedCode}
+                  onMouseLeave={closeViewedCode}
+                >
+                  {codeChecked ? "Python" : "LLM"}
+                </Text>
+              </Popover.Target>
+              <Popover.Dropdown>
+                <Code style={{ whiteSpace: "pre-wrap" }}>
+                  {evalFuncReport?.evalFunction.code}
+                </Code>
+              </Popover.Dropdown>
+            </Popover>
+          )}
+        </div>
+
+        {/* Edit mode: trash button in the top-right corner */}
+        {!reportMode ? (
+          <Button
+            size="xs"
+            variant="subtle"
+            compact
+            color="gray"
+            onClick={onRemove}
+            pos="absolute"
+            right="8px"
+            top="8px"
+            style={{ padding: "0px" }}
+          >
+            <IconTrash size={"95%"} />
+          </Button>
+        ) : (
+          <></>
+        )}
+
+        {/* Report mode: alignment ring; hovering it reveals the confusion matrix */}
+        {reportMode && reportAccuracyRing ? (
+          <Stack spacing={0}>
+            <Popover
+              position="right"
+              opened={openedCMatrix}
+              offset={{ crossAxis: -20 }}
+              withinPortal
+              shadow="lg"
+              withArrow
+            >
+              <Popover.Target>
+                <RingProgress
+                  size={100}
+                  sections={[
+                    {
+                      value: reportAccuracyRing.percent,
+                      color: reportAccuracyRing.color,
+                    },
+                  ]}
+                  label={
+                    <Text
+                      color={reportAccuracyRing.color}
+                      weight={700}
+                      align="center"
+                      size="lg"
+                    >
+                      {`${reportAccuracyRing.percent}%`}
+                    </Text>
+                  }
+                  onMouseEnter={openCMatrix}
+                  onMouseLeave={closeCMatrix}
+                />
+              </Popover.Target>
+              <Popover.Dropdown>{cMatrixPlot}</Popover.Dropdown>
+            </Popover>
+            <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
+              Alignment with your grades
+            </Text>
+          </Stack>
+        ) : (
+          <></>
+        )}
+
+        {/* Edit mode: Code/LLM switch in the bottom-right corner */}
+        {!reportMode ? (
+          <Switch
+            size="lg"
+            color="gray"
+            onLabel="Code"
+            offLabel="LLM"
+            pos="absolute"
+            right="8px"
+            bottom="10px"
+            checked={codeChecked}
+            onChange={(e) => {
+              const useCode = e.currentTarget.checked;
+              setCodeChecked(useCode);
+              if (onEvalMethodChange)
+                onEvalMethodChange(useCode ? "code" : "expert");
+            }}
+            thumbIcon={
+              codeChecked ? (
+                <IconCode
+                  size="0.8rem"
+                  color={theme.colors.teal[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              ) : (
+                <IconRobot
+                  size="0.8rem"
+                  color={theme.colors.blue[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              )
+            }
+          />
+        ) : (
+          <></>
+        )}
+      </div>
+
+      <div>
+        {/* Report mode: collapsible list of the implementations that were not selected */}
+        {reportMode && (
+          <Accordion>
+            <Accordion.Item
+              key={"Show Bad Implementations"}
+              value={"Show Bad Implementations"}
+            >
+              <Accordion.Control>
+                <Text size="sm"> Show Bad Implementations </Text>
+              </Accordion.Control>
+              <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
+            </Accordion.Item>
+          </Accordion>
+        )}
+      </div>
+    </Card>
+  );
+};
+
+/**
+ * Wizard step in which the user defines the evaluation criteria.
+ * Criteria can be added from a short description (expanded by an LLM), suggested
+ * via `genCriteriaFromContext`, edited in place on their cards, or removed.
+ */
+const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
+  onNext,
+  onPrevious,
+  criteria,
+  setCriteria,
+  genCriteriaFromContext,
+  genAIModelNames,
+}) => {
+  // Number of placeholder cards shown while "Suggest criteria" is running
+  const numSuggestionSkeletons = 3;
+
+  // State for criteria cards
+  const [addCriteriaValue, setAddCriteriaValue] = useState("");
+  // Number of skeleton cards to show for criteria that are still being generated
+  const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+
+  // Global state
+  const apiKeys = useStore((state) => state.apiKeys);
+
+  // An estimate of many requests the implementation executor will require (upper bound).
+  const estimatedLLMRequestsToImplement = useMemo(() => {
+    return 0; // TODO
+    // const num_llm_evals = criteria.reduce(
+    //   (acc, crit) => acc + (crit.eval_method === "expert" ? 1 : 0),
+    //   0,
+    // );
+    // // The executor sends off one query per criteria to generate 3-5 candidates each.
+    // // Each candidate LLM eval prompt will be run over all candidates.
+    // return criteria.length + num_llm_evals * 5 * samples.length;
+  }, [criteria]);
+
+  // Expand the description currently in the input into a full criterion via the LLM
+  // and append it. Does nothing for a blank description.
+  const addCriteria = () => {
+    // The button is disabled for blank input, but Enter is not: don't spend an LLM call on nothing
+    if (addCriteriaValue.trim().length === 0) return;
+    // Add a loading Skeleton
+    setIsLoadingCriteria((num) => num + 1);
+    // Make async LLM call to expand criteria
+    generateLLMEvaluationCriteria(
+      "",
+      genAIModelNames.large,
+      apiKeys,
+      getPromptForGenEvalCriteriaFromDesc(addCriteriaValue), // prompt
+      null, // system_msg
+    )
+      .then((evalCrits) => {
+        // Take only the first, if any.
+        // Without this guard an empty result would add a card that has nothing but a uid.
+        const firstCrit = evalCrits[0];
+        if (!firstCrit) {
+          console.warn("No criteria were generated from the description.");
+          return;
+        }
+        setCriteria((crit) => crit.concat([{ ...firstCrit, uid: uuid() }]));
+      })
+      .catch((err) => {
+        console.error(err);
+      })
+      // Remove the loading Skeleton whether the call succeeded or failed
+      .finally(() => setIsLoadingCriteria((num) => num - 1));
+  };
+
+  // Set one text property of the criterion at position `critIdx`.
+  // Out-of-range indices and properties absent from the criterion are ignored.
+  const updateCriteria = (
+    newValue: string,
+    critIdx: number,
+    propName: "shortname" | "criteria" | "eval_method",
+  ) => {
+    setCriteria((crit) =>
+      crit.map((c, i) => {
+        if (i !== critIdx || !(propName in c)) return c;
+        // Copy before writing so the object held in the previous state is never mutated
+        const updated = { ...c };
+        // @ts-expect-error This is hard to type because it's a wrapper over an accessor.
+        updated[propName] = newValue;
+        return updated;
+      }),
+    );
+  };
+
+  const handleSubmit = () => {
+    // setCriteriaData(criteria);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg" p="xl">
+      <Title order={3}>Define Evaluation Criteria</Title>
+
+      <div>
+        <Text size="sm" pl="sm" mb="lg">
+          Select criteria that you would like to evaluate responses on. Based on
+          your chosen criteria, LLM will generate implementations of assertions.
+          Afterwards, an optional human scoring pass can better align these
+          implementations with your expectations.
+        </Text>
+
+        <Text size="sm" pl="sm" mb="lg" style={{ fontStyle: "italic" }}>
+          Note: Due to rate limits and/or cost, think carefully before selecting
+          more than 5 criteria to be evaluated by LLMs.
+        </Text>
+
+        <Flex align="center" gap="lg">
+          <TextInput
+            label="Describe a new criterion to add, then press Enter:"
+            value={addCriteriaValue}
+            onChange={(evt) => setAddCriteriaValue(evt.currentTarget.value)}
+            placeholder="e.g., the response is valid JSON"
+            mb="lg"
+            pl="sm"
+            pr="sm"
+            w="100%"
+            onKeyDown={(evt) => {
+              if (evt.key === "Enter") {
+                evt.preventDefault();
+                addCriteria();
+                setAddCriteriaValue("");
+              }
+            }}
+          />
+          <Button
+            variant="filled"
+            disabled={addCriteriaValue?.trim().length === 0}
+            onClick={() => {
+              addCriteria();
+              setAddCriteriaValue("");
+            }}
+          >
+            Generate
+          </Button>
+          <Button
+            variant="outline"
+            onClick={() => {
+              if (isLoadingCriteria > 0) return;
+              // Adjust the counter relatively, so a concurrent addCriteria call keeps its own skeleton
+              setIsLoadingCriteria((num) => num + numSuggestionSkeletons);
+              genCriteriaFromContext()
+                .then((crit) => {
+                  // Functional update: the list may have changed while the request was in flight
+                  if (crit) setCriteria((cs) => cs.concat(crit));
+                })
+                .catch((err) => {
+                  console.error(err);
+                })
+                .finally(() =>
+                  setIsLoadingCriteria((num) => num - numSuggestionSkeletons),
+                );
+            }}
+          >
+            <IconRepeat />
+            <IconSparkles />
+            &nbsp;Suggest criteria
+          </Button>
+        </Flex>
+
+        <ScrollArea mih={300} h={500} mah={500}>
+          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+            {criteria.map((c, idx) => (
+              <CriteriaCard
+                title={c.shortname}
+                description={c.criteria}
+                evalMethod={c.eval_method}
+                key={`cc-${c.uid ?? idx.toString() + c.shortname}`}
+                onTitleChange={(title) =>
+                  updateCriteria(title, idx, "shortname")
+                }
+                onDescriptionChange={(desc) =>
+                  updateCriteria(desc, idx, "criteria")
+                }
+                onEvalMethodChange={(method) =>
+                  updateCriteria(method, idx, "eval_method")
+                }
+                onRemove={() =>
+                  setCriteria((cs) => cs.filter((v, j) => j !== idx))
+                }
+              />
+            ))}
+            {isLoadingCriteria > 0 ? (
+              Array.from({ length: isLoadingCriteria }, (x, i) => (
+                <Skeleton key={`skele-card-${i}`}>
+                  <CriteriaCard
+                    title={"Loading"}
+                    description={"Loading"}
+                    evalMethod={"expert"}
+                  />
+                </Skeleton>
+              ))
+            ) : (
+              <></>
+            )}
+          </SimpleGrid>
+        </ScrollArea>
+      </div>
+
+      {/* <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Tooltip
+          label={`Will send off up to ${estimatedLLMRequestsToImplement} requests`}
+          withArrow
+        >
+          <Button
+            variant="gradient"
+            gradient={{ from: "teal", to: "lime", deg: 105 }}
+            disabled={!criteria || criteria.length === 0}
+            onClick={handleSubmit}
+          >
+            Ready to Grade!
+          </Button>
+        </Tooltip>
+      </Group> */}
+    </Stack>
+  );
+};
+
+export default PickCriteriaStep;
diff --git a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
new file mode 100644
index 000000000..8e421bcf0
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
@@ -0,0 +1,120 @@
+import React, { useMemo } from "react";
+import {
+  Button,
+  Card,
+  Flex,
+  Group,
+  ScrollArea,
+  SimpleGrid,
+  Stack,
+  Text,
+} from "@mantine/core";
+import { EvalCriteria, EvalFunctionSetReport } from "../backend/evalgen/typing";
+import { CriteriaCard } from "./PickCriteriaStep";
+
+interface ReportCardStepProps {
+  criteria: EvalCriteria[];
+  report: EvalFunctionSetReport | null; // null is tolerated: no cards are built and Finish does nothing
+  onFinish: (reports: EvalFunctionSetReport) => void;
+  onPrevious: () => void;
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+}
+/** Report step: one card per selected eval function, plus the report's coverage and false-failure numbers. */
+const ReportCardStep: React.FC<ReportCardStepProps> = ({
+  report,
+  onFinish,
+  onPrevious,
+}) => {
+  const cards = useMemo(() => {
+    if (!report) return null;
+    const reportCards = [];
+
+    // Iterate through selected eval functions and create cards
+    for (const selectedFunc of report.selectedEvalFunctions) {
+      const c = selectedFunc.evalCriteria;
+      // Find corresponding report in allEvalFunctionReports map from criteria to list
+      const evalFuncReports = report.allEvalFunctionReports.get(c);
+      const evalFuncReport = evalFuncReports?.find(
+        (rep) => rep.evalFunction === selectedFunc,
+      );
+      // Get the functions that were not selected for this criteria
+      const otherFuncs = evalFuncReports?.filter(
+        (rep) => rep.evalFunction !== selectedFunc,
+      );
+
+      reportCards.push(
+        <CriteriaCard
+          reportMode
+          title={c.shortname}
+          description={c.criteria}
+          evalMethod={c.eval_method}
+          key={c.uid}
+          evalFuncReport={evalFuncReport}
+          otherFuncs={otherFuncs}
+        />,
+      );
+    }
+    return reportCards;
+  }, [report]);
+
+  // Without a report, `cards` is null and the two stat cards below show "N/A".
+
+  return (
+    <Stack spacing="lg">
+      <Text align="center" size="lg" pl="sm" mb="lg">
+        Chosen Functions and Alignment
+      </Text>
+
+      {/* Show coverage and false failure rate numbers */}
+      <Flex justify="center" gap="md" mb="lg">
+        <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
+          <Card
+            shadow="sm"
+            padding="md"
+            radius="md"
+            style={{ backgroundColor: "#f0f0f0" }}
+          >
+            <Text weight={500} size="md">
+              Coverage of Bad Responses
+            </Text>
+            <Text color="blue" weight={700} size="md">
+              {report ? `${report.failureCoverage.toFixed(2)}%` : "N/A"}
+            </Text>
+          </Card>
+          <Card
+            shadow="sm"
+            padding="md"
+            radius="md"
+            style={{ backgroundColor: "#f0f0f0" }}
+          >
+            <Text weight={500} size="md">
+              False Failure Rate
+            </Text>
+            <Text color="red" weight={700} size="md">
+              {report ? `${report.falseFailureRate.toFixed(2)}%` : "N/A"}
+            </Text>
+          </Card>
+        </Group>
+      </Flex>
+
+      <ScrollArea mih={300} h={400} mah={400}>
+        <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+          {cards}
+        </SimpleGrid>
+      </ScrollArea>
+
+      <Flex justify="center" gap={12} mt="xs">
+        <Button
+          onClick={() => {
+            if (!report) return;
+            onFinish(report);
+          }}
+        >
+          Finish with selected evaluators
+        </Button>
+      </Flex>
+    </Stack>
+  );
+};
+
+export default ReportCardStep;
diff --git a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
new file mode 100644
index 000000000..fe50ae4ef
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
@@ -0,0 +1,85 @@
+import React from "react";
+import { Anchor, Button, List, Stack, Text, Title } from "@mantine/core";
+
+interface WelcomeStepProps {
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>; // received but never called by WelcomeStep
+}
+/** Static welcome page of the EvalGen wizard: explains the workflow, beta limitations, and credit usage. */
+const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
+  <Stack spacing="md" m="lg" p="lg" mb={120}>
+    <Title order={2}>Welcome to the EvalGen Wizard</Title>
+    <Text>
+      This wizard will guide you through creating automated evaluators for LLM
+      responses that are aligned with your preferences. You&apos;ll look at
+      data, define what you care about, apply those criteria to grade data, and
+      refine your criteria as you see more outputs. EvalGen then generates
+      automated evaluators that implement each criteria, chooses implementations
+      most aligned with your grades, and reports how aligned they are.
+    </Text>
+    <Text>
+      EvalGen is backed up by our{" "}
+      <Anchor
+        href="https://dl.acm.org/doi/abs/10.1145/3654777.3676450"
+        target="_blank"
+      >
+        empirical research at UIST 2024
+      </Anchor>
+      , and is inspired by inductive processes in UX research (heuristic
+      evaluation and grounded theory).
+    </Text>
+    <Text>Currently, EvalGen is in a public beta. It:</Text>
+    <List>
+      <List.Item>
+        Only generates <b>assertions (pass/fail tests)</b>. Numeric and
+        categorical evaluators are not included.
+      </List.Item>
+      <List.Item>
+        Asks for grades on a <b>per-criteria</b> basis on the main grading
+        screen. This is the chief difference from our paper.
+      </List.Item>
+      <List.Item>
+        Requires access to the GenAI features of ChainForge, which (currently)
+        requires an OpenAI API key. (If you&apos;d like to use other models,
+        more general access to GenAI features is coming soon.)
+      </List.Item>
+      <List.Item>
+        Should be run on the outputs of <b>already-run</b> Prompt Nodes
+        (you&apos;ve already collected some LLM responses).
+      </List.Item>
+      <List.Item>EvalGen will send off many requests during usage.</List.Item>
+    </List>
+    <Text>
+      🔔 <b>By using EvalGen, you take full responsibility for credit usage.</b>{" "}
+      Currently, EvalGen does NOT:
+    </Text>
+    <List>
+      <List.Item>
+        Work on imported spreadsheets of data (although if you are interested in
+        this, raise a Pull Request).
+      </List.Item>
+      <List.Item>
+        Generate code that uses third-party libraries. For safety, LLM-generated
+        Python code is run sandboxed in the browser with pyodide. Pyodide does
+        not have access to many libraries out-of-the-box. (If your eval criteria
+        implementation must use a third-party library, we suggest you use
+        ChainForge&apos;s genAI features on an individual code eval node,
+        outside this wizard.)
+      </List.Item>
+    </List>
+    {/* <Text>We have captured the following about your context:</Text>
+    <ul>
+      <li>…</li>
+      <li>[x] Use this info when helping me think of evaluation criteria</li>
+    </ul> */}
+    <Text>
+      After EvalGen finishes, the chosen evaluators appear in the MultiEval
+      node.
+    </Text>
+    <Text>
+      EvalGen is in beta. To improve it, provide feedback on our GitHub Issues
+      or Discussion pages, or raise a Pull Request with the changes.
+    </Text>
+  </Stack>
+);
+
+export default WelcomeStep;
diff --git a/chainforge/react-server/src/ItemsNode.tsx b/chainforge/react-server/src/ItemsNode.tsx
index 7d007e0a3..d1aad2c0c 100644
--- a/chainforge/react-server/src/ItemsNode.tsx
+++ b/chainforge/react-server/src/ItemsNode.tsx
@@ -12,7 +12,12 @@ import NodeLabel from "./NodeLabelComponent";
 import { IconForms, IconTransform } from "@tabler/icons-react";
 import { Handle, Node, Position } from "reactflow";
 import BaseNode from "./BaseNode";
-import { DebounceRef, genDebounceFunc, processCSV } from "./backend/utils";
+import {
+  DebounceRef,
+  genDebounceFunc,
+  processCSV,
+  stripWrappingQuotes,
+} from "./backend/utils";
 import { AIGenReplaceItemsPopover } from "./AiPopover";
 import { cleanEscapedBraces, escapeBraces } from "./backend/template";
 import { TextFieldsNodeProps } from "./TextFieldsNode";
@@ -22,16 +27,6 @@ const wrapInQuotesIfContainsComma = (str: string) =>
   str.includes(",") ? `"${str}"` : str;
 export const makeSafeForCSLFormat = (str: string) =>
   wrapInQuotesIfContainsComma(replaceDoubleQuotesWithSingle(str));
-const stripWrappingQuotes = (str: string) => {
-  if (
-    typeof str === "string" &&
-    str.length >= 2 &&
-    str.charAt(0) === '"' &&
-    str.charAt(str.length - 1) === '"'
-  )
-    return str.substring(1, str.length - 1);
-  else return str;
-};
 export const prepareItemsNodeData = (text: string) => ({
   text,
   fields: processCSV(text).map(stripWrappingQuotes).map(escapeBraces),
diff --git a/chainforge/react-server/src/LLMEvalNode.tsx b/chainforge/react-server/src/LLMEvalNode.tsx
index ed9c257b1..ed50fa92f 100644
--- a/chainforge/react-server/src/LLMEvalNode.tsx
+++ b/chainforge/react-server/src/LLMEvalNode.tsx
@@ -263,6 +263,7 @@ export const LLMEvaluatorComponent = forwardRef<
           apiKeys ?? {},
           progress_listener,
           cancelId,
+          undefined,
           useReasoning,
         );
       })
diff --git a/chainforge/react-server/src/LLMResponseInspector.tsx b/chainforge/react-server/src/LLMResponseInspector.tsx
index 34de9cc77..2ca763f84 100644
--- a/chainforge/react-server/src/LLMResponseInspector.tsx
+++ b/chainforge/react-server/src/LLMResponseInspector.tsx
@@ -57,6 +57,7 @@ import {
   blobToBase64,
 } from "./backend/utils";
 import {
+  EvalResultDisplay,
   MediaBox,
   ResponseBox,
   ResponseGroup,
@@ -66,6 +67,7 @@ import {
 import { getLabelForResponse } from "./ResponseRatingToolbar";
 import {
   Dict,
+  EvaluationScore,
   LLMResponse,
   LLMResponseData,
   TemplateVarInfo,
@@ -799,24 +801,46 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               const val = resp_objs[0].metavars[v];
               return val !== undefined ? val : "(unspecified)";
             });
-            let eval_cols_vals: [string | JSX.Element, string][][] = [];
+            let eval_cols_vals: [
+              string | JSX.Element,
+              string,
+              string,
+              EvaluationScore | undefined,
+            ][][] = [];
             if (eval_res_cols && eval_res_cols.length > 0) {
               // We can assume that there's only one response object, since to
               // if eval_res_cols is set, there must be only one LLM.
               eval_cols_vals = eval_res_cols.map((metric_name, metric_idx) => {
                 const items = resp_objs[0].eval_res?.items;
-                if (!items) return [["(no result)", "(no result)"]];
+                const uid = resp_objs[0].uid;
+                if (!items)
+                  return [["(no result)", "(no result)", uid, undefined]];
                 return items.map((item) => {
-                  if (item === undefined) return ["(undefined)", "(undefined)"];
+                  if (item === undefined)
+                    return ["(undefined)", "(undefined)", uid, item];
                   if (
                     typeof item !== "object" &&
                     metric_idx === 0 &&
                     metric_name === "Score"
                   )
-                    return getEvalResultStr(item, true);
+                    return [...getEvalResultStr(item, true), uid, item] as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore,
+                    ];
                   else if (typeof item === "object" && metric_name in item)
-                    return getEvalResultStr(item[metric_name], true);
-                  else return ["(unspecified)", "(unspecified)"];
+                    return [
+                      ...getEvalResultStr(item[metric_name], true),
+                      uid,
+                      item[metric_name],
+                    ] as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore,
+                    ];
+                  else return ["(unspecified)", "(unspecified)", uid, item];
                 }); // treat n>1 resps per prompt as multi-line results in the column
               });
             }
@@ -866,7 +890,15 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               | undefined
               | LLMResponse[]
               | LLMResponseData[]
-              | { type: "eval"; data: (string | JSX.Element)[][] }
+              | {
+                  type: "eval";
+                  data: (
+                    | string
+                    | JSX.Element
+                    | EvaluationScore
+                    | undefined
+                  )[][];
+                }
             > = {};
             let vals_arr_start_idx = 0;
             var_cols_vals.forEach((v, i) => {
@@ -977,11 +1009,22 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
             } else if ("type" in val && val.type === "eval") {
               return (
                 <Stack spacing={0}>
-                  {(val.data as [string | JSX.Element, string][]).map(
-                    (e, i) => (
-                      <div key={i}>{e[0]}</div>
-                    ),
-                  )}
+                  {(
+                    val.data as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore | undefined,
+                    ][]
+                  ).map((e, i) => (
+                    <EvalResultDisplay
+                      uid={e[2]}
+                      evalRes={e[3]}
+                      evalResIdx={i}
+                      evalResultDivOrStr={e[0]}
+                      key={i}
+                    />
+                  ))}
                 </Stack>
               );
             } else
@@ -1086,7 +1129,10 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               <div
                 key={"l" + leaf_id}
                 className={className}
-                style={{ backgroundColor: rgroup_color(eatenvars.length) }}
+                style={{
+                  backgroundColor: rgroup_color(eatenvars.length),
+                  position: "relative",
+                }}
               >
                 <ResponseGroup
                   header={header}
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index f88009ed2..c19343ac2 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -20,6 +20,7 @@ import {
   Button,
   Alert,
   Tooltip,
+  Flex,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import {
@@ -31,6 +32,7 @@ import {
   IconPlus,
   IconRobot,
   IconSearch,
+  IconSparkles,
   IconTerminal,
   IconTrash,
 } from "@tabler/icons-react";
@@ -57,6 +59,8 @@ import { GatheringResponsesRingProgress } from "./LLMItemButtonGroup";
 import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
+import { EvalFunctionSetReport, EvalGenReport } from "./backend/evalgen/typing";
+import EvalGenWizard from "./EvalGen/EvalGenWizard";
 import StorageCache from "./backend/cache";
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
@@ -345,6 +349,8 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   const [lastRunSuccess, setLastRunSuccess] = useState(true);
   const [showDrawer, setShowDrawer] = useState(false);
 
+  const [pulledInputs, setPulledInputs] = useState<LLMResponse[]>([]);
+
   // Debounce helpers
   const debounceTimeoutRef = useRef(null);
   const debounce = genDebounceFunc(debounceTimeoutRef);
@@ -359,9 +365,19 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
 
   // Add an evaluator to the end of the list
   const addEvaluator = useCallback(
-    (name: string, type: EvaluatorContainerDesc["type"], state: Dict) => {
+    (
+      name: string,
+      type: EvaluatorContainerDesc["type"],
+      state: Dict,
+      initiallyOpen = true,
+      uid?: string,
+    ) => {
       setEvaluators(
-        evaluators.concat({ name, uid: uuid(), type, state, justAdded: true }),
+        // evaluators.concat({ name, uid: uuid(), type, state, justAdded: true }),
+        (e) => [
+          ...e,
+          { name, uid: uid ?? uuid(), type, state, justAdded: initiallyOpen },
+        ],
       );
     },
     [evaluators],
@@ -411,89 +427,80 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     );
   };
 
-  // const evaluatorComponents = useMemo(() => {
-  //   // evaluatorComponentRefs.current = [];
-
-  //   return evaluators.map((e, idx) => {
-  //     let component: React.ReactNode;
-  //     if (e.type === "python" || e.type === "javascript") {
-  //       component = (
-  //         <CodeEvaluatorComponent
-  //           ref={(el) =>
-  //             (evaluatorComponentRefs.current[idx] = {
-  //               type: "code",
-  //               name: e.name,
-  //               ref: el,
-  //             })
-  //           }
-  //           code={e.state?.code}
-  //           progLang={e.type}
-  //           type="evaluator"
-  //           id={id}
-  //           onCodeEdit={(code) =>
-  //             updateEvalState(idx, (e) => (e.state.code = code))
-  //           }
-  //           showUserInstruction={false}
-  //         />
-  //       );
-  //     } else if (e.type === "llm") {
-  //       component = (
-  //         <LLMEvaluatorComponent
-  //           ref={(el) =>
-  //             (evaluatorComponentRefs.current[idx] = {
-  //               type: "llm",
-  //               name: e.name,
-  //               ref: el,
-  //             })
-  //           }
-  //           prompt={e.state?.prompt}
-  //           grader={e.state?.grader}
-  //           format={e.state?.format}
-  //           id={id}
-  //           showUserInstruction={false}
-  //           onPromptEdit={(prompt) =>
-  //             updateEvalState(idx, (e) => (e.state.prompt = prompt))
-  //           }
-  //           onLLMGraderChange={(grader) =>
-  //             updateEvalState(idx, (e) => (e.state.grader = grader))
-  //           }
-  //           onFormatChange={(format) =>
-  //             updateEvalState(idx, (e) => (e.state.format = format))
-  //           }
-  //         />
-  //       );
-  //     } else {
-  //       console.error(
-  //         `Unknown evaluator type ${e.type} inside multi-evaluator node. Cannot display evaluator UI.`,
-  //       );
-  //       component = <Alert>Error: Unknown evaluator type {e.type}</Alert>;
-  //     }
-  //     return (
-  //       <EvaluatorContainer
-  //         name={e.name}
-  //         key={`${e.name}-${idx}`}
-  //         type={EVAL_TYPE_PRETTY_NAME[e.type]}
-  //         progress={e.progress}
-  //         onDelete={() => {
-  //           delete evaluatorComponentRefs.current[idx];
-  //           setEvaluators(evaluators.filter((_, i) => i !== idx));
-  //         }}
-  //         onChangeTitle={(newTitle) =>
-  //           setEvaluators(
-  //             evaluators.map((e, i) => {
-  //               if (i === idx) e.name = newTitle;
-  //               console.log(e);
-  //               return e;
-  //             }),
-  //           )
-  //         }
-  //         padding={e.type === "llm" ? "8px" : undefined}
-  //       >
-  //         {component}
-  //       </EvaluatorContainer>
-  //     );
-  //   });
-  // }, [evaluators, id]);
+  // const evalGenModalRef = useRef<EvalGenModalRef>(null);
+  // const openEvalGen = () => {
+  //   const resps = handlePullInputs();
+  //   evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
+  // };
+
+  const onFinalReportsReady = useCallback(
+    (report: EvalFunctionSetReport) => {
+      // Turn every selected eval function in the final report into an evaluator via addEvaluator
+      // (each is added with initiallyOpen=false and reuses the criteria's uid as the evaluator uid)
+      for (const selectedFunc of report.selectedEvalFunctions) {
+        const crit = selectedFunc.evalCriteria;
+
+        // Look up the report for this exact function object (compared by reference) in the criteria -> reports map
+        const evalFuncReports = report.allEvalFunctionReports.get(crit);
+        const evalFuncReport = evalFuncReports?.find(
+          (rep) => rep.evalFunction === selectedFunc,
+        );
+
+        if (!evalFuncReport) {
+          console.error(
+            "EvalGen: That's strange. No report found for selected function. Skipping...",
+            selectedFunc,
+          );
+          continue;
+        }
+
+        // The generated implementation: used as `code` for code evaluators and as `prompt` for LLM evaluators
+        const code = evalFuncReport?.evalFunction.code;
+        // Get the functions that were not selected for this criteria
+        // const otherFuncs = evalFuncReports?.filter(
+        //   (rep) => rep.evalFunction !== selectedFunc,
+        // );
+
+        if (crit.eval_method === "code") {
+          // eval_method "code" -> Python evaluator with sandbox: true
+          addEvaluator(
+            crit.shortname,
+            "python",
+            {
+              code: code.trim(),
+              sandbox: true,
+            },
+            false,
+            crit.uid,
+          );
+        } else if (crit.eval_method === "expert") {
+          // eval_method "expert" -> LLM evaluator (prompt = generated code, format "bin")
+          addEvaluator(
+            crit.shortname,
+            "llm",
+            {
+              prompt: code,
+              format: "bin",
+            },
+            false,
+            crit.uid,
+          );
+        } else {
+          // Any other eval_method falls back to a JavaScript evaluator
+          addEvaluator(
+            crit.shortname,
+            "javascript",
+            {
+              code: code.trim(),
+            },
+            false,
+            crit.uid,
+          );
+        }
+      }
+    },
+    [addEvaluator],
+  );
 
   const handleError = useCallback(
     (err: Error | string) => {
@@ -699,15 +706,13 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
           }
         });
       });
+
       const finalResponses = Object.values(merged_res_objs_by_uid);
-      console.log("Output length:", finalResponses.length);
-      console.log("MultiEval Output:", finalResponses[0]?.eval_res?.items[0]);
       // We now have a dict of the form { uid: LLMResponse }
       // We need return only the values of this dict:
       setLastResponses(finalResponses);
       setLastRunSuccess(true);
       setDataPropsForNode(id, { output: finalResponses });
-      console.log("Setting output");
       StorageCache.store(`${id}.json`, finalResponses);
       pingOutputNodes(id);
       setStatus(Status.READY);
@@ -736,6 +741,21 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     }
   }, [data]);
 
+  // EvalGen Wizard: open/closed state, an opener that snapshots this node's inputs, and the completion handler
+  const [evalGenOpened, setEvalGenOpened] = useState(false);
+  const openEvalGen = useCallback(() => {
+    setPulledInputs(handlePullInputs()); // responses handed to the wizard are pulled at open time
+    setEvalGenOpened(true);
+  }, [handlePullInputs]); // depend on handlePullInputs so a stale closure is never called
+  const handleEvalGenComplete = useCallback(
+    (evaluationData: EvalFunctionSetReport) => {
+      // Convert the chosen eval functions into evaluators, then close the wizard.
+      onFinalReportsReady(evaluationData);
+      setEvalGenOpened(false);
+    },
+    [onFinalReportsReady],
+  );
+
   return (
     <BaseNode classNames="evaluator-node multi-eval-node" nodeId={id}>
       <NodeLabel
@@ -751,7 +771,15 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         ref={inspectModal}
         jsonResponses={lastResponses}
       />
-      {/* <PickCriteriaModal ref={pickCriteriaModalRef} /> */}
+
+      <EvalGenWizard
+        opened={evalGenOpened}
+        onClose={() => setEvalGenOpened(false)}
+        onComplete={handleEvalGenComplete}
+        responses={pulledInputs}
+      />
+      {/* <EvalGenModal ref={evalGenModalRef} /> */}
+
       <iframe style={{ display: "none" }} id={`${id}-iframe`}></iframe>
 
       {/* {evaluatorComponents} */}
@@ -928,19 +956,15 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             >
               LLM
             </Menu.Item>
-            {/* {AI_SUPPORT_ENABLED ? <Menu.Divider /> : <></>} */}
-            {/* {AI_SUPPORT_ENABLED ? (
-              <Menu.Item
-                icon={<IconSparkles size="14px" />}
-                onClick={onClickPickCriteria}
-              >
-                Let an AI decide!
-              </Menu.Item>
-            ) : (
-              <></>
-            )} */}
             <Menu.Divider />
-            {EVALUATOR_PRESETS.map((category, idx) => (
+            <Menu.Item
+              icon={<IconSparkles size="11pt" />}
+              onClick={openEvalGen}
+            >
+              Generate with EvalGen
+            </Menu.Item>
+            {/* <Menu.Divider /> */}
+            {/* {EVALUATOR_PRESETS.map((category, idx) => (
               <React.Fragment key={category.label}>
                 {idx > 0 && <Menu.Divider />}
                 <Menu.Label>{category.label}</Menu.Label>
@@ -967,12 +991,12 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
                   </Menu.Item>
                 ))}
               </React.Fragment>
-            ))}
+            ))} */}
           </Menu.Dropdown>
         </Menu>
       </div>
 
-      {/* EvalGen {evaluators && evaluators.length === 0 ? (
+      {evaluators && evaluators.length === 0 ? (
         <Flex justify="center" gap={12} mt="md">
           <Tooltip
             label="Let an AI help you generate criteria and implement evaluation functions."
@@ -980,16 +1004,20 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             position="bottom"
             withArrow
           >
-            <Button onClick={onClickPickCriteria} variant="outline" size="xs">
+            <Button
+              onClick={openEvalGen}
+              variant="filled"
+              color="violet"
+              size="xs"
+            >
               <IconSparkles size="11pt" />
-              &nbsp;Generate criteria
+              &nbsp;Generate evals with EvalGen
             </Button>
-          </Tooltip> */}
-      {/* <Button disabled variant='gradient' gradient={{ from: 'teal', to: 'lime', deg: 105 }}><IconSparkles />&nbsp;Validate</Button> */}
-      {/* </Flex>
+          </Tooltip>
+        </Flex>
       ) : (
         <></>
-      )} */}
+      )}
 
       {lastRunSuccess && lastResponses && lastResponses.length > 0 ? (
         <InspectFooter
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index c621613c7..c5781c4bf 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -1189,7 +1189,10 @@ Soft failing by replacing undefined with empty strings.`,
                   o.metavars = resp_obj.metavars ?? {};
 
                   // Add a metavar for the prompt *template* in this PromptNode
-                  // o.metavars.__pt = prompt_template;
+                  o.metavars.__pt =
+                    typeof prompt_template === "string"
+                      ? prompt_template
+                      : prompt_template[0];
 
                   // Carry over any chat history
                   if (resp_obj.chat_history)
diff --git a/chainforge/react-server/src/ResponseBoxes.tsx b/chainforge/react-server/src/ResponseBoxes.tsx
index 3e765a609..6549993c5 100644
--- a/chainforge/react-server/src/ResponseBoxes.tsx
+++ b/chainforge/react-server/src/ResponseBoxes.tsx
@@ -1,8 +1,16 @@
-import React, { Suspense, useMemo, lazy, useEffect } from "react";
-import { Collapse, Flex, Stack } from "@mantine/core";
+import React, {
+  Suspense,
+  useMemo,
+  lazy,
+  useState,
+  useCallback,
+  useEffect,
+} from "react";
+import { ActionIcon, Collapse, Flex, Stack, Tooltip } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import {
   blobOrFileToDataURL,
+  deepcopy,
   llmResponseDataToString,
   truncStr,
 } from "./backend/utils";
@@ -12,7 +20,10 @@ import {
   LLMResponse,
   LLMResponseData,
 } from "./backend/typing";
-import { MediaLookup } from "./backend/cache";
+import StorageCache, { MediaLookup } from "./backend/cache";
+import { IconCheck, IconChecks, IconX } from "@tabler/icons-react";
+import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
+import useStore from "./store";
 
 // Lazy load the response toolbars
 const ResponseRatingToolbar = lazy(() => import("./ResponseRatingToolbar"));
@@ -62,7 +73,7 @@ export const getEvalResultStr = (
       return [
         <Stack key={1} spacing={0}>
           {strs.map((s, i) => (
-            <span key={i}>s</span>
+            <div key={i}>{s[0]}</div>
           ))}
         </Stack>,
         joined_strs,
@@ -86,6 +97,134 @@ export const getEvalResultStr = (
   }
 };
 
+interface EvalResultAssessment {
+  correct: boolean | null; // true = user verified the score, false = marked incorrect, null = no assessment
+  // The original eval score that the user gave feedback on.
+  // If the underlying score changes, i.e. on subsequent runs after changing the evaluator,
+  // we need to be able to invalidate the user's assessment (or flip it automatically, in the case of boolean values).
+  orig_score?: EvaluationScore;
+  feedback?: string | null; // optional feedback string stored alongside the assessment
+}
+/** Shows one eval score with check/X controls to mark it correct or incorrect; the assessment is kept in the global store and StorageCache. */
+export const EvalResultDisplay = ({
+  uid, // the response uid
+  evalResIdx, // the index of the eval result in the array
+  evalRes, // the score of the eval result
+  evalResultDivOrStr,
+}: {
+  uid: string;
+  evalResIdx: number;
+  evalRes?: EvaluationScore;
+  evalResultDivOrStr: JSX.Element | string;
+}) => {
+  // Key (in the global store and StorageCache) holding the user's assessment of this one eval result
+  const evalResultAssessmentKey = useMemo(
+    () => getRatingKeyForResponse(uid, "metaeval") + `.${evalResIdx}`,
+    [uid, evalResIdx],
+  );
+
+  // The current rating states, reading from the global store.
+  // :: This ensures refreshes will occur only on this component, only when the rating
+  // :: for this component changes.
+  // const state = useStore((store) => store.state);
+  const setState = useStore((store) => store.setState);
+  const userRating = useStore<EvalResultAssessment | undefined>(
+    (store) => store.state[evalResultAssessmentKey],
+  );
+  const setRating = useCallback(
+    (correct: boolean | null, feedback?: string | null) => {
+      const safe_payload = deepcopy({
+        correct,
+        orig_score: evalRes,
+        feedback,
+      } as EvalResultAssessment);
+      setState(evalResultAssessmentKey, safe_payload);
+      StorageCache.store(evalResultAssessmentKey, safe_payload);
+    },
+    [evalResultAssessmentKey, setState, evalRes],
+  );
+
+  // The internal user assessment of this eval result
+  const rating = useMemo(() => userRating?.correct, [userRating]);
+
+  // Upon load, detect if the eval result has changed, if the user had previously assessed it.
+  // If so, either a) invalidate the user's rating or b) if it's a boolean, flip it.
+  useEffect(() => {
+    // If the original eval score wasn't saved, or the user has no rating, continue
+    if (userRating?.orig_score == null || userRating.correct == null) return;
+    const orig_eval_score = userRating.orig_score;
+    if (JSON.stringify(orig_eval_score) !== JSON.stringify(evalRes)) {
+      // Score changed since the user rated it. Compared by value: orig_score is a deep copy, so !== is always true for object scores.
+      if (
+        typeof evalRes === "boolean" &&
+        typeof orig_eval_score === "boolean"
+      ) {
+        // If the eval type was boolean, we can safely flip the user's rating:
+        setRating(!userRating.correct, userRating?.feedback);
+      } else {
+        // We don't know what to do if the score fundamentally changes type or is categorical.
+        // Simply invalidate the user's assessment:
+        setRating(null, null);
+      }
+    }
+  }, [userRating, evalRes, setRating]);
+
+  return (
+    <div className="eval-score">
+      {evalResultDivOrStr}
+      {rating == null && (
+        <Flex className="eval-vote-icons">
+          <ActionIcon variant="transparent" onClick={() => setRating(true)}>
+            <IconCheck className="eval-vote-icon" size={20} />
+          </ActionIcon>
+          <ActionIcon variant="transparent" onClick={() => setRating(false)}>
+            <IconX className="eval-vote-icon" size={20} />
+          </ActionIcon>
+        </Flex>
+      )}
+      {rating != null && (
+        <Flex className="eval-vote-chosen">
+          {rating === true && (
+            <Tooltip
+              label="Human-verified eval score"
+              withArrow
+              arrowSize={8}
+              withinPortal
+            >
+              <ActionIcon variant="transparent" onClick={() => setRating(null)}>
+                <IconChecks
+                  color="#666"
+                  stroke={2}
+                  className="eval-vote-icon"
+                  size={20}
+                />
+              </ActionIcon>
+            </Tooltip>
+          )}
+          {rating === false && (
+            <Tooltip
+              label="Human marked this eval score as incorrect"
+              multiline
+              withArrow
+              arrowSize={8}
+              withinPortal
+            >
+              <ActionIcon variant="transparent" onClick={() => setRating(null)}>
+                <IconX
+                  color="red"
+                  stroke={4}
+                  className="eval-vote-icon"
+                  size={20}
+                />
+              </ActionIcon>
+            </Tooltip>
+          )}
+        </Flex>
+      )}
+    </div>
+  );
+};
+
 const countResponsesBy = (
   responses: LLMResponseData[],
   keyFunc: (item: LLMResponseData) => string,
@@ -226,10 +365,13 @@ export const genResponseTextsDisplay = (
 
   // Collapse responses with the same texts.
   // We need to keep track of the original evaluation result per response str:
-  const resp_str_to_eval_res: Dict<EvaluationScore> = {};
+  const resp_str_to_eval_res: Dict<[EvaluationScore, number]> = {};
   if (eval_res_items)
     responses.forEach((r, idx) => {
-      resp_str_to_eval_res[llmResponseDataToString(r)] = eval_res_items[idx];
+      resp_str_to_eval_res[llmResponseDataToString(r)] = [
+        eval_res_items[idx],
+        idx,
+      ];
     });
 
   const same_resp_text_counts = countResponsesBy(responses, (r) =>
@@ -291,7 +433,14 @@ export const genResponseTextsDisplay = (
         )}
         {eval_res_items ? (
           <p className="small-response-metrics">
-            {getEvalResultStr(resp_str_to_eval_res[r], true)[0]}
+            <EvalResultDisplay
+              uid={res_obj.uid}
+              evalRes={resp_str_to_eval_res[r][0]}
+              evalResIdx={resp_str_to_eval_res[r][1]}
+              evalResultDivOrStr={
+                getEvalResultStr(resp_str_to_eval_res[r][0], true)[0]
+              }
+            />
           </p>
         ) : (
           <></>
diff --git a/chainforge/react-server/src/ResponseRatingToolbar.tsx b/chainforge/react-server/src/ResponseRatingToolbar.tsx
index a5998dba6..21e2636e8 100644
--- a/chainforge/react-server/src/ResponseRatingToolbar.tsx
+++ b/chainforge/react-server/src/ResponseRatingToolbar.tsx
@@ -23,11 +23,11 @@ import {
 import StorageCache from "./backend/cache";
 import useStore from "./store";
 import { deepcopy } from "./backend/utils";
+import { RatingDict } from "./backend/typing";
 
-type RatingDict = Record<number, boolean | string | undefined>;
-
-const getRatingKeyForResponse = (uid: string, label_name: string) =>
+export const getRatingKeyForResponse = (uid: string, label_name: string) =>
   `r.${uid}.${label_name}`;
+
 const collapse_ratings = (rating_dict: RatingDict, idxs: number[]) => {
   if (rating_dict === undefined) return undefined;
   for (let j = 0; j < idxs.length; j++) {
@@ -37,9 +37,14 @@ const collapse_ratings = (rating_dict: RatingDict, idxs: number[]) => {
   return undefined;
 };
 
+/** Recovers the uid from a rating key shaped like `r.${uid}.${label_name}`. */
+export const extractUIDFromRatingKey = (key: string) =>
+  key.substring(2, key.lastIndexOf("."));
+
 export const getLabelForResponse = (uid: string, label_name: string) => {
   return StorageCache.get(getRatingKeyForResponse(uid, label_name));
 };
+
 export const setLabelForResponse = (
   uid: string,
   label_name: string,
@@ -139,7 +144,7 @@ const ResponseRatingToolbar: React.FC<ResponseRatingToolbarProps> = ({
 
   // Override the text in the internal textarea whenever upstream annotation changes.
   useEffect(() => {
-    setNoteText(note !== undefined ? note.toString() : "");
+    setNoteText(note != null ? note.toString() : "");
   }, [note]);
 
   // The label for the pop-up comment box.
diff --git a/chainforge/react-server/src/VisNode.tsx b/chainforge/react-server/src/VisNode.tsx
index 4990dc102..6542f0098 100644
--- a/chainforge/react-server/src/VisNode.tsx
+++ b/chainforge/react-server/src/VisNode.tsx
@@ -127,6 +127,23 @@ const castEvalScoreToNum = (score: EvaluationScore): number => {
   else return 0; // unknown, soft fail
 };
 
+/**
+ * Collects the names of all metrics found in the eval results of `resps`.
+ * Dict-valued items contribute their keys; any other item counts as 'score'.
+ */
+const findEvalResKeys = (resps: LLMResponse[]): Set<string> => {
+  const eval_res_keys = new Set<string>();
+  for (const resp_obj of resps) {
+    for (const item of resp_obj.eval_res?.items ?? []) {
+      // `typeof null` is also "object": exclude it so Object.keys can't throw.
+      if (typeof item === "object" && item !== null)
+        Object.keys(item).forEach((k) => eval_res_keys.add(k));
+      else eval_res_keys.add("score");
+    }
+  }
+  return eval_res_keys;
+};
+
 /**
  *  UTIL FUNCTIONS FOR VIS PLOTS
  */
@@ -250,6 +267,8 @@ interface VisNodeData {
   selected_vars: string[] | string;
   llm_groups?: { value: string; label: string }[];
   selected_llm_group?: string;
+  eval_res_vars?: string[];
+  selected_eval_res_var?: string;
   input: string;
   refresh: boolean;
   title: string;
@@ -324,6 +343,24 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
         : "LLM (default)",
     );
 
+    // The x-axis options: the keys of the eval results (when results are dictionaries)
+    const [evalResVars, setEvalResVars] = useState<string[]>(
+      data?.eval_res_vars ?? ["score"],
+    );
+    const [selectedEvalResVar, setSelectedEvalResVar] = useState(
+      data?.selected_eval_res_var ?? "score",
+    );
+    const handleChangeSelectedEvalResVar = useCallback(
+      (new_val: React.ChangeEvent<HTMLSelectElement>) => {
+        setSelectedEvalResVar(new_val.target.value);
+        if (id)
+          setDataPropsForNode(id, {
+            selected_eval_res_var: new_val.target.value,
+          });
+      },
+      [id, setDataPropsForNode],
+    );
+
     // Typically, a user will only need the default LLM 'group' --all LLMs in responses.
     // However, when prompts are chained together, the original LLM info is stored in metavars as a key.
     // LLM groups allow you to plot against the original LLMs, even though a 'scorer' LLM might come after.
@@ -333,13 +370,14 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
     const [selectedLLMGroup, setSelectedLLMGroup] = useState(
       data?.selected_llm_group ?? "LLM",
     );
-    const handleChangeLLMGroup = (
-      new_val: React.ChangeEvent<HTMLSelectElement>,
-    ) => {
-      setSelectedLLMGroup(new_val.target.value);
-      if (id)
-        setDataPropsForNode(id, { selected_llm_group: new_val.target.value });
-    };
+    const handleChangeLLMGroup = useCallback(
+      (new_val: React.ChangeEvent<HTMLSelectElement>) => {
+        setSelectedLLMGroup(new_val.target.value);
+        if (id)
+          setDataPropsForNode(id, { selected_llm_group: new_val.target.value });
+      },
+      [id, setDataPropsForNode],
+    );
 
     // When the user clicks an item in the drop-down,
     // we want to autoclose the multiselect drop-down:
@@ -375,6 +413,15 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
       varnames = Array.from(varnames);
       metavars = Array.from(metavars);
 
+      // Find all keys in eval results, defaulting to 'score' if none are found
+      const eval_res_keys = findEvalResKeys(resps);
+      if (eval_res_keys.size === 0) eval_res_keys.add("score");
+      if (!eval_res_keys.has(selectedEvalResVar)) {
+        // The selected key is missing from the new results (or is still the
+        // initial 'score' placeholder): fall back to the first available key.
+        setSelectedEvalResVar(eval_res_keys.values().next().value as string);
+      }
+
       // Get all vars for the y-axis dropdown, merging metavars and vars into one list,
       // and excluding any special 'LLM group' metavars:
       const msvars = [{ value: "LLM (default)", label: "LLM (default)" }]
@@ -401,18 +448,22 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
       if (
         !multiSelectVars ||
         !multiSelectValue ||
+        !evalResVars ||
         !areSetsEqual(
           new Set(msvars.map((o) => o.value)),
           new Set(multiSelectVars.map((o) => o.value)),
-        )
+        ) ||
+        !areSetsEqual(new Set(evalResVars), eval_res_keys)
       ) {
         setMultiSelectValue("LLM (default)");
         setMultiSelectVars(msvars);
+        setEvalResVars(Array.from(eval_res_keys));
         if (id)
           setDataPropsForNode(id, {
             vars: msvars,
             selected_vars: [],
             llm_groups: available_llm_groups,
+            eval_res_vars: Array.from(eval_res_keys),
           });
       }
     };
@@ -520,6 +571,21 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             ? responses[0].eval_res.dtype
             : "Numeric";
 
+        // Type the selected metric by the first dict item that defines it:
+        // null items and items lacking the selected key are skipped over.
+        let sel_typeof_eval_res = typeof_eval_res;
+        if (typeof_eval_res.includes("KeyValue")) {
+          const val = responses
+            .flatMap((r) => r.eval_res?.items ?? [])
+            .map((i) =>
+              typeof i === "object" ? i?.[selectedEvalResVar] : null,
+            )
+            .find((v) => v != null);
+          if (typeof val === "boolean") sel_typeof_eval_res = "Boolean";
+          else if (typeof val === "number") sel_typeof_eval_res = "Numeric";
+          else if (typeof val === "string") sel_typeof_eval_res = "Categorical";
+        }
+
         // If categorical type, check if all binary:
         if (typeof_eval_res === "Categorical") {
           const is_all_bools = responses.reduce(
@@ -535,6 +601,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           );
           if (is_all_bools) {
             typeof_eval_res = "Boolean";
+            sel_typeof_eval_res = "Boolean";
             setDisableGraphTypeOption(true);
           }
         } else {
@@ -552,14 +619,14 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             max_num_results_per_prompt = res_obj.eval_res.items.length;
         });
 
-        let plot_legend: React.ReactNode | null = null;
+        const plot_legend: React.ReactNode | null = null;
         let metric_axes_labels: string[] = [];
         let num_metrics = 1;
         if (
           typeof_eval_res.includes("KeyValue") &&
-          responses[0].eval_res !== undefined
+          responses.some((r) => r.eval_res !== undefined)
         ) {
-          metric_axes_labels = Object.keys(responses[0].eval_res.items[0]);
+          metric_axes_labels = Array.from(findEvalResKeys(responses));
           num_metrics = metric_axes_labels.length;
         }
 
@@ -590,9 +657,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           if (typeof_eval_res.includes("KeyValue"))
             return eval_res_obj.items.map(
               (item) =>
-                (item as Dict<boolean | number | string>)[
-                  metric_axes_labels[0]
-                ],
+                (item as Dict<boolean | number | string>)[selectedEvalResVar],
             );
           return eval_res_obj.items;
         };
@@ -680,7 +745,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
           else
@@ -696,7 +761,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
         ) => {
           let names = new Set<string>();
           const plotting_categorical_vars =
-            group_type === "var" && typeof_eval_res === "Categorical";
+            group_type === "var" && sel_typeof_eval_res === "Categorical";
 
           // When we're plotting vars, we want the stacked bar colors to be the *categories*,
           // and the x_items to be the names of vars, so that the left axis is a vertical list of varnames.
@@ -744,8 +809,8 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                   getColorForLLMAndSetIfNotFound(get_llm(responses[0]));
 
             if (
-              typeof_eval_res === "Boolean" ||
-              typeof_eval_res === "Categorical"
+              sel_typeof_eval_res === "Boolean" ||
+              sel_typeof_eval_res === "Categorical"
             ) {
               // Plot a histogram for categorical or boolean data.
               spec.push({
@@ -792,8 +857,12 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                   d.histfunc = "sum";
                   d.y = new Array(x_items.length).fill(shortnames[name]);
                   d.textposition = "none"; // hide the text which appears within each bar
+                  const xaxis_title =
+                    metric_axes_labels.length > 0
+                      ? "Sum of '" + selectedEvalResVar + "'"
+                      : "Sum of scores";
                   layout.xaxis = {
-                    title: { font: { size: 12 }, text: "Sum of scores" },
+                    title: { font: { size: 12 }, text: xaxis_title },
                     ...layout.xaxis,
                   };
 
@@ -829,7 +898,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
         };
@@ -863,7 +932,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
               });
             }
 
-            if (typeof_eval_res === "Boolean") {
+            if (sel_typeof_eval_res === "Boolean") {
               // Plot a histogram for boolean (true/false) categorical data.
               spec.push({
                 type: "histogram",
@@ -899,9 +968,12 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
               if (graphType.key === "bar") {
                 d.type = "bar";
                 d.textposition = "none"; // hide the text which appears within each bar
-                xaxis_title = "Sum of scores";
+                xaxis_title =
+                  metric_axes_labels.length > 0
+                    ? "Sum of '" + selectedEvalResVar + "'"
+                    : "Sum of scores";
 
-                if (typeof_eval_res === "Numeric") {
+                if (sel_typeof_eval_res === "Numeric") {
                   // To make error bars work, we need to sum the numbers, instead of relying
                   // upon the stacked bar chart:
                   let sum_x_items: number[] = [];
@@ -945,10 +1017,6 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           });
           layout.boxmode = "group";
           layout.bargap = 0.5;
-          // layout.yaxis = {
-          //   tickfont: { size: 10 },
-          //   ...layout.yaxis,
-          // };
 
           // Set the left margin to fit the yticks labels
           layout.margin.l = calcLeftPaddingForYLabels(
@@ -957,177 +1025,205 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
         };
 
-        if (num_metrics > 1) {
-          // For 2 or more metrics, display a parallel coordinates plot.
-          // :: For instance, if evaluator produces { height: 32, weight: 120 } plot responses with 2 metrics, 'height' and 'weight'
-          if (varnames.length === 1) {
-            const unique_vals = getUniqueKeysInResponses(
-              responses,
-              (resp_obj) => get_var(resp_obj, varnames[0]),
-            );
-            // const response_txts = responses.map(res_obj => res_obj.responses).flat();
-
-            const group_colors = varcolors;
-            const unselected_line_color = "#ddd";
-            const spec_colors = responses
-              .map((resp_obj) => {
-                const idx = unique_vals.indexOf(get_var(resp_obj, varnames[0]));
-                return resp_obj.eval_res
-                  ? Array(resp_obj.eval_res.items.length).fill(idx)
-                  : [];
-              })
-              .flat();
-
-            const colorscale: [number, string][] = [];
-            for (let i = 0; i < unique_vals.length; i++) {
-              if (
-                !selectedLegendItems ||
-                selectedLegendItems.indexOf(unique_vals[i]) > -1
-              )
-                colorscale.push([
-                  i / (unique_vals.length - 1),
-                  group_colors[i % group_colors.length],
-                ]);
-              else
-                colorscale.push([
-                  i / (unique_vals.length - 1),
-                  unselected_line_color,
-                ]);
+        // PARALLEL COORDINATES PLOT -- Disabled for now.
+        // May be re-enabled in the future.
+        // if (num_metrics > 1) {
+        //   // For 2 or more metrics, display a parallel coordinates plot.
+        //   // :: For instance, if evaluator produces { height: 32, weight: 120 } plot responses with 2 metrics, 'height' and 'weight'
+        //   if (varnames.length === 1) {
+        //     const unique_vals = getUniqueKeysInResponses(
+        //       responses,
+        //       (resp_obj) => get_var(resp_obj, varnames[0]),
+        //     );
+        //     // const response_txts = responses.map(res_obj => res_obj.responses).flat();
+
+        //     const group_colors = varcolors;
+        //     const unselected_line_color = "#ddd";
+        //     const spec_colors = responses
+        //       .map((resp_obj) => {
+        //         const idx = unique_vals.indexOf(get_var(resp_obj, varnames[0]));
+        //         return resp_obj.eval_res
+        //           ? Array(resp_obj.eval_res.items.length).fill(idx)
+        //           : [];
+        //       })
+        //       .flat();
+
+        //     const colorscale: [number, string][] = [];
+        //     for (let i = 0; i < unique_vals.length; i++) {
+        //       if (
+        //         !selectedLegendItems ||
+        //         selectedLegendItems.indexOf(unique_vals[i]) > -1
+        //       )
+        //         colorscale.push([
+        //           i / (unique_vals.length - 1),
+        //           group_colors[i % group_colors.length],
+        //         ]);
+        //       else
+        //         colorscale.push([
+        //           i / (unique_vals.length - 1),
+        //           unselected_line_color,
+        //         ]);
+        //     }
+
+        //     const dimensions: Dict = [];
+        //     metric_axes_labels.forEach((metric) => {
+        //       const evals = extractEvalResultsForMetric(metric, responses);
+        //       dimensions.push({
+        //         range: evals.every((e) => typeof e === "number")
+        //           ? [
+        //               Math.min(...(evals as number[])),
+        //               Math.max(...(evals as number[])),
+        //             ]
+        //           : undefined,
+        //         label: metric,
+        //         values: evals,
+        //       });
+        //     });
+
+        //     spec.push({
+        //       type: "parcoords",
+        //       pad: [10, 10, 10, 10],
+        //       line: {
+        //         color: spec_colors,
+        //         colorscale,
+        //       },
+        //       dimensions,
+        //     });
+        //     layout.margin = { l: 40, r: 40, b: 40, t: 50, pad: 0 };
+        //     layout.paper_bgcolor = "white";
+        //     layout.font = { color: "black" };
+        //     layout.selectedpoints = [];
+
+        //     // There's no built-in legend for parallel coords, unfortunately, so we need to construct our own:
+        //     const legend_labels: Dict<string> = {};
+        //     unique_vals.forEach((v, idx) => {
+        //       if (!selectedLegendItems || selectedLegendItems.indexOf(v) > -1)
+        //         legend_labels[v] = group_colors[idx % group_colors.length];
+        //       else legend_labels[v] = unselected_line_color;
+        //     });
+        //     const onClickLegendItem = (label: string) => {
+        //       if (
+        //         selectedLegendItems &&
+        //         selectedLegendItems.length === 1 &&
+        //         selectedLegendItems[0] === label
+        //       )
+        //         setSelectedLegendItems(null); // Clicking twice on a legend item deselects it and displays all
+        //       else setSelectedLegendItems([label]);
+        //     };
+        //     plot_legend = (
+        //       <PlotLegend
+        //         labels={legend_labels}
+        //         onClickLabel={onClickLegendItem}
+        //       />
+        //     );
+
+        //     // Tried to support Plotly hover events here, but it looks like
+        //     // they are currently unsupported for parcoords: https://github.com/plotly/plotly.js/issues/3012
+        //     // onHover = (e) => {
+        //     //     console.log(e.curveNumber);
+        //     //     // const curveIdx = e.curveNumber;
+        //     //     // if (curveIdx < response_txts.length) {
+        //     //     //     if (!selectedLegendItems || selectedLegendItems.indexOf(unique_vals[spec_colors[curveIdx]]) > -1)
+        //     //     //         console.log(response_txts[curveIdx]);
+        //     //     // }
+        //     // };
+        //   } else {
+        //     setSelectedLegendItems(null);
+        //     const error_text =
+        //       "Plotting evaluations with more than one metric and more than one prompt parameter is currently unsupported.";
+        //     setPlaceholderText(
+        //       <p
+        //         style={{
+        //           maxWidth: "220px",
+        //           backgroundColor: "#f0aaaa",
+        //           padding: "10px",
+        //           fontSize: "10pt",
+        //         }}
+        //       >
+        //         {error_text}
+        //       </p>,
+        //     );
+        //     console.error(error_text);
+        //   }
+        // } else {
+
+        // A single metric --use plots like grouped box-and-whiskers, 3d scatterplot
+        if (varnames.length === 0) {
+          // No variables means they used a single prompt (no template) to generate responses
+          // (Users are likely evaluating differences in responses between LLMs)
+          if (sel_typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
+          else plot_simple_boxplot(get_llm, "llm");
+        } else if (varnames.length === 1) {
+          // 1 var; numeric eval
+          if (llm_names.length === 1) {
+            if (sel_typeof_eval_res === "Boolean")
+              // Accuracy plot per value of the selected variable:
+              plot_accuracy((r) => get_var_and_trim(r, varnames[0]), "var");
+            else {
+              // Simple box plot, as there is only a single LLM in the response
+              plot_simple_boxplot(
+                (r) => get_var_and_trim(r, varnames[0]),
+                "var",
+              );
             }
+          } else {
+            // There are multiple LLMs in the response; do a grouped box plot by LLM.
+            // Note that 'name' is now the LLM, and 'x' stores the value of the var:
+            plot_grouped_boxplot((r) => get_var_and_trim(r, varnames[0]));
+          }
+        } else if (varnames.length === 2) {
+          // Input is 2 vars; numeric eval
+          // Display a 3D scatterplot with 2 dimensions:
 
-            const dimensions: Dict = [];
-            metric_axes_labels.forEach((metric) => {
-              const evals = extractEvalResultsForMetric(metric, responses);
-              dimensions.push({
-                range: evals.every((e) => typeof e === "number")
-                  ? [
-                      Math.min(...(evals as number[])),
-                      Math.max(...(evals as number[])),
-                    ]
-                  : undefined,
-                label: metric,
-                values: evals,
-              });
-            });
-
-            spec.push({
-              type: "parcoords",
-              pad: [10, 10, 10, 10],
-              line: {
-                color: spec_colors,
-                colorscale,
+          const names_0 = new Set(
+            responses.map((r) => get_var_and_trim(r, varnames[0])),
+          );
+          const shortnames_0 = genUniqueShortnames(names_0);
+          const names_1 = new Set(
+            responses.map((r) => get_var_and_trim(r, varnames[1])),
+          );
+          const shortnames_1 = genUniqueShortnames(names_1);
+
+          if (llm_names.length === 1) {
+            spec = {
+              type: "scatter3d",
+              x: responses
+                .map((r) => get_var(r, varnames[0], true))
+                .map((s) => shortnames_0[s]),
+              y: responses
+                .map((r) => get_var(r, varnames[1], true))
+                .map((s) => shortnames_1[s]),
+              z: responses.map(
+                (r) =>
+                  get_items(r.eval_res).reduce(
+                    (acc: number, val) =>
+                      acc + (typeof val === "number" ? val : 0),
+                    0,
+                  ) / (r.eval_res?.items.length ?? 1),
+              ), // calculates mean
+              mode: "markers",
+              marker: {
+                color: getColorForLLMAndSetIfNotFound(llm_names[0]),
               },
-              dimensions,
-            });
-            layout.margin = { l: 40, r: 40, b: 40, t: 50, pad: 0 };
-            layout.paper_bgcolor = "white";
-            layout.font = { color: "black" };
-            layout.selectedpoints = [];
-
-            // There's no built-in legend for parallel coords, unfortunately, so we need to construct our own:
-            const legend_labels: Dict<string> = {};
-            unique_vals.forEach((v, idx) => {
-              if (!selectedLegendItems || selectedLegendItems.indexOf(v) > -1)
-                legend_labels[v] = group_colors[idx % group_colors.length];
-              else legend_labels[v] = unselected_line_color;
-            });
-            const onClickLegendItem = (label: string) => {
-              if (
-                selectedLegendItems &&
-                selectedLegendItems.length === 1 &&
-                selectedLegendItems[0] === label
-              )
-                setSelectedLegendItems(null); // Clicking twice on a legend item deselects it and displays all
-              else setSelectedLegendItems([label]);
             };
-            plot_legend = (
-              <PlotLegend
-                labels={legend_labels}
-                onClickLabel={onClickLegendItem}
-              />
-            );
-
-            // Tried to support Plotly hover events here, but looks like
-            // currently there are unsupported for parcoords: https://github.com/plotly/plotly.js/issues/3012
-            // onHover = (e) => {
-            //     console.log(e.curveNumber);
-            //     // const curveIdx = e.curveNumber;
-            //     // if (curveIdx < response_txts.length) {
-            //     //     if (!selectedLegendItems || selectedLegendItems.indexOf(unique_vals[spec_colors[curveIdx]]) > -1)
-            //     //         console.log(response_txts[curveIdx]);
-            //     // }
-            // };
           } else {
-            setSelectedLegendItems(null);
-            const error_text =
-              "Plotting evaluations with more than one metric and more than one prompt parameter is currently unsupported.";
-            setPlaceholderText(
-              <p
-                style={{
-                  maxWidth: "220px",
-                  backgroundColor: "#f0aaaa",
-                  padding: "10px",
-                  fontSize: "10pt",
-                }}
-              >
-                {error_text}
-              </p>,
-            );
-            console.error(error_text);
-          }
-        } else {
-          // A single metric --use plots like grouped box-and-whiskers, 3d scatterplot
-          if (varnames.length === 0) {
-            // No variables means they used a single prompt (no template) to generate responses
-            // (Users are likely evaluating differences in responses between LLMs)
-            if (typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
-            else plot_simple_boxplot(get_llm, "llm");
-          } else if (varnames.length === 1) {
-            // 1 var; numeric eval
-            if (llm_names.length === 1) {
-              if (typeof_eval_res === "Boolean")
-                // Accuracy plot per value of the selected variable:
-                plot_accuracy((r) => get_var_and_trim(r, varnames[0]), "var");
-              else {
-                // Simple box plot, as there is only a single LLM in the response
-                plot_simple_boxplot(
-                  (r) => get_var_and_trim(r, varnames[0]),
-                  "var",
-                );
-              }
-            } else {
-              // There are multiple LLMs in the response; do a grouped box plot by LLM.
-              // Note that 'name' is now the LLM, and 'x' stores the value of the var:
-              plot_grouped_boxplot((r) => get_var_and_trim(r, varnames[0]));
-            }
-          } else if (varnames.length === 2) {
-            // Input is 2 vars; numeric eval
-            // Display a 3D scatterplot with 2 dimensions:
-
-            const names_0 = new Set(
-              responses.map((r) => get_var_and_trim(r, varnames[0])),
-            );
-            const shortnames_0 = genUniqueShortnames(names_0);
-            const names_1 = new Set(
-              responses.map((r) => get_var_and_trim(r, varnames[1])),
-            );
-            const shortnames_1 = genUniqueShortnames(names_1);
-
-            if (llm_names.length === 1) {
-              spec = {
+            spec = [];
+            llm_names.forEach((llm) => {
+              const resps = responses.filter((r) => get_llm(r) === llm);
+              spec.push({
                 type: "scatter3d",
-                x: responses
+                x: resps
                   .map((r) => get_var(r, varnames[0], true))
                   .map((s) => shortnames_0[s]),
-                y: responses
+                y: resps
                   .map((r) => get_var(r, varnames[1], true))
                   .map((s) => shortnames_1[s]),
-                z: responses.map(
+                z: resps.map(
                   (r) =>
                     get_items(r.eval_res).reduce(
                       (acc: number, val) =>
@@ -1137,37 +1233,11 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                 ), // calculates mean
                 mode: "markers",
                 marker: {
-                  color: getColorForLLMAndSetIfNotFound(llm_names[0]),
+                  color: getColorForLLMAndSetIfNotFound(llm),
                 },
-              };
-            } else {
-              spec = [];
-              llm_names.forEach((llm) => {
-                const resps = responses.filter((r) => get_llm(r) === llm);
-                spec.push({
-                  type: "scatter3d",
-                  x: resps
-                    .map((r) => get_var(r, varnames[0], true))
-                    .map((s) => shortnames_0[s]),
-                  y: resps
-                    .map((r) => get_var(r, varnames[1], true))
-                    .map((s) => shortnames_1[s]),
-                  z: resps.map(
-                    (r) =>
-                      get_items(r.eval_res).reduce(
-                        (acc: number, val) =>
-                          acc + (typeof val === "number" ? val : 0),
-                        0,
-                      ) / (r.eval_res?.items.length ?? 1),
-                  ), // calculates mean
-                  mode: "markers",
-                  marker: {
-                    color: getColorForLLMAndSetIfNotFound(llm),
-                  },
-                  name: llm,
-                });
+                name: llm,
               });
-            }
+            });
           }
         }
 
@@ -1184,6 +1254,8 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
     }, [
       multiSelectVars,
       multiSelectValue,
+      evalResVars,
+      selectedEvalResVar,
       selectedLLMGroup,
       responses,
       selectedLegendItems,
@@ -1254,9 +1326,10 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             <span style={smallTextStyle}>x-axis:</span>
             <NativeSelect
               className="nodrag nowheel"
-              data={["score"]}
+              data={evalResVars}
               size="xs"
-              value={"score"}
+              value={selectedEvalResVar}
+              onChange={handleChangeSelectedEvalResVar}
               miw="80px"
             />
           </div>
diff --git a/chainforge/react-server/src/backend/ai.ts b/chainforge/react-server/src/backend/ai.ts
index 68e00ff87..9b1acd492 100644
--- a/chainforge/react-server/src/backend/ai.ts
+++ b/chainforge/react-server/src/backend/ai.ts
@@ -25,8 +25,8 @@ export type Row = string;
 const AIFeaturesLLMs = [
   {
     provider: "OpenAI",
-    small: { value: "gpt-4o", label: "OpenAI GPT4o" },
-    large: { value: "gpt-4", label: "OpenAI GPT4" },
+    small: { value: "gpt-4o-mini", label: "OpenAI GPT4o-mini" },
+    large: { value: "gpt-4o", label: "OpenAI GPT4o" },
   },
   {
     provider: "Bedrock",
diff --git a/chainforge/react-server/src/backend/backend.ts b/chainforge/react-server/src/backend/backend.ts
index 1358fcf70..3633e37b3 100644
--- a/chainforge/react-server/src/backend/backend.ts
+++ b/chainforge/react-server/src/backend/backend.ts
@@ -18,6 +18,7 @@ import {
   LLMResponseData,
   PromptVarType,
   StringOrHash,
+  ChatHistory,
   JSONCompatible,
 } from "./typing";
 import { LLM, LLMProvider, getEnumName, getProvider } from "./models";
@@ -33,6 +34,7 @@ import {
   llmResponseDataToString,
   extendArray,
   extendArrayDict,
+  stripWrappingQuotes,
   extractMediaVars,
 } from "./utils";
 import StorageCache, { MediaLookup, StringLookup } from "./cache";
@@ -1309,42 +1311,46 @@ export async function executepy(
  *
  * @param id a unique ID to refer to this information. Used when cache'ing evaluation results.
  * @param llm the LLM to query (as an LLM specification dict)
- * @param root_prompt the prompt template to use as the scoring function. Should include exactly one template var, {input}, where input responses will be put.
+ * @param root_prompt the prompt template to use as the scoring function. Should include exactly one template var, {__input}, where input responses will be put.
  * @param response_ids the cache'd response to run on, which must be a unique ID or list of unique IDs of cache'd data
  * @param api_keys optional. any api keys to set before running the LLM
  */
 export async function evalWithLLM(
   id: string,
-  llm: LLMSpec,
+  llm: string | LLMSpec,
   root_prompt: string,
-  response_ids: string | string[],
+  response_ids: string | string[] | LLMResponse[],
   api_keys?: Dict,
   progress_listener?: (progress: { [key: symbol]: any }) => void,
   cancel_id?: string | number,
+  system_msg?: string,
   useReasoning?: boolean,
 ): Promise<{ responses?: LLMResponse[]; errors: string[] }> {
   // Check format of response_ids
   if (!Array.isArray(response_ids)) response_ids = [response_ids];
-  response_ids = response_ids as Array<string>;
+  if (response_ids.length === 0) return { responses: [], errors: [] };
+
+  const load_resps_from_cache = typeof response_ids[0] === "string";
+  const system_message: ChatHistoryInfo[] | undefined = system_msg
+    ? [
+        {
+          messages: [{ role: "system", content: system_msg }],
+          fill_history: {},
+        },
+      ]
+    : undefined;
 
   if (api_keys !== undefined) set_api_keys(api_keys);
 
   // Load all responses with the given ID:
   let all_evald_responses: LLMResponse[] = [];
   let all_errors: string[] = [];
-  for (const cache_id of response_ids) {
-    const fname = `${cache_id}.json`;
-    if (!StorageCache.has(fname))
-      throw new Error(`Did not find cache file for id ${cache_id}`);
-
-    // Load the raw responses from the cache + clone them all:
-    const resp_objs = (load_cache_responses(fname) as LLMResponse[]).map((r) =>
-      JSON.parse(JSON.stringify(r)),
-    ) as LLMResponse[];
-
-    if (resp_objs.length === 0) continue;
 
-    console.log(resp_objs);
+  const _runOverResponses = async (
+    resp_objs: LLMResponse[],
+    cache_id?: string,
+  ) => {
+    console.log("Running LLM evaluator over response objects:", resp_objs);
 
     // We need to keep track of the index of each response in the response object.
     // We can generate var dicts with metadata to store the indices:
@@ -1368,16 +1374,16 @@ export async function evalWithLLM(
 
     // Now run all inputs through the LLM grader!:
     const { responses, errors } = await queryLLM(
-      `eval-${id}-${cache_id}`,
+      `eval-${id}-${cache_id ?? "provided"}`,
       [llm],
       1,
       root_prompt,
       { __input: inputs },
-      undefined,
+      system_message, // if there's a sys_message, we pass it in chat history format
       undefined,
       undefined,
       progress_listener,
-      false,
+      !cache_id, // if there's no cache_id, we don't want to cache the responses
       cancel_id,
     );
 
@@ -1452,7 +1458,34 @@ export async function evalWithLLM(
       }
     });
 
-    all_evald_responses = all_evald_responses.concat(resp_objs);
+    return resp_objs;
+  };
+
+  // Run over cache'd response data
+  if (load_resps_from_cache) {
+    for (const cache_id of response_ids) {
+      const fname = `${cache_id}.json`;
+      if (!StorageCache.has(fname))
+        throw new Error(`Did not find cache file for id ${cache_id}`);
+
+      // Load the raw responses from the cache + clone them all:
+      const resp_objs = (load_cache_responses(fname) as LLMResponse[]).map(
+        (r) => JSON.parse(JSON.stringify(r)),
+      ) as LLMResponse[];
+      if (resp_objs.length === 0) continue;
+
+      const evald_resp_objs = await _runOverResponses(
+        resp_objs,
+        cache_id as string,
+      );
+
+      all_evald_responses = all_evald_responses.concat(evald_resp_objs);
+    }
+  } else {
+    // Run over provided response objects
+    const resp_objs = response_ids as LLMResponse[];
+    const evald_resp_objs = await _runOverResponses(resp_objs); // no cache
+    all_evald_responses = all_evald_responses.concat(evald_resp_objs);
   }
 
   // Do additional processing to check if all evaluations are
@@ -1462,7 +1495,9 @@ export async function evalWithLLM(
     if (!resp_obj.eval_res) continue;
     for (const score of resp_obj.eval_res.items) {
       if (score !== undefined)
-        all_eval_res.add(score.toString().trim().toLowerCase());
+        all_eval_res.add(
+          stripWrappingQuotes(score.toString().trim().toLowerCase()),
+        );
     }
   }
 
@@ -1502,7 +1537,8 @@ export async function evalWithLLM(
   }
 
   // Store the evaluated responses in a new cache json:
-  StorageCache.store(`${id}.json`, all_evald_responses);
+  if (load_resps_from_cache)
+    StorageCache.store(`${id}.json`, all_evald_responses);
 
   return { responses: all_evald_responses, errors: all_errors };
 }
diff --git a/chainforge/react-server/src/backend/evalgen/README.md b/chainforge/react-server/src/backend/evalgen/README.md
new file mode 100644
index 000000000..0ff121d97
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/README.md
@@ -0,0 +1,27 @@
+# `evalgen`: Grading and Evaluation Function Selection Module
+
+This module takes a developer's prompt and a set of examples (variables, prompts, responses), suggests evaluation criteria (which the developer confirms and may extend with other criteria), generates and evaluates multiple functions per criterion on each of the examples, and returns the best function per criterion (the one most aligned with the developer's grades).
+
+## Execution
+
+There is an interactive script to play with the functionality in `test.ts`. You can run it by running `ts-node test.ts` in the `grading` directory of the project. The terminal is a bit laggy sometimes.
+
+## Architecture
+
+The module is divided into the following components: `executor`, `utils`, `oai_utils`.
+
+### Utils
+
+This module contains types and prompts for criteria generation, function generation, and function execution.
+
+### OAI Utils
+
+This module contains utilities for interacting with the OpenAI API.
+
+### Executor
+
+This module contains the main logic for the module. It takes a developer's prompt and set of examples, as well as a list of evaluation criteria (which can be generated by the utils module). It has a background process to generate and evaluate functions for each criteria, updating each example's grading priority as function results stream in. There is a method to query the next example to grade, and another method to set the grade for an example. The module also has a method to query the best function per criteria (most aligned with the developer's grades).
+
+# Credits
+
+The `evalgen` module was created by Shreya Shankar. It was adapted to the ChainForge codebase by Ian Arawjo.
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
new file mode 100644
index 000000000..24c7802b1
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -0,0 +1,1075 @@
+import {
+  calculateCohensKappa,
+  calculateF1Score,
+  calculateMCC,
+  execPyFunc,
+  executeLLMEval,
+  generateFunctionsForCriteria,
+} from "./utils";
+import {
+  EvalCriteria,
+  EvalFunction,
+  EvalFunctionResult,
+  EvalFunctionReport,
+  EvalFunctionSetReport,
+  EvalCriteriaUID,
+} from "./typing";
+import {
+  LLMResponse,
+  ResponseUID,
+  QueryProgress,
+  Dict,
+  LLMSpec,
+} from "../typing";
+import { EventEmitter } from "events";
+
+/**
+ * The EvaluationFunctionExecutor class is designed to asynchronously
+ * evaluate a set of examples against specified evaluation criteria using
+ * generated evaluation functions and to prioritize grading based on the
+ * results.
+ *
+ * Usage:
+ *
+ * 1. Initialization:
+ *    Create an instance of the EvaluationFunctionExecutor by providing the
+ *    evaluation criteria, a prompt template for the developer's LLM chain,
+ *    and a set of examples to be evaluated.
+ *
+ *    const executor = new EvaluationFunctionExecutor(genAIModels, apiKeys,
+ *      promptTemplate, examples, evalCriteria, updateNumLLMCalls, addLog);
+ *
+ *    // Optionally, you can call setEvalCriteria to set the evaluation criteria
+ *    // after the executor has been initialized.
+ *    executor.setEvalCriteria(evalCriteria);
+ *
+ * 2. Start Background Computation:
+ *    Call the `start` method to begin generating and executing evaluation
+ *    functions in the background. This method returns immediately,
+ *    allowing your application to perform other tasks concurrently.
+ *
+ *    executor.start();
+ *
+ * 3. Continue with Other Computations and Interactive Grading:
+ *    You can proceed with other tasks (e.g., grading) immediately after
+ *    starting the background computation. Use `getNextExampleToGrade`
+ *    to determine which example to grade next and `setGradeForExample`
+ *    to assign grades to specific examples. This interactive grading will
+ *    help in filtering out incorrect evaluation functions.
+ *
+ *    // Example of interactive grading loop
+ *    let nextExample = executor.getNextExampleToGrade();
+ *    while (nextExample !== null) {
+ *      const grade = ...; // Determine the holistic grade ("good" or "bad")
+ *                          // for the example, e.g., through user input
+ *      executor.setGradeForExample(nextExample.uid, undefined, grade);
+ *      nextExample = executor.getNextExampleToGrade();
+ *    }
+ *
+ * 4. (Optional) Querying Results:
+ *    At any time, you can query the current grading priorities of examples
+ *    or check the grading status by using methods like `getScore`,
+ *    `getAllScores`, or `getNextExampleToGrade`.
+ */
+export default class EvaluationFunctionExecutor {
+  private scores: Map<ResponseUID, number>;
+  // Cache function results for each example
+  private resultsCache: Map<EvalFunction, Map<ResponseUID, EvalFunctionResult>>;
+  private llms: { small: string | LLMSpec; large: string | LLMSpec };
+  private apiKeys: Dict;
+  private grades: Map<ResponseUID, boolean>; // Grades for all examples
+  private perCriteriaGrades: Dict<Dict<boolean | undefined>>; // Grades per criteria
+  private annotations: Dict<string>; // Annotations for each response
+  private lastPickedHighScore: boolean; // To alternate between highest and lowest scores when sampling examples to grade
+  private examples: LLMResponse[]; // The set of examples being evaluated and graded
+  private evalCriteria: EvalCriteria[]; // The criteria used to generate evaluation functions
+  private evalFunctions: EvalFunction[]; // The set of evaluation functions generated for the developer's LLM chain
+  private promptTemplate: string; // The prompt template for the developer's LLM chain
+  private backgroundTaskPromise: Promise<void> | null = null; // To keep track of the background task for generating and executing evaluation functions
+  private criteriaQueue: EvalCriteria[] = []; // Queue for new criteria to be processed
+  private processing = false; // To keep track of whether we are currently processing a criteria
+  private updateNumLLMCalls: (
+    numStrongModelCalls: number,
+    numWeakModelCalls: number,
+  ) => void;
+
+  private logFunction: (logMessage: string) => void;
+
+  /**
+   * Initializes a new instance of the EvaluationFunctionExecutor class.
+   *
+   * @param evalCriteria The criteria used to generate evaluation functions. Provided/confirmed by the developer.
+   * @param promptTemplate The prompt template for the developer's LLM chain. This is useful for the LLM to generate correct evaluation functions.
+   * @param examples A set of variable-prompt-response triples that we want the developer to grade (and use for filtering incorrect evaluation functions).
+   * @param existingGrades Optional. A dict in format {uid: grade}, containing existing grades.
+   */
+  constructor(
+    genAIModels: { small: string | LLMSpec; large: string | LLMSpec },
+    apiKeys: Dict,
+    promptTemplate: string,
+    examples: LLMResponse[],
+    evalCriteria: EvalCriteria[] = [],
+    updateNumLLMCalls: (
+      numStrongModelCalls: number,
+      numWeakModelCalls: number,
+    ) => void,
+    addLog: (log: string) => void,
+    existingGrades?: Record<ResponseUID, boolean>,
+    existingPerCriteriaGrades?: Dict<Dict<boolean | undefined>>,
+    annotations?: Dict<string>,
+  ) {
+    console.log(evalCriteria);
+
+    this.resultsCache = new Map<
+      EvalFunction,
+      Map<ResponseUID, EvalFunctionResult>
+    >();
+    this.lastPickedHighScore = false; // Start off picking the highest score
+    this.examples = examples;
+    this.evalCriteria = evalCriteria;
+    this.promptTemplate = promptTemplate;
+    this.llms = genAIModels;
+    this.apiKeys = apiKeys;
+
+    // Set scores and grades to default values of 0
+    this.scores = new Map<ResponseUID, number>();
+
+    // Set scores to 0 for each example id
+    for (const example of examples) {
+      this.scores.set(example.uid, 0);
+    }
+
+    this.grades = new Map<ResponseUID, boolean>();
+    this.perCriteriaGrades = {};
+    this.evalFunctions = [];
+    this.annotations = {};
+
+    // Pass in any existing grades
+    if (existingGrades) {
+      Object.entries(existingGrades).forEach(([uid, grade]) => {
+        this.grades.set(uid, grade);
+      });
+    }
+
+    // Pass in any existing per-criteria grades
+    if (existingPerCriteriaGrades) {
+      this.perCriteriaGrades = existingPerCriteriaGrades;
+    }
+
+    if (annotations) {
+      this.annotations = annotations;
+    }
+
+    this.criteriaQueue = [];
+    this.processing = false;
+
+    this.updateNumLLMCalls = updateNumLLMCalls;
+    this.logFunction = addLog;
+  }
+
+  /**
+   * Starts the background computation for generating and executing evaluation functions.
+   * This method initiates the tasks but does not wait for them to complete.
+   * This method should be called after the constructor.
+   */
+  public start(onProgress?: (progress: QueryProgress) => void): void {
+    // Throw error if there is no eval criteria
+    if (this.evalCriteria.length === 0) {
+      throw new Error(
+        "No evaluation criteria provided. Please provide at least one evaluation criterion.",
+      );
+    }
+
+    // Throw error if bg task is already running
+    if (this.backgroundTaskPromise) {
+      throw new Error(
+        "Background task for generating and executing evaluation functions is already running.",
+      );
+    }
+
+    // Initiate the background task without awaiting its completion
+    this.backgroundTaskPromise =
+      this.generateAndExecuteEvaluationFunctions(onProgress);
+  }
+
+  /**
+   * Allows the client to explicitly wait for the background tasks to complete if needed.
+   */
+  public async waitForCompletion(): Promise<void> {
+    if (this.backgroundTaskPromise) {
+      const promise = this.backgroundTaskPromise;
+      this.backgroundTaskPromise = null;
+      await promise;
+    }
+  }
+
+  /**
+   * Whether the executor is currently running (.start() has been called and is not yet completed).
+   */
+  public isRunning(): boolean {
+    return this.backgroundTaskPromise !== null;
+  }
+
+  private async generateAndExecuteFunctionsForCriteria(
+    criteria: EvalCriteria,
+    onProgress?: (progress: QueryProgress) => void,
+  ): Promise<void> {
+    const emitter = new EventEmitter();
+    const functionExecutionPromises: Promise<any>[] = [];
+
+    emitter.on("functionGenerated", (evalFunction) => {
+      this.logFunction(
+        `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
+      );
+
+      const executionPromise = (async () => {
+        this.evalFunctions.push(evalFunction);
+        const executionPromises = this.examples.map(async (example) => {
+          // Find the first example the user graded as passing, and the first graded as failing, for this criteria.
+          // perCriteriaGrades is keyed by example uid first, then criteria uid (as written by setGradeForExample).
+          const randomPositiveExample = this.examples.find(
+            (graded) =>
+              this.perCriteriaGrades[graded.uid]?.[criteria.uid] === true,
+          );
+          const randomNegativeExample = this.examples.find(
+            (graded) =>
+              this.perCriteriaGrades[graded.uid]?.[criteria.uid] === false,
+          );
+
+          const funcToExecute =
+            evalFunction.evalCriteria.eval_method === "code"
+              ? execPyFunc
+              : executeLLMEval;
+
+          // Run the function on the example (code-based or LLM-based, depending on the criteria's eval_method)
+          const result = await funcToExecute(
+            evalFunction,
+            this.llms.small,
+            example,
+            randomPositiveExample,
+            randomNegativeExample,
+          );
+
+          // Update weak model call count by 1 if the eval method is expert
+          if (evalFunction.evalCriteria.eval_method === "expert") {
+            this.updateNumLLMCalls(0, 1);
+          }
+
+          if (onProgress) {
+            onProgress({
+              success:
+                (100 * functionExecutionPromises.length) /
+                this.criteriaQueue.length,
+              error: 0,
+            });
+          }
+
+          if (!this.resultsCache.has(evalFunction)) {
+            this.resultsCache.set(evalFunction, new Map());
+          }
+          this.resultsCache.get(evalFunction)?.set(example.uid, result);
+
+          if (result === EvalFunctionResult.FAIL) {
+            this.updateScore(example.uid, evalFunction);
+          }
+        });
+
+        await Promise.all(executionPromises);
+      })();
+
+      functionExecutionPromises.push(executionPromise);
+    });
+
+    const badExample = this.examples.find(
+      (example) =>
+        this.perCriteriaGrades[example.uid]?.[criteria.uid] === false,
+    );
+
+    try {
+      await generateFunctionsForCriteria(
+        criteria,
+        this.llms.large,
+        this.promptTemplate,
+        this.examples[Math.floor(Math.random() * this.examples.length)],
+        emitter,
+        badExample,
+        this.apiKeys,
+      );
+
+      console.log(`Generated functions for criteria: ${criteria.shortname}`);
+      console.log(
+        `Number of functions generated: ${functionExecutionPromises.length}`,
+      );
+      this.logFunction(
+        `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
+      );
+    } catch (error) {
+      console.error(
+        `Error generating functions for criteria ${criteria.shortname}: ${error}`,
+      );
+      this.logFunction(
+        `Error generating functions for criteria ${criteria.shortname}: ${error}`,
+      );
+    }
+
+    // Update LLM call count by 1
+    this.updateNumLLMCalls(1, 0);
+
+    await Promise.all(functionExecutionPromises);
+  }
+
+  /**
+   * Polls the criteria queue and processes any queued criteria for as long as `backgroundTaskPromise` is non-null, sleeping 500ms between checks.
+   * NOTE(review): `start()` assigns `backgroundTaskPromise` only after this call has returned its promise, so the first loop check appears to see `null` and exit immediately; `onProgress` is also not used in this method — confirm intent.
+   */
+  public async generateAndExecuteEvaluationFunctions(
+    onProgress?: (progress: QueryProgress) => void,
+  ): Promise<void> {
+    // Enter a continuous monitoring loop for new criteria
+    while (this.backgroundTaskPromise !== null) {
+      // Check if there are any criteria in the queue to process
+      if (this.criteriaQueue.length > 0 && !this.processing) {
+        // Pop a criteria off the queue and process it
+        // TODO: use worker pool to parallelize this
+        await this.processNextCriteria();
+      }
+
+      // Sleep for a short time before checking again (prevents CPU hogging)
+      await new Promise((resolve) => setTimeout(resolve, 500));
+    }
+  }
+
+  /**
+   * Updates the set of evaluation criteria and triggers the generation and execution of evaluation functions for any new criteria.
+   * This method allows the client to add new evaluation criteria after the executor has been initialized.
+   * Criteria absent from the new list are dropped from the executor's criteria set; criteria not yet known are queued for processing. Criteria are matched by object identity.
+   * The method returns immediately (queue processing is not awaited), allowing the client to continue with other tasks.
+   *
+   * @param criteriaList The new state of the evaluation criteria list.
+   */
+  public updateCriteria(criteriaList: EvalCriteria[]): void {
+    // Drop criteria that are no longer present in the new list
+    this.evalCriteria = this.evalCriteria.filter((c) =>
+      criteriaList.includes(c),
+    );
+
+    // See if there are new criteria to add
+    for (const criteria of criteriaList) {
+      if (this.evalCriteria.includes(criteria)) {
+        // criteria already included
+        continue;
+      }
+
+      console.log(`Adding new criteria: ${criteria.shortname}`);
+      this.criteriaQueue.push(criteria);
+      this.evalCriteria.push(criteria);
+
+      // Start the generation and execution of functions for the new criteria
+      if (!this.processing) {
+        this.processNextCriteria();
+      }
+    }
+  }
+
+  private async processNextCriteria() {
+    this.processing = true;
+    try {
+      let criteria: EvalCriteria | undefined;
+      while ((criteria = this.criteriaQueue.shift()) !== undefined) {
+        this.logFunction(`Processing new criteria: ${criteria.shortname}`);
+        await this.generateAndExecuteFunctionsForCriteria(criteria);
+      }
+    } finally {
+      this.processing = false; // always release the flag, even if processing throws
+    }
+  }
+
+  /**
+   * Updates the grading priority score for a given example based on the outcome of a synthesized evaluation function.
+   * This method computes the function's pass rate over its cached results (PASS / (PASS + FAIL)) and adds it to the example's score. It is invoked when the function fails the example, so a failure from a function that mostly passes raises the example's score the most.
+   *
+   * @param exampleId The unique ID of the example being scored.
+   * @param evalFunction The eval function used for evaluation.
+   */
+  private updateScore(
+    exampleId: ResponseUID,
+    evalFunction: EvalFunction,
+  ): void {
+    // const outcome = this.outcomes.get(evalFunction);
+
+    // Get all the results for this function
+    const results = this.resultsCache.get(evalFunction);
+
+    if (results === undefined) {
+      return;
+    }
+
+    // Count the examples this function has passed so far
+    const passed = Array.from(results.values()).filter(
+      (result) => result === EvalFunctionResult.PASS,
+    ).length;
+
+    // Count the examples this function has failed so far
+    const failed = Array.from(results.values()).filter(
+      (result) => result === EvalFunctionResult.FAIL,
+    ).length;
+
+    const passRate = passed / (passed + failed);
+
+    const currentScore = this.scores.get(exampleId) || 0;
+    this.scores.set(exampleId, currentScore + passRate);
+  }
+
+  /**
+   * Retrieves the current response priority score for a given example.
+   * This method allows clients to query the score of an example at any point during the evaluation process, for transparency and debugging purposes.
+   *
+   * @param exampleId The unique ID of the example whose score is being requested.
+   * @returns The current response priority score of the example, if available.
+   */
+  public getScore(exampleId: ResponseUID): number | undefined {
+    return this.scores.get(exampleId);
+  }
+
+  /**
+   * Retrieves scores for all examples.
+   * This method provides a snapshot of the current scores for all examples being evaluated.
+   *
+   * @returns A map of example IDs to their current scores.
+   */
+  public getAllScores(): Map<ResponseUID, number> {
+    return new Map(this.scores);
+  }
+
+  /**
+   * Retrieves the grades set by the developer for all examples.
+   *
+   * @returns A map of example IDs to their grades.
+   */
+  public getGrades(): Map<ResponseUID, boolean> {
+    return new Map(this.grades);
+  }
+
+  public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): {
+    numGPT4Calls: number;
+    numGPT35Calls: number;
+  } {
+    let numGPT4Calls = 0;
+    let numLLMCriteria = 0;
+    for (const criteriaId in perCriteriaGrades) {
+      const currGrade = perCriteriaGrades[criteriaId];
+      const numGradedAsCurrGrade = this.examples.filter(
+        (example) =>
+          this.perCriteriaGrades[example.uid] &&
+          this.perCriteriaGrades[example.uid][criteriaId] === currGrade,
+      ).length;
+      if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) {
+        numGPT4Calls += 1;
+        const criteria = this.evalCriteria.find(
+          (criteria) => criteria.uid === criteriaId,
+        );
+        if (criteria && criteria.eval_method === "expert") {
+          numLLMCriteria += 1;
+        }
+      }
+    }
+
+    return {
+      numGPT4Calls,
+      numGPT35Calls: numLLMCriteria * 3 * this.examples.length,
+    };
+  }
+
+  /**
+   * Sets a grade for an example based on external input from the developer; grades are used for filtering out incorrect evaluation functions.
+   * If the developer does not provide a holistic grade, the executor will infer it from the perCriteriaGrades. With some probability, generate new implementations for the criteria in perCriteriaGrades.
+   *
+   * @param exampleId The unique ID of the example being graded.
+   * @param perCriteriaGrades Optional. The developer's grades for this example, keyed by criteria uid. Replaces any per-criteria grades previously stored for the example.
+   * @param holisticGrade The developer-provided grade assigned to the example, "good" or "bad" (any value other than "good" is recorded as a failing grade). If omitted (undefined or null), the grade is inferred from perCriteriaGrades.
+   * @param annotation Optional. Written feedback to store for this example.
+   */
+  public setGradeForExample(
+    exampleId: ResponseUID,
+    perCriteriaGrades?: Dict<boolean | undefined>,
+    holisticGrade?: string,
+    annotation?: string,
+  ): void {
+    if (holisticGrade !== undefined && holisticGrade !== null) {
+      const boolHolistic = holisticGrade === "good";
+      this.grades.set(exampleId, boolHolistic);
+    }
+
+    if (perCriteriaGrades) {
+      this.perCriteriaGrades[exampleId] = perCriteriaGrades;
+
+      // If holisticGrade was not provided, set it based on the perCriteriaGrades---if all criteria in the perCriteriaGrades are true, set the holisticGrade to true, else false
+      if (holisticGrade === undefined || holisticGrade === null) {
+        const allTrue = Object.values(perCriteriaGrades).every(
+          (value) => value === true,
+        );
+        this.grades.set(exampleId, allTrue);
+      }
+    }
+
+    if (annotation) {
+      this.annotations[exampleId] = annotation;
+    }
+
+    let numCriteriaWithNewImplementations = 0;
+
+    // Trigger generateNewImplementationsForCriteria for each criteria in perCriteriaGrades
+    for (const criteriaId in perCriteriaGrades) {
+      const currGrade = perCriteriaGrades[criteriaId];
+      // With probability 1 / (1 + # graded examples for this criteria with currGrade), generate new implementations
+      const numGradedAsCurrGrade = this.examples.filter(
+        (example) =>
+          this.perCriteriaGrades[example.uid] &&
+          this.perCriteriaGrades[example.uid][criteriaId] === currGrade,
+      ).length;
+
+      if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) {
+        console.log(
+          `Generating new implementations for criteria: ${criteriaId}`,
+        );
+        const evalCriteria = this.evalCriteria.find(
+          (criteria) => criteria.uid === criteriaId,
+        );
+        if (evalCriteria) {
+          this.criteriaQueue.push(evalCriteria);
+          if (!this.processing) {
+            this.processNextCriteria();
+          }
+          numCriteriaWithNewImplementations++;
+        } else {
+          console.error(`Evaluation criteria with ID ${criteriaId} not found.`);
+        }
+      }
+    }
+
+    console.log(
+      `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`,
+    );
+  }
+
+  /**
+   * Set evaluation criteria for the executor.
+   * This method allows the client to set the evaluation criteria after the executor has been initialized.
+   */
+  public setEvalCriteria(evalCriteria: EvalCriteria[]): void {
+    this.evalCriteria = evalCriteria;
+  }
+
+  /**
+   * Set examples for the executor.
+   * This method allows the client to change the examples after the executor has been initialized.
+   */
+  public setExamples(examples: LLMResponse[]): void {
+    this.examples = examples;
+
+    // Set scores to 0 for each example id
+    for (const example of examples) {
+      this.scores.set(example.uid, 0);
+    }
+  }
+
+  /**
+   * Gets a map of ungraded example ids and their scores, sorted by score.
+   * @return A map of ungraded example ids and their scores, sorted by score.
+   */
+  public getUngradedScores(): Map<ResponseUID, number> {
+    // Step 1: Convert the scores Map to an array and filter out graded examples
+    const ungradedEntries = Array.from(this.scores.entries())
+      .filter(([id]) => !this.grades.has(id))
+      .map(([id, score]) => ({ id, score, rand: Math.random() })) // Add a random value for tie-breaking
+
+      // Step 2: Sort the ungraded entries first by score, then randomly for tie-breaking
+      .sort((a, b) => {
+        if (a.score === b.score) {
+          return a.rand - b.rand; // Tie-breaking by random value
+        }
+        return b.score - a.score; // Sort by score descending
+      })
+
+      // Step 3: Convert the sorted objects back into the format expected by the Map constructor
+      .map(({ id, score }) => [id, score] as [ResponseUID, number]);
+
+    // Step 4: Convert the array of key-value pairs back into a Map and return
+    return new Map(ungradedEntries);
+  }
+
+  /**
+   * Looks up the example whose uid equals `id`.
+   * Returns null when no example matches. If several examples share the id,
+   * an error is logged and the first match is returned.
+   */
+  private getExampleForId(id: string) {
+    const matches = this.examples.filter((e) => e.uid === id);
+
+    if (matches.length === 0) return null;
+
+    if (matches.length > 1) {
+      console.error(
+        "More than one example found with the same id. Ids must be unique. Returning the first, to not halt...",
+      );
+    }
+    return matches[0];
+  }
+
+  /**
+   * Picks the next example the user should grade.
+   *
+   * With the "priority" policy, successive calls alternate between the two ends
+   * of the ungraded list (which is sorted by score, highest first), so the user
+   * ideally sees a mix of good and bad responses. Each "priority" call flips the
+   * internal `lastPickedHighScore` flag. With the "random" policy, an ungraded
+   * example is drawn uniformly at random and the flag is left untouched.
+   *
+   * @param policy How to choose the example: "random" or "priority" (default).
+   *
+   * @returns The next example to grade, or null if no ungraded example remains
+   * (or if the chosen id has no matching example).
+   */
+  public getNextExampleToGrade(
+    policy: "random" | "priority" = "priority",
+  ): LLMResponse | null {
+    const queue = Array.from(this.getUngradedScores().keys());
+    const remaining = queue.length;
+
+    // Nothing left to grade
+    if (remaining === 0) return null;
+
+    if (policy === "random") {
+      const randomIndex = Math.floor(Math.random() * remaining);
+      return this.getExampleForId(queue[randomIndex]);
+    }
+
+    // Priority policy: the queue is sorted by score descending, so index 0 is the
+    // highest-scored example and the last index is the lowest-scored one.
+    const pickedHighLastTime = this.lastPickedHighScore;
+    this.lastPickedHighScore = !pickedHighLastTime; // Alternate for next time
+
+    const nextId = pickedHighLastTime ? queue[remaining - 1] : queue[0];
+    return this.getExampleForId(nextId);
+  }
+
+  /**
+   * Computes alignment statistics between one eval function and the user's
+   * grades for that function's criteria.
+   *
+   * Only examples that (a) the user graded on this criteria and (b) have a
+   * cached result for `evalFunc` contribute to the confusion-matrix counts.
+   * `failureCoverage` and `falseFailureRate` on the returned report are
+   * fractions in [0, 1] (not percentages) and default to 0.0 when their
+   * denominator is zero.
+   *
+   * @param evalFunc The eval function to score. It must already have results in the cache.
+   * @returns A report, or undefined when there are no cached results for the
+   * function or the user has not graded any example on its criteria. A warning
+   * is logged in both cases.
+   */
+  public computeAlignmentStats(
+    evalFunc: EvalFunction,
+  ): EvalFunctionReport | undefined {
+    // Get the eval function results from the cache
+    const results = this.resultsCache.get(evalFunc);
+    if (results === undefined) {
+      console.warn(
+        "No cache results found for this eval function. First ensure that the function has been executed over some examples.",
+      );
+      return undefined;
+    }
+
+    // Get the criteria ID for this eval function
+    const criteriaId = evalFunc.evalCriteria.uid;
+
+    // Create a report for this function
+    const report: EvalFunctionReport = {
+      evalFunction: evalFunc,
+      true_pass: 0,
+      true_fail: 0,
+      false_pass: 0,
+      false_fail: 0,
+      skipped: 0,
+    };
+
+    // Tally the function's verdicts against the user's grades in a single pass,
+    // remembering whether the user graded anything for this criteria at all.
+    let hasGradedExamples = false;
+    for (const [exampleId, criteriaGrades] of Object.entries(
+      this.perCriteriaGrades,
+    )) {
+      // Optional chaining tolerates an example entry with no grades object
+      const grade = criteriaGrades?.[criteriaId];
+      if (grade === undefined) continue; // User gave no grade for this criteria
+      hasGradedExamples = true;
+
+      const result = results.get(exampleId);
+      if (result === undefined) continue; // Function has no result for this example
+
+      const userPassed = Boolean(grade);
+      if (result === EvalFunctionResult.PASS) {
+        if (userPassed) report.true_pass++;
+        else report.false_pass++;
+      } else if (result === EvalFunctionResult.FAIL) {
+        if (userPassed) report.false_fail++;
+        else report.true_fail++;
+      } else {
+        // Any non-pass/fail verdict (e.g. SKIP) can never agree with a user grade
+        report.skipped++;
+      }
+    }
+
+    if (!hasGradedExamples) {
+      console.warn(
+        "No user grades found for this eval criteria. You must first grade some examples against this criteria (thumbs up/down) before we can compute alignment.",
+      );
+      return undefined;
+    }
+
+    // Calculate alignment in different ways
+    // NOTE (from the original author): if a denominator during the calculation
+    // is 0, the helper leaves the score undefined.
+    report.f1 = calculateF1Score(
+      report.true_pass,
+      report.false_pass,
+      report.false_fail,
+    );
+    report.mcc = calculateMCC(
+      report.true_pass,
+      report.true_fail,
+      report.false_pass,
+      report.false_fail,
+    );
+    report.cohens_kappa = calculateCohensKappa(
+      report.true_pass,
+      report.true_fail,
+      report.false_pass,
+      report.false_fail,
+    );
+
+    // Failure coverage: share of user-failed examples that the function also failed
+    const userFailures = report.true_fail + report.false_pass;
+    report.failureCoverage =
+      userFailures > 0 ? report.true_fail / userFailures : 0.0; // 0.0 if there are no failures to detect
+
+    // False failure rate: share of user-passed examples that the function failed
+    const userPasses = report.true_pass + report.false_fail;
+    report.falseFailureRate =
+      userPasses > 0 ? report.false_fail / userPasses : 0.0; // 0.0 if nothing could trigger a false failure
+
+    return report;
+  }
+
+  /**
+   * Selects, for every evaluation criteria, the eval function that aligns best
+   * with the grades provided by the developer.
+   *
+   * Steps:
+   *  1. Make sure every eval function has a result for every graded example,
+   *     executing (and caching) any that are missing.
+   *  2. For each criteria, rank its functions by MCC (descending), breaking ties
+   *     by false failure rate (ascending), and keep the top one.
+   *  3. Compute the failure coverage and false failure rate of the selected set.
+   *
+   * @param falseFailureRateThreshold Intended upper bound on the false failure
+   * rate of selected functions. NOTE: the threshold filter is currently disabled
+   * (see the commented-out code below), so this value has no effect.
+   *
+   * @returns A report with the selected functions, their combined failure
+   * coverage and false failure rate, and the per-criteria reports of every
+   * function for which alignment could be computed.
+   */
+  public async filterEvaluationFunctions(
+    falseFailureRateThreshold: number,
+  ): Promise<EvalFunctionSetReport> {
+    const gradedExamples = this.examples.filter((example) =>
+      this.grades.has(example.uid),
+    );
+    const gradedResultMap = new Map<
+      ResponseUID,
+      Map<EvalFunction, EvalFunctionResult>
+    >();
+
+    // Iterate over graded examples and evaluation functions to fill the matrix
+    for (const example of gradedExamples) {
+      const row = new Map<EvalFunction, EvalFunctionResult>();
+      for (const evalFunction of this.evalFunctions) {
+        // Reuse the cached result when there is one
+        const cached = this.resultsCache.get(evalFunction)?.get(example.uid);
+        if (cached !== undefined) {
+          row.set(evalFunction, cached);
+          continue;
+        }
+
+        // If not, execute the function and store the result in the cache
+        const funcToExecute =
+          evalFunction.evalCriteria.eval_method === "code"
+            ? execPyFunc
+            : executeLLMEval;
+        const result = await funcToExecute(
+          evalFunction,
+          this.llms.small,
+          example,
+        );
+
+        // Put result in cache (re-checked after the await, since the cache may
+        // have changed while the function was executing)
+        if (!this.resultsCache.has(evalFunction)) {
+          this.resultsCache.set(evalFunction, new Map());
+        }
+        this.resultsCache.get(evalFunction)?.set(example.uid, result);
+
+        row.set(evalFunction, result);
+      }
+      gradedResultMap.set(example.uid, row);
+    }
+
+    const bestEvalFunctions: EvalFunction[] = [];
+    const evalFunctionReport = new Map<EvalCriteria, EvalFunctionReport[]>();
+
+    // Iterate through each criteria
+    // For each criteria, select the function with the highest alignment rate
+    for (const criteria of this.evalCriteria) {
+      const scoredFunctions: {
+        evalFunction: EvalFunction;
+        report: EvalFunctionReport;
+      }[] = [];
+
+      for (const evalFunction of this.evalFunctions) {
+        // Skip functions that don't match the criteria (compared by object identity)
+        if (evalFunction.evalCriteria !== criteria) {
+          continue;
+        }
+
+        // Create a report for this function
+        const report = this.computeAlignmentStats(evalFunction);
+
+        if (!report) {
+          console.warn(
+            "Could not compute alignment stats for an eval function. Skipping.",
+          );
+          continue;
+        }
+
+        // Save the report for this function
+        const reportsForCriteria = evalFunctionReport.get(criteria);
+        if (reportsForCriteria) {
+          reportsForCriteria.push(report);
+        } else {
+          evalFunctionReport.set(criteria, [report]);
+        }
+
+        scoredFunctions.push({
+          evalFunction,
+          report,
+        });
+      }
+
+      // Sort the functions by "alignment"
+      // Here, we are using MCC as the alignment metric, where higher is better.
+      scoredFunctions.sort((a, b) => {
+        const a_mcc = a.report.mcc ?? -1; // If undefined, set to -1, which is lowest possible.
+        const b_mcc = b.report.mcc ?? -1;
+        if (a_mcc === b_mcc) {
+          // If MCC is the same or not present, sort by false failure rate (lower first)
+          return (
+            (a.report.falseFailureRate ?? 0) - (b.report.falseFailureRate ?? 0)
+          );
+        }
+        return b_mcc - a_mcc; // Sort by MCC descending
+      });
+
+      // NOTE: The threshold-based selection below is disabled; it is kept for reference.
+      // // See if we can filter out functions with ffr > threshold
+      // const funcsBelowThreshold = scoredFunctions.filter(
+      //   (func) => func.report?.falseFailureRate !== undefined && func.report?.falseFailureRate <= falseFailureRateThreshold,
+      // );
+
+      // // Save the best function for this criteria
+      // // Maximize failure coverage and minimize false failure rate
+      // funcsBelowThreshold.sort((a, b) => {
+      //   if (a.report?.failureCoverage === b.report?.failureCoverage) {
+      //     return a.report?.falseFailureRate - b.report?.falseFailureRate;
+      //   }
+      //   return b.failureCoverage - a.failureCoverage;
+      // });
+
+      if (scoredFunctions.length > 0) {
+        // The top result is the 'best' / most aligned function
+        bestEvalFunctions.push(scoredFunctions[0].evalFunction);
+      }
+    }
+
+    const [coverage, falseFailureRate] = this.getSelectedFunctionAlignment(
+      bestEvalFunctions,
+      gradedResultMap,
+      gradedExamples,
+    );
+
+    // Create report of coverage, false failure rate, selected functions, and all eval function reports
+    const report: EvalFunctionSetReport = {
+      failureCoverage: coverage,
+      falseFailureRate,
+      selectedEvalFunctions: bestEvalFunctions,
+      allEvalFunctionReports: evalFunctionReport,
+    };
+
+    return report;
+  }
+
+  /**
+   * Computes how well a set of selected eval functions, taken together, matches
+   * the user's overall grades on the graded examples.
+   *
+   * The "system" fails an example as soon as any selected function returns FAIL
+   * for it. From that:
+   *  - failure coverage = user-failed examples the system also failed / all user-failed examples
+   *  - false failure rate = user-passed examples the system failed /
+   *    (user-passed examples the system passed + user-passed examples the system failed)
+   *
+   * Both values are percentages (0-100). When a denominator is zero (no
+   * user-failed examples, or no user-passed examples) the value is 0 rather than
+   * NaN, mirroring the defaults used in computeAlignmentStats.
+   *
+   * @param selectedEvalFunctions The eval functions that make up the system.
+   * @param gradedResultMap Per-example results of each eval function.
+   * @param gradedExamples The examples the user has graded.
+   * @returns A `[failureCoverage, falseFailureRate]` pair, both in percent.
+   */
+  private getSelectedFunctionAlignment(
+    selectedEvalFunctions: EvalFunction[],
+    gradedResultMap: Map<ResponseUID, Map<EvalFunction, EvalFunctionResult>>,
+    gradedExamples: LLMResponse[],
+  ): [number, number] {
+    // Of the selected functions, calculate the coverage of failures and false failure rate
+    let truePass = 0;
+    let numFailures = 0;
+    const coveredFailures = new Set<ResponseUID>();
+    const falseFailures = new Set<ResponseUID>();
+
+    for (const example of gradedExamples) {
+      const userPassed = Boolean(this.grades.get(example.uid));
+      if (!userPassed) numFailures++;
+
+      let systemPass = true;
+      for (const evalFunction of selectedEvalFunctions) {
+        const result = gradedResultMap.get(example.uid)?.get(evalFunction);
+        if (result !== EvalFunctionResult.FAIL) continue;
+
+        // One failing function is enough for the system to fail the example
+        systemPass = false;
+        if (userPassed) falseFailures.add(example.uid);
+        else coveredFailures.add(example.uid);
+      }
+
+      if (systemPass && userPassed) {
+        truePass++;
+      }
+    }
+
+    // Guard the denominators: 0 / 0 would otherwise put NaN into the report
+    const coverage =
+      numFailures > 0 ? (coveredFailures.size / numFailures) * 100 : 0;
+    const numUserPasses = truePass + falseFailures.size;
+    const falseFailureRate =
+      numUserPasses > 0 ? (falseFailures.size / numUserPasses) * 100 : 0;
+
+    // Print out failure coverage
+    console.log(`Failure coverage: ${coverage}`);
+    console.log(`False failure rate: ${falseFailureRate}`);
+
+    // Print out missed failures
+    // const missedFailures = gradedExamples.filter(
+    //   (example) =>
+    //     !this.grades.get(example.uid) && !coveredFailures.has(example.uid),
+    // );
+    // if (missedFailures.length > 0) {
+    //   console.log(`Missed failures: ${missedFailures}`);
+    // }
+
+    return [coverage, falseFailureRate];
+  }
+
+  /**
+   * Recomputes the set-level alignment numbers for a subset of criteria.
+   *
+   * Results for every eval function on every graded example are taken from the
+   * cache, or computed and cached when missing. The failure coverage and false
+   * failure rate are then recalculated using only those previously selected
+   * functions whose criteria is in `selectedEvalCriteria`.
+   *
+   * @param selectedEvalCriteria The criteria that should count towards alignment.
+   * @param oldReport The report whose selected functions are being re-evaluated.
+   * @returns A new report with updated coverage and false failure rate. The
+   * `selectedEvalFunctions` and `allEvalFunctionReports` fields are carried over
+   * from `oldReport` unchanged (i.e. not narrowed to the selected criteria).
+   */
+  public async recomputeAlignment(
+    selectedEvalCriteria: EvalCriteria[],
+    oldReport: EvalFunctionSetReport,
+  ): Promise<EvalFunctionSetReport> {
+    // Only examples that carry a user grade take part in the recomputation
+    const gradedExamples = this.examples.filter((example) =>
+      this.grades.has(example.uid),
+    );
+    const gradedResultMap = new Map<
+      ResponseUID,
+      Map<EvalFunction, EvalFunctionResult>
+    >();
+
+    // Build the example x function result matrix, one row per graded example
+    for (const example of gradedExamples) {
+      const row = new Map<EvalFunction, EvalFunctionResult>();
+
+      for (const evalFunction of this.evalFunctions) {
+        // Prefer a cached result if one exists
+        const cached = this.resultsCache.get(evalFunction)?.get(example.uid);
+        if (cached !== undefined) {
+          row.set(evalFunction, cached);
+          continue;
+        }
+
+        // Cache miss: run the function with the executor matching its eval method
+        const runner =
+          evalFunction.evalCriteria.eval_method === "code"
+            ? execPyFunc
+            : executeLLMEval;
+        const outcome = await runner(evalFunction, this.llms.small, example);
+
+        // Store the fresh result, creating the per-function cache on first use
+        let cachedForFunction = this.resultsCache.get(evalFunction);
+        if (cachedForFunction === undefined) {
+          cachedForFunction = new Map();
+          this.resultsCache.set(evalFunction, cachedForFunction);
+        }
+        cachedForFunction.set(example.uid, outcome);
+
+        row.set(evalFunction, outcome);
+      }
+
+      gradedResultMap.set(example.uid, row);
+    }
+
+    // Keep only the previously selected functions that belong to a selected criteria
+    const allowedCriteria = new Set(selectedEvalCriteria);
+    const selectedEvalFunctions = oldReport.selectedEvalFunctions.filter(
+      (evalFunction) => allowedCriteria.has(evalFunction.evalCriteria),
+    );
+
+    const [coverage, falseFailureRate] = this.getSelectedFunctionAlignment(
+      selectedEvalFunctions,
+      gradedResultMap,
+      gradedExamples,
+    );
+
+    // Assemble the updated report; function lists are carried over from the old one
+    return {
+      failureCoverage: coverage,
+      falseFailureRate,
+      selectedEvalFunctions: oldReport.selectedEvalFunctions,
+      allEvalFunctionReports: oldReport.allEvalFunctionReports,
+    };
+  }
+
+  /**
+   * Summarizes the cached results of every evaluation function.
+   * For each function in the results cache, counts how many of its cached
+   * results are PASS, FAIL, or anything else (counted as skipped).
+   *
+   * @returns A map from evaluation function to its pass/fail/skip counts.
+   */
+  public getOutcomes(): Map<
+    EvalFunction,
+    { passed: number; failed: number; skipped: number }
+  > {
+    const outcomes = new Map<
+      EvalFunction,
+      { passed: number; failed: number; skipped: number }
+    >();
+
+    // Tally straight from the results cache, one entry per eval function
+    this.resultsCache.forEach((results, evalFunction) => {
+      const tally = { passed: 0, failed: 0, skipped: 0 };
+
+      results.forEach((result) => {
+        switch (result) {
+          case EvalFunctionResult.PASS:
+            tally.passed++;
+            break;
+          case EvalFunctionResult.FAIL:
+            tally.failed++;
+            break;
+          default:
+            tally.skipped++;
+        }
+      });
+
+      outcomes.set(evalFunction, tally);
+    });
+
+    return outcomes;
+  }
+}
diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
new file mode 100644
index 000000000..958c1d70e
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
@@ -0,0 +1,84 @@
+// import { env as process_env } from "process";
+import { EventEmitter } from "events";
+// import { AzureKeyCredential, OpenAIClient } from "@azure/openai";
+import { hashtagTemplateVars, llmResponseDataToString } from "../utils";
+import { simpleQueryLLM } from "../backend";
+import { Dict, LLMSpec } from "../typing";
+import { extractMdBlocks } from "./utils";
+// Kind of candidate the LLM is asked to produce in `generate`:
+//  - "python_fn": the output is scanned for ```python code blocks
+//  - "llm_eval": the output is scanned for ```json blocks holding lists of prompt strings
+type ContentType = "python_fn" | "llm_eval";
+
+/**
+ * Asks an LLM for candidate assertion implementations and broadcasts them as events.
+ *
+ * Events emitted by `generate`:
+ *  - "function": once per candidate, with the candidate text as payload
+ *  - "end": once, after all candidates of a successful call have been emitted
+ * All validation errors are thrown before the first event is emitted.
+ */
+export class EvalGenAssertionEmitter extends EventEmitter {
+  private apiKeys: Dict | undefined;
+
+  /**
+   * @param apiKeys Optional API keys that are passed through to the LLM query.
+   */
+  constructor(apiKeys?: Dict) {
+    super();
+    this.apiKeys = apiKeys;
+  }
+
+  /**
+   * Queries the LLM once and emits every candidate found in its answer.
+   *
+   * @param prompt The prompt to send to the LLM.
+   * @param llm The model to query, as a name or a full spec.
+   * @param contentType What to extract from the answer: "llm_eval" expects ```json
+   * blocks containing lists of prompt strings, "python_fn" expects ```python blocks.
+   * @throws Error if the query reports errors, returns no response, or the
+   * answer cannot be parsed into the expected content type.
+   */
+  async generate(
+    prompt: string,
+    llm: string | LLMSpec,
+    contentType: ContentType,
+  ): Promise<void> {
+    const emit_prompt = (p: string) => this.emit("function", p);
+
+    const result = await simpleQueryLLM(
+      prompt, // prompt
+      typeof llm === "string" ? llm : [llm], // llm
+      // spec, // llm
+      "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.", // system_msg
+      this.apiKeys, // API keys (if any)
+    );
+
+    if (result.errors && Object.keys(result.errors).length > 0)
+      throw new Error(Object.values(result.errors as Dict)[0].toString());
+
+    // Get output (text from LLM response). Guard against an empty response list,
+    // which would otherwise surface as an opaque TypeError.
+    const firstResponse = result.responses?.[0]?.responses?.[0];
+    if (firstResponse == null)
+      throw new Error("EvalGen: The LLM query returned no responses.");
+    const output = llmResponseDataToString(firstResponse);
+    console.log("Streamer: LLM said: ", output); // for debugging
+
+    // Attempt to extract output depending on content type
+    if (contentType === "llm_eval") {
+      // Expected output is a ```json block that is just a list of three strings representing the prompts i.e. ["str1", "str2", "str3"]
+      // Attempt to extract JSON blocks (strings) from output
+      const json_blocks = extractMdBlocks(output, "json");
+      if (json_blocks === undefined || json_blocks.length === 0)
+        throw new Error(
+          "EvalGen: Could not parse LLM response into evaluation prompt: No JSON detected in output.",
+        );
+
+      // Parse every block; each should be a list of strings, flattened into one list
+      let prompts: unknown[];
+      try {
+        prompts = json_blocks.flatMap((b) => JSON.parse(b));
+      } catch (err) {
+        const reason = err instanceof Error ? err.message : String(err);
+        throw new Error(
+          "EvalGen: Could not parse LLM response into evaluation prompt: Invalid JSON in output: " +
+            reason,
+        );
+      }
+
+      // Verify format: every parsed entry must be a string
+      const string_prompts = prompts.filter(
+        (p): p is string => typeof p === "string",
+      );
+      if (string_prompts.length !== prompts.length) {
+        console.error(
+          "Unexpected output type after JSON parsing: At least one generated LLM eval prompt is not a string.",
+          prompts,
+        );
+        throw new Error("Unexpected output type after JSON parsing");
+      }
+
+      // We must be careful to first hashtag all template variables in the prompt
+      // before emitting them, so that they are not interpreted as template variables.
+      const hashtagged_prompts = string_prompts.map((p) =>
+        hashtagTemplateVars(p),
+      );
+      // Emit all the LLM eval prompt candidates in one burst
+      hashtagged_prompts.forEach(emit_prompt);
+    } else if (contentType === "python_fn") {
+      // Expected output has ~3 Python codeblocks within ```python markers
+      // Attempt to extract code blocks from output
+      const code_blocks = extractMdBlocks(output, "python");
+      if (code_blocks === undefined || code_blocks.length === 0)
+        throw new Error(
+          "EvalGen: Could not parse LLM response into Python function: No code detected in output.",
+        );
+
+      // If we passed, this should be a list of Python code functions. We assume it is OK, and treat them separately:
+      code_blocks.forEach(emit_prompt);
+    } else {
+      throw new Error("Unknown content type: " + contentType);
+    }
+
+    this.emit("end"); // Signal that streaming is complete
+  }
+}
diff --git a/chainforge/react-server/src/backend/evalgen/test.ts b/chainforge/react-server/src/backend/evalgen/test.ts
new file mode 100644
index 000000000..7766479cc
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/test.ts
@@ -0,0 +1,142 @@
+// import fs from "fs";
+// import csvParser from "csv-parser";
+// import readline from "readline";
+
+// import { Example, EvalCriteria, generateLLMEvaluationCriteria } from "./utils";
+// import EvaluationFunctionExecutor from "./executor";
+
+// const readCSV = async (filePath: string): Promise<Example[]> => {
+//   const examples: Example[] = [];
+//   let counter = 0; // Counter to generate unique IDs
+
+//   return new Promise((resolve, reject) => {
+//     fs.createReadStream(filePath)
+//       .pipe(csvParser(["prompt", "example", "response", "model"]))
+//       .on("data", (data) => {
+//         try {
+//           examples.push({
+//             id: `example_${++counter}`, // Generating a unique ID
+//             variables: data.example,
+//             prompt: data.prompt,
+//             response: data.response,
+//           });
+//         } catch (error) {
+//           // console.error("Error parsing variables from CSV:", error);
+//           // Don't throw here, just skip the example
+//         }
+//       })
+//       .on("end", () => resolve(examples))
+//       .on("error", reject);
+//   });
+// };
+
+// const rl = readline.createInterface({
+//   input: process.stdin,
+//   output: process.stdout,
+// });
+
+// const askQuestion = (query: string): Promise<string> =>
+//   new Promise((resolve) => rl.question(query, resolve));
+
+// const main = async () => {
+//   // Placeholder values - replace with actual data
+//   const promptTemplate = `You are an AI Assistant that’s an expert at reviewing pull requests. Review the below pull request that you receive.
+
+//   Input format
+//   - The input format follows Github diff format with addition and subtraction of code.
+//   - The + sign means that code has been added.
+//   - The - sign means that code has been removed.
+
+//   Instructions
+//   - Take into account that you don’t have access to the full code but only the code diff.
+//   - Only answer on what can be improved and provide the improvement in code.
+//   - Answer in short form.
+//   - Include code snippets if necessary.
+//   - Adhere to the languages code conventions.
+//   - Make it personal and always show gratitude to the author using "@" when tagging.`;
+
+//   let examples: Example[] = await readCSV("./codereviews.csv");
+
+//   // Get a sample of 10 examples
+//   examples = examples.slice(0, 10);
+
+//   // Print number of examples
+//   console.log(`Loaded ${examples.length} examples.`);
+
+//   // Start a timer
+//   let start = Date.now();
+//   let timeElapsed = 0;
+
+//   // Step 1: Suggest eval criteria and solicit approval
+//   const evalCriteria = await generateLLMEvaluationCriteria(promptTemplate);
+//   // Pause the timer
+//   timeElapsed += Date.now() - start;
+
+//   const approval = await askQuestion(
+//     "Do you approve the suggested criteria? (y/n) ",
+//   );
+
+//   if (approval.toLowerCase() !== "y") {
+//     console.log(
+//       "Please adjust the criteria directly in the source code for now.",
+//     );
+//     return;
+//   }
+
+//   const executor = new EvaluationFunctionExecutor(
+//     promptTemplate,
+//     examples,
+//   );
+
+//   // Set the evaluation criteria
+//   executor.setEvaluationCriteria(evalCriteria);
+
+//   // Resume the timer
+//   start = Date.now();
+
+//   // Step 2: Start background task
+//   executor.start();
+
+//   //   await executor.waitForCompletion();
+
+//   //   Step 3: Present examples to grade
+//   while (true) {
+//     // Get ungraded scores
+//     const ungradedScores = executor.getUngradedScores();
+//     console.log("Ungraded Scores: ", ungradedScores);
+
+//     const nextExampleId = executor.getNextExampleToGrade();
+//     if (!nextExampleId) {
+//       console.log("All examples graded or no examples available.");
+//       break;
+//     }
+
+//     const example = examples.find((e) => e.id === nextExampleId);
+//     if (!example) continue;
+
+//     console.log(
+//       `Example ID: ${example.id}, Prompt: ${example.prompt}, Response: ${example.response}`,
+//     );
+//     const grade = await askQuestion(
+//       "Is this response acceptable? (y/n/finish) ",
+//     );
+
+//     if (grade === "finish") {
+//       break;
+//     }
+
+//     executor.setGradeForExample(example.id, grade.toLowerCase() === "y");
+//   }
+
+//   // Print grades
+//   console.log("Grades: ", executor.getGrades());
+
+//   // Step 4: Filtering and results
+//   //   await executor.waitForCompletion();
+//   const filteredFunctions = await executor.filterEvaluationFunctions(0.2);
+//   console.log("Filtered Functions: ", filteredFunctions);
+
+//   rl.close();
+// };
+
+// main().catch(console.error);
diff --git a/chainforge/react-server/src/backend/evalgen/typing.ts b/chainforge/react-server/src/backend/evalgen/typing.ts
new file mode 100644
index 000000000..ef15d1551
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/typing.ts
@@ -0,0 +1,82 @@
+import { ChatHistoryInfo, Dict } from "../typing";
+
+/** Identifier of an evaluation criteria; this is the type of `EvalCriteria.uid`. */
+export type EvalCriteriaUID = string;
+
+/**
+ * A single criteria that responses are graded on and that eval functions implement.
+ */
+export interface EvalCriteria {
+  shortname: string; // presumably a short display label for the criteria; confirm against the UI
+  criteria: string; // presumably the full description of what is being checked; confirm against the UI
+  eval_method: "code" | "expert"; // "code" criteria are run with the Python executor; any other value is run as an LLM eval
+  uid: EvalCriteriaUID; // key under which the user's per-criteria grades are looked up
+  priority: number; // NOTE(review): meaning and scale are not visible here; confirm with the code that sets it
+  source?: string; // NOTE(review): origin of the criteria is not visible here; confirm with the code that sets it
+}
+
+/**
+ * Summary of an EvalGen run: a list of criteria together with two alignment numbers.
+ * NOTE(review): whether the two rates are 0-1 fractions or 0-100 percentages is
+ * not visible from this declaration; confirm against the code that fills it in.
+ */
+export interface EvalGenReport {
+  criteria: EvalCriteria[];
+  failureCoverage: number;
+  falseFailureRate: number;
+}
+
+/**
+ * Checks whether a parsed JSON value has the minimal shape of an eval criteria:
+ * it must be an object with "criteria" and "shortname" keys and an
+ * `eval_method` of "code" or "expert".
+ *
+ * Non-object input (null, strings, numbers, ...) yields false instead of
+ * throwing, since the `in` operator is only defined for objects.
+ *
+ * @param json_obj The parsed JSON value to check.
+ * @returns true if the value has the expected keys and a valid eval method.
+ */
+export function validEvalCriteriaFormat(json_obj: Dict): boolean {
+  if (typeof json_obj !== "object" || json_obj === null) return false;
+  return (
+    "criteria" in json_obj &&
+    "shortname" in json_obj &&
+    ["code", "expert"].includes(json_obj.eval_method)
+  );
+}
+
+/**
+ * Verdict of one eval function on one example.
+ * Code that tallies results treats anything other than PASS or FAIL as "skipped".
+ */
+export enum EvalFunctionResult {
+  PASS = "pass",
+  FAIL = "fail",
+  SKIP = "skip",
+}
+
+/**
+ * A candidate implementation of one evaluation criteria.
+ */
+export interface EvalFunction {
+  evalCriteria: EvalCriteria; // the criteria this function implements; its eval_method decides how the function is executed
+  code: string; // presumably Python source for "code" criteria and an LLM prompt otherwise; confirm against the generator
+  name: string;
+  uid: string;
+}
+
+/**
+ * Alignment of a single eval function with the user's grades for its criteria.
+ * In the counts below, "pass"/"fail" is the eval function's verdict and
+ * "true"/"false" says whether that verdict matched the user's grade.
+ */
+export interface EvalFunctionReport {
+  evalFunction: EvalFunction;
+  true_pass: number; // function passed, user passed
+  true_fail: number; // function failed, user failed
+  false_pass: number; // function passed, user failed
+  false_fail: number; // function failed, user passed
+  skipped: number; // function returned neither pass nor fail on a graded example
+  mcc?: number; // Matthews correlation coefficient, which is a measure of the quality of binary classifications
+  f1?: number; // F1 score, which is the harmonic mean of precision and recall
+  cohens_kappa?: number; // Cohen's kappa, which is a measure of inter-rater agreement
+  failureCoverage?: number; // Share of user-failed examples that the function also failed, stored as a 0-1 fraction (not a percentage)
+  falseFailureRate?: number; // Share of user-passed examples that the function failed, stored as a 0-1 fraction (not a percentage)
+}
+
+/**
+ * Alignment of a whole set of selected eval functions with the user's grades.
+ * Unlike the per-function `EvalFunctionReport`, the two rates here are produced
+ * on a 0-100 percentage scale by the executor.
+ */
+export interface EvalFunctionSetReport {
+  failureCoverage: number; // percent of user-failed examples that at least one selected function failed
+  falseFailureRate: number; // percent of user-passed examples that at least one selected function failed
+  selectedEvalFunctions: EvalFunction[]; // the best-aligned function chosen for each criteria
+  allEvalFunctionReports: Map<EvalCriteria, EvalFunctionReport[]>; // Map from criteria to function reports
+}
+
+/**
+ * Error raised when executing an eval function fails.
+ *
+ * The prototype is restored explicitly so that `instanceof` keeps working when
+ * the code is compiled to targets where extending the built-in `Error` loses
+ * the prototype chain. `new.target.prototype` is used (rather than this class's
+ * own prototype) so that subclasses keep their own prototype as well.
+ */
+export class EvalExecutionError extends Error {
+  constructor(message: string) {
+    super(message); // Call the parent constructor with the message
+    this.name = "EvalExecutionError"; // Set the error name to the class name
+    Object.setPrototypeOf(this, new.target.prototype);
+  }
+}
+
+// System message that frames the LLM as an assertion writer for an LLM pipeline.
+// NOTE(review): EvalGenAssertionEmitter.generate passes the same text as a
+// separate string literal; keep the two in sync (or share this constant).
+export const AssertionWriterSystemMsg =
+  "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.";
+// The same system message packaged as a one-element chat history
+// (a single "system" message and an empty fill_history).
+export const AssertionWriterSystemMsgChatHistory: ChatHistoryInfo[] = [
+  {
+    messages: [
+      {
+        role: "system",
+        content: AssertionWriterSystemMsg,
+      },
+    ],
+    fill_history: {},
+  },
+];
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
new file mode 100644
index 000000000..76c4194e7
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -0,0 +1,550 @@
+// Interfaces and utility functions
+// TODO: Use ChainForge's openai utils (I tried but got errors)
+// import { AzureOpenAIStreamer } from "./oai_utils";
+import { EventEmitter } from "events";
+import {
+  AssertionWriterSystemMsg,
+  EvalCriteria,
+  EvalFunction,
+  EvalFunctionResult,
+  validEvalCriteriaFormat,
+} from "./typing";
+import { Dict, LLMResponse, LLMSpec } from "../typing";
+import {
+  evalWithLLM,
+  executejs,
+  executepy,
+  queryLLM,
+  simpleQueryLLM,
+} from "../backend";
+import {
+  getVarsAndMetavars,
+  hashtagTemplateVars,
+  llmResponseDataToString,
+  retryAsyncFunc,
+} from "../utils";
+import { v4 as uuid } from "uuid";
+import { EvalGenAssertionEmitter } from "./oai_utils";
+import {
+  buildContextPromptForVarsMetavars,
+  buildGenEvalCodePrompt,
+} from "../../AiPopover";
+import { escapeBraces } from "../template";
+
+/**
+ * Extracts the contents of fenced markdown blocks that open with "```<blockName>"
+ * and close with "```". The fences and the block name are excluded from the return.
+ *
+ * @param mdText The markdown text to search
+ * @param blockName The tag that follows the opening fence (e.g. "json"). It is matched literally.
+ * @returns The captured block bodies in order of appearance, or undefined if no block was found
+ */
+export function extractMdBlocks(
+  mdText: string,
+  blockName: string,
+): string[] | undefined {
+  // Escape regex metacharacters so that names such as "c++" are matched literally;
+  // left unescaped they would change the pattern's meaning or make it throw.
+  const escapedName = blockName.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const regex = new RegExp(`\`\`\`${escapedName}(.*?)\`\`\``, "gs");
+  const matches: string[] = [];
+  let match: RegExpExecArray | null;
+
+  // The "g" flag makes exec() resume after the previous match on each call
+  while ((match = regex.exec(mdText)) !== null) matches.push(match[1]);
+
+  if (matches.length > 0) return matches;
+
+  console.error("No md blocks found for name:", blockName);
+  return undefined;
+}
+
+/**
+ * Given the user's prompt, generates a list of criteria in JSON format.
+ *
+ * FUTURE: One might consider giving more contextual information, e.g. input vars to the prompt or prompt history.
+ *
+ * @param prompt The user's prompt (must be 'concrete'/escaped braces)
+ * @param llm The LLM (name or spec) to query
+ * @param apiKeys API keys to forward to the query (if any)
+ * @param promptTemplate If given, replaces the entire default prompt sent to the LLM
+ *                       (in which case `prompt` and `userFeedback` are not included)
+ * @param systemMsg Overrides the default system message, if present. Use null to specify empty.
+ * @param userFeedback Graded responses (with optional notes) to include in the default prompt
+ * @returns A list of parsed `EvalCriteria`
+ * @throws The last encountered error, if all 3 attempts fail
+ */
+export async function generateLLMEvaluationCriteria(
+  prompt: string,
+  llm: string | LLMSpec,
+  apiKeys?: Dict,
+  promptTemplate?: string, // overrides prompt template used
+  systemMsg?: string | null, // overrides default system message, if present. Use null to specify empty.
+  userFeedback?: { grade: boolean; note?: string; response: string }[], // user feedback to include in the prompt
+): Promise<EvalCriteria[]> {
+  // Compose user feedback. An empty list produces no section at all, so the LLM
+  // is never told "here is some feedback" with nothing following it.
+  let userFeedbackPrompt = "";
+  if (userFeedback && userFeedback.length > 0) {
+    userFeedbackPrompt = `\n\n-----------------\nHere is some feedback on the LLM's responses to this prompt:`;
+    for (const feedback of userFeedback) {
+      userFeedbackPrompt += `\n\nFor the response: "${feedback.response}", the user gave the following feedback:`;
+      // Only report real grades; a null/cleared grade must not be reported as "Bad"
+      if (typeof feedback.grade === "boolean") {
+        userFeedbackPrompt += `\nGrade: ${feedback.grade === true ? "Good" : "Bad"}`;
+      }
+      if (feedback.note !== undefined && feedback.note !== null) {
+        userFeedbackPrompt += `\nExplanation for grade: "${feedback.note}"`;
+      }
+    }
+    userFeedbackPrompt += "\n-----------------\n";
+  }
+
+  // Construct the detailed prompt for the LLM
+  const detailedPrompt =
+    promptTemplate ??
+    `Here is my LLM prompt template:
+  
+  \`${prompt}\`
+
+    ${userFeedbackPrompt}
+    
+    Based on the instructions in the prompt that need to be followed, I want to write a list of assertions for my LLM pipeline to run on all pipeline responses. Give me a list of 3 distinct criteria to check for in LLM responses. Each item in the list should contain a string description of a criteria to check for, and whether it should be evaluated with code or by an expert if the criteria is difficult to evaluate. Your answer should be a JSON list of objects within \`\`\`json \`\`\` markers, where each object has the following three fields: "criteria", "shortname", and "eval_method" (code or expert). At most 3 criteria should have eval_method as expert. The "criteria" should be short, and the "shortname" should be a very brief title for the criteria. Each evaluation criteria should test a concept that should evaluate to "true" in the ideal case.`;
+
+  // undefined -> default system message; null -> no system message; string -> as given
+  const effectiveSystemMsg =
+    systemMsg === undefined ? AssertionWriterSystemMsg : systemMsg ?? undefined;
+
+  // Query the LLM (below, we will try this up to 3 times)
+  async function _query() {
+    const result = await simpleQueryLLM(
+      detailedPrompt, // prompt
+      typeof llm === "string" ? llm : [llm], // llm
+      effectiveSystemMsg, // system_msg
+      apiKeys, // API keys (if any)
+    );
+
+    if (result.errors && Object.keys(result.errors).length > 0)
+      throw new Error(Object.values(result.errors as Dict)[0].toString());
+
+    // Get output (text from LLM response)
+    const output = llmResponseDataToString(result.responses[0].responses[0]);
+    // console.log("LLM said: ", output); // for debugging
+
+    // Attempt to extract JSON blocks (strings) from input
+    const json_blocks = extractMdBlocks(output, "json");
+    if (json_blocks === undefined || json_blocks.length === 0)
+      throw new Error(
+        "EvalGen: Could not parse LLM response into evaluation criteria: No JSON detected in output.",
+      );
+
+    // Attempt to parse all JSON blocks into objects
+    // (a block may hold a single object or a list, hence the flatten)
+    const data: EvalCriteria[] = json_blocks.map((s) => JSON.parse(s)).flat(1);
+
+    // console.log("Parsed", data);
+
+    // Double-check the formatting
+    if (data.every(validEvalCriteriaFormat)) {
+      // Initialize any required properties
+      data.forEach((d) => {
+        d.uid = uuid();
+        d.priority = 0;
+      });
+      return data;
+    }
+    // Incorrect formatting
+    else
+      throw new Error(
+        "EvalGen: At least one JSON block was not in expected EvalCriteria format.",
+      );
+  }
+
+  // Retry up to 3 times; otherwise, we will throw the last encountered error.
+  return retryAsyncFunc(_query, 3);
+}
+
+/**
+ * Builds the prompt that asks an LLM to turn a free-text criteria description
+ * into JSON with "criteria", "shortname", and "eval_method" fields.
+ *
+ * @param desc The user's description of the criteria; inserted verbatim inside a fenced block
+ * @returns The prompt text
+ */
+export function getPromptForGenEvalCriteriaFromDesc(desc: string) {
+  // The prompt is assembled line by line; trailing spaces are part of the text.
+  const promptLines = [
+    "I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. ",
+    "",
+    "CRITERIA: ",
+    "```",
+    `${desc}`,
+    "```",
+    "",
+    `Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`,
+  ];
+  return promptLines.join("\n");
+}
+
+/**
+ * Runs an LLM-based evaluator (the prompt held in `evalFunction.code`) against a single
+ * response and maps the LLM's verdict to an `EvalFunctionResult`.
+ *
+ * @param evalFunction The eval function whose `code` is the criteria prompt for the evaluator LLM
+ * @param llm The LLM (name or spec) to use as the evaluator
+ * @param example The response object to evaluate
+ * @param positiveExample Optional GOOD example; only used when a BAD example is also provided
+ * @param negativeExample Optional BAD example; only used when a GOOD example is also provided
+ * @returns PASS or FAIL according to the verdict; SKIP if the evaluation produced no result
+ *          or the verdict could not be parsed as yes/no
+ */
+export async function executeLLMEval(
+  evalFunction: EvalFunction,
+  llm: string | LLMSpec,
+  example: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
+): Promise<EvalFunctionResult> {
+  // The LLM eval prompt might include template vars. We need to add
+  // a hashtag to indicate to ChainForge that it should use the
+  // fill_history in the provided `example` LLMResponse.
+  const candidateCriteriaPrompt = hashtagTemplateVars(evalFunction.code);
+
+  // Construct call to an LLM to evaluate the example
+  const evalPrompt =
+    "Evaluate the text below according to this criteria: " +
+    candidateCriteriaPrompt +
+    ' Only return "yes" or "no", nothing else.\n\n```\n' +
+    "{__input}" +
+    "\n```";
+
+  // Query an LLM as an evaluator. The system message is only set when
+  // both a good and a bad example are available.
+  let systemMessage;
+  if (
+    positiveExample &&
+    positiveExample.responses.length > 0 &&
+    negativeExample &&
+    negativeExample.responses.length > 0
+  ) {
+    systemMessage =
+      "You are an expert evaluator. Please consider the following GOOD example:\n" +
+      llmResponseDataToString(positiveExample.responses[0]) +
+      "\n\nand BAD example:\n" +
+      llmResponseDataToString(negativeExample.responses[0]) +
+      "\n\nwhen making your evaluation.";
+  }
+
+  // We use ChainForge's infrastructure for running LLM evaluators
+  // to score responses based on the criteria.
+  const { responses, errors } = await evalWithLLM(
+    Date.now().toString(), // id to refer to this query
+    llm, // llm
+    evalPrompt,
+    [example], // we pass in a single example
+    undefined,
+    undefined,
+    undefined,
+    systemMessage,
+  );
+
+  if (
+    !responses ||
+    responses.length === 0 ||
+    !responses[0].eval_res ||
+    responses[0].eval_res.items.length === 0
+  ) {
+    console.error(
+      "Error executing LLM eval candidate:",
+      errors,
+      evalFunction.code,
+    );
+    return EvalFunctionResult.SKIP;
+  }
+
+  // Get the output
+  const output = responses[0].eval_res?.items[0];
+  // This should be a boolean... but we need to parse it.
+  // For text verdicts, match "yes"/"no" as whole words and ignore case, so that
+  // "Yes." is recognized and words such as "know" or "not" are not read as "no".
+  const verdictText =
+    typeof output === "string" ? output.toLowerCase() : undefined;
+  const is_pass =
+    output === true ||
+    (verdictText !== undefined && /\byes\b/.test(verdictText));
+  const is_fail =
+    output === false ||
+    (verdictText !== undefined && /\bno\b/.test(verdictText));
+
+  // Parse the response to determine the value to return ("yes" takes precedence)
+  if (is_pass) {
+    return EvalFunctionResult.PASS;
+  } else if (is_fail) {
+    return EvalFunctionResult.FAIL;
+  } else {
+    console.warn(
+      "executeLLMEval: Warning: Could not find 'yes' or 'no' in response.",
+      evalPrompt,
+      output,
+    );
+    return EvalFunctionResult.SKIP;
+  }
+}
+
+/**
+ * Runs the JavaScript eval code in `evalFunction.code` against one LLM response object.
+ * Any failure (execution error, missing or non-boolean result) is logged and reported as SKIP.
+ * @param evalFunction The eval function whose `code` is executed
+ * @param example The LLM response object to evaluate
+ * @param iframe_id The id passed through to `executejs`
+ * @returns `EvalFunctionResult`
+ */
+export async function execJSFunc(
+  evalFunction: EvalFunction,
+  example: LLMResponse,
+  iframe_id: string,
+) {
+  try {
+    const outcome = await executejs(
+      iframe_id,
+      evalFunction.code,
+      [example],
+      "response",
+      "evaluator",
+    );
+
+    // Surface execution errors through the shared catch below
+    if (outcome.error !== undefined) throw new Error(outcome.error);
+
+    // Pull the first score of the first (and only) evaluated response, if any
+    const verdict = outcome.responses
+      ? (outcome.responses[0] as LLMResponse).eval_res?.items[0]
+      : undefined;
+
+    // NOTE: EvalGen only supports assertion functions at this time,
+    // so anything other than a boolean is treated as a failure to execute.
+    if (typeof verdict === "boolean")
+      return verdict ? EvalFunctionResult.PASS : EvalFunctionResult.FAIL;
+
+    throw new Error(
+      "Non-boolean return value encountered when executing JS eval code. Value: " +
+        verdict,
+    );
+  } catch (err) {
+    console.error(err);
+    return EvalFunctionResult.SKIP;
+  }
+}
+
+/**
+ * Executes a Python function, described by evalFunction, against the "example" LLM response object.
+ * NOTE: This runs in a sandbox using pyodide.
+ * Any failure (execution error, missing or non-boolean result) is logged and reported as SKIP.
+ *
+ * @param evalFunction The eval function; `code` is the Python source and `name` the function to run
+ * @param llm Not used, but provided for consistency with the other exec func signature
+ * @param example The LLM response object to evaluate
+ * @param positiveExample Not used by this function
+ * @param negativeExample Not used by this function
+ * @returns `EvalFunctionResult`
+ */
+export async function execPyFunc(
+  evalFunction: EvalFunction,
+  llm: string | LLMSpec, // not used, but provided for consistency with the other exec func signature
+  example: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
+): Promise<EvalFunctionResult> {
+  try {
+    // We need to replace the function name with "evaluate", which is what is expected by backend.
+    // Match the definition of exactly this name (any spacing, up to the opening parenthesis),
+    // so that a helper sharing the same prefix (e.g. `def check_len` vs `def check`) is not renamed.
+    const escapedName = evalFunction.name.replace(
+      /[.*+?^${}()|[\]\\]/g,
+      "\\$&",
+    );
+    let code = evalFunction.code.replace(
+      new RegExp(`\\bdef\\s+${escapedName}\\s*\\(`),
+      "def evaluate(",
+    );
+
+    // Add `import re` to the code unless it has a real `import re` / `from re import` statement.
+    // A plain substring check would be fooled by e.g. `import requests`.
+    // (A redundant extra `import re` is harmless in Python.)
+    const importsRe = /^\s*(?:import\s+re\b|from\s+re\s+import\b)/m.test(code);
+    if (!importsRe) code = "import re\n" + code;
+
+    // console.log(`Executing function: ${code}`);
+
+    // Execute the function via pyodide
+    const result = await executepy(
+      uuid(),
+      code,
+      [example],
+      "response",
+      "evaluator",
+      undefined,
+      "pyodide", // execute in sandbox with a pyodide WebWorker
+    );
+
+    // Check for errors
+    if (result.error !== undefined) throw new Error(result.error);
+
+    // console.log("Result:", result);
+
+    // Extract the evaluation result
+    const eval_res = result.responses
+      ? (result.responses[0] as LLMResponse).eval_res?.items[0]
+      : undefined;
+
+    // Check that the evaluation result is a boolean value
+    // NOTE: EvalGen only supports assertion functions at this time.
+    if (typeof eval_res !== "boolean")
+      throw new Error(
+        "Non-boolean return value encountered when executing Python eval code. Value: " +
+          eval_res,
+      );
+
+    return eval_res ? EvalFunctionResult.PASS : EvalFunctionResult.FAIL;
+  } catch (err) {
+    console.error(err);
+    return EvalFunctionResult.SKIP;
+  }
+}
+
+/**
+ * Asks an LLM to generate candidate eval functions for one criteria. Each function
+ * definition produced by the generator is handed to `processAndEmitFunction`
+ * together with `emitter`.
+ *
+ * @param criteria The criteria to generate candidate functions for
+ * @param llm The LLM (name or spec) used for generation
+ * @param promptTemplate The prompt template of the LLM pipeline being evaluated
+ * @param example An example response, used as context for the generation prompt
+ * @param emitter Emitter passed on to `processAndEmitFunction` for every generated function
+ * @param badExample Optional response that does not meet the criteria
+ * @param apiKeys API keys handed to the generator (if any)
+ * @throws Error if generation fails (the underlying error is logged to the console)
+ */
+export async function generateFunctionsForCriteria(
+  criteria: EvalCriteria,
+  llm: string | LLMSpec,
+  promptTemplate: string,
+  example: LLMResponse,
+  emitter: EventEmitter,
+  badExample?: LLMResponse,
+  apiKeys?: Dict,
+): Promise<void> {
+  const functionGenPrompt = buildFunctionGenPrompt(
+    criteria,
+    promptTemplate,
+    example,
+    badExample,
+  );
+  console.log("Function generation prompt:", functionGenPrompt);
+
+  // Forwards every generated definition, tagged with its criteria
+  const onFunction = (functionDefinition: string) => {
+    processAndEmitFunction(criteria, functionDefinition, emitter);
+  };
+
+  try {
+    // "expert" criteria get LLM evaluator prompts; everything else gets Python functions
+    const modelType =
+      criteria.eval_method !== "expert" ? "python_fn" : "llm_eval";
+
+    const streamer = new EvalGenAssertionEmitter(apiKeys);
+    streamer.on("function", onFunction);
+    await streamer.generate(functionGenPrompt, llm, modelType);
+  } catch (error) {
+    console.error("Error generating function for criteria:", error);
+    throw new Error(
+      `Failed to generate function for criteria: ${criteria.criteria}`,
+    );
+  }
+}
+
+/**
+ * Builds the prompt asking an LLM to generate candidate eval functions for one criteria:
+ * evaluator prompts when the criteria's eval_method is "expert", Python assertions otherwise.
+ *
+ * @param criteria The criteria to generate candidate implementations for
+ * @param promptTemplate The prompt template of the LLM pipeline being evaluated
+ * @param example A response whose vars/metavars are offered to the LLM as context
+ * @param badExample Optional response that does NOT meet the criteria
+ * @returns The prompt, with braces escaped
+ */
+function buildFunctionGenPrompt(
+  criteria: EvalCriteria,
+  promptTemplate: string,
+  example: LLMResponse,
+  badExample?: LLMResponse,
+): string {
+  let badExampleSection = "";
+  if (badExample) {
+    badExampleSection = `
+    Here is an example response that DOES NOT meet the criteria:
+    \`\`\`
+    ${llmResponseDataToString(badExample.responses[0])}
+    \`\`\`
+    `;
+  }
+
+  if (criteria.eval_method === "expert") {
+    const varsAndMetavars = getVarsAndMetavars([example]);
+    // Turn the vars and metavars into a string: one ` - "name": value` line per
+    // variable (an empty list yields an empty string). Each list is resolved
+    // against its own dictionary on the example.
+    // NOTE(review): values are printed as stored on the response object — confirm
+    // they do not need to be resolved through a string lookup first.
+    const _composeVarsContext = (names: string[], values?: Dict): string =>
+      names.map((v) => ` - "${v}": ${values?.[v]}`).join("\n");
+    const varsAndMetavarsContext =
+      _composeVarsContext(varsAndMetavars.vars, example.vars) +
+      "\n" +
+      _composeVarsContext(varsAndMetavars.metavars, example.metavars);
+    // With no vars and no metavars the context is just "\n", so the section is left out
+    const varsAndMetavarsContextPrompt =
+      varsAndMetavarsContext.length > 2
+        ? `\n\nIn your prompts, it may be useful to refer to metadata associated with the LLM output, such as when you are comparing to a ground truth. For instance, consider a situation where the user has a prompt template with a variable {writing_style} —'poem', 'text message', or 'formal letter' —and they want to validate that the LLM's output was really in that style. You would produce a prompt template like:
+
+"Respond with 'yes' if the text below is in the style of a {writing_style}, 'no' if not. Only reply with the classification, nothing else."
+
+The template indicates that the same {writing_style} variable used upstream in the LLM pipeline, should be used in your evaluation prompt.
+
+If you want to refer to the value of an input variable, you **must** use template braces like {variable}.
+
+Here are the variables you have access to (keys), and example values for one output: 
+${varsAndMetavarsContext}`
+        : "";
+
+    return escapeBraces(`Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\nYour task is to devise a prompt for an expert to evaluate the pipeline's responses based on the following criteria: "${criteria.criteria}"
+    ${badExampleSection}
+    You will devise 3 prompts for the evaluation criterion to see which has the best accuracy. Each prompt you generate should be a short question that an expert can answer with a "yes" or "no" to evaluate entire criteria (don't miss anything in the criteria). Try different variations/wordings in the prompts. ${varsAndMetavarsContextPrompt}
+    
+    Return your prompts in a JSON list of strings within \`\`\`json \`\`\` markers. Each string should be a question for the expert to answer, and each question should be contained on its own line.
+    ---
+    `);
+  } else {
+    const prompt = `Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\n, your task is to devise multiple Python assertions to evaluate LLM responses based on the criteria "${criteria.shortname}". 
+    ${badExampleSection}
+    Create 3 implementations of the criterion.
+    ${buildGenEvalCodePrompt("python", buildContextPromptForVarsMetavars(getVarsAndMetavars([example])), criteria.criteria, true)}
+    Be creative in your implementations. Our goal is to explore diverse approaches to evaluate LLM responses effectively. Try to avoid using third-party libraries for code-based evaluation methods. Include the full implementation of each function in separate "\`\`\`python" blocks. Each function should return only True or False.`;
+
+    return escapeBraces(prompt); // Escape braces in the prompt
+  }
+}
+
+/**
+ * Wraps a generated function definition in an `EvalFunction` and emits it on
+ * `emitter` as a "functionGenerated" event.
+ *
+ * For criteria evaluated with "code", the function's name is parsed out of the
+ * first Python `def` in the definition; if none can be found, nothing is emitted.
+ * For every other eval method the definition itself serves as the name.
+ */
+function processAndEmitFunction(
+  criteria: EvalCriteria,
+  functionDefinition: string,
+  emitter: EventEmitter,
+): void {
+  const candidate: EvalFunction = {
+    evalCriteria: criteria,
+    code: functionDefinition,
+    name: functionDefinition,
+    uid: uuid(),
+  };
+
+  if (criteria.eval_method === "code") {
+    const parsedName = /def\s+([a-zA-Z_]\w*)\s*\(/.exec(functionDefinition);
+    if (parsedName === null) {
+      console.error(
+        "Could not extract the function name from the provided code.",
+      );
+      return; // Nothing usable to emit without a function name
+    }
+    candidate.name = parsedName[1];
+  }
+
+  emitter.emit("functionGenerated", candidate);
+}
+
+/**
+ * Calculates the F1 score based on true positives, false positives, and false negatives.
+ * The F1 score is the harmonic mean of precision and recall.
+ * Precision = TP / (TP + FP)
+ * Recall = TP / (TP + FN)
+ * F1 = 2 * (Precision * Recall) / (Precision + Recall)
+ * @param true_positive The number of true positive predictions
+ * @param false_positive The number of false positive predictions
+ * @param false_negative The number of false negative predictions
+ * @returns The F1 score, or undefined when it cannot be computed: precision is undefined
+ *          (TP + FP is zero), recall is undefined (TP + FN is zero), or both are zero
+ */
+export function calculateF1Score(
+  true_positive: number,
+  false_positive: number,
+  false_negative: number,
+): number | undefined {
+  const predictedPositives = true_positive + false_positive;
+  const actualPositives = true_positive + false_negative;
+  // 0 / 0 would yield NaN, which would slip past the zero check below
+  // and be returned to the caller as if it were a score.
+  if (predictedPositives === 0 || actualPositives === 0) return undefined;
+
+  const precision = true_positive / predictedPositives;
+  const recall = true_positive / actualPositives;
+  if (precision + recall === 0) return undefined; // Avoid division by zero
+  return (2 * precision * recall) / (precision + recall);
+}
+
+/**
+ * Computes the Matthews correlation coefficient (MCC) from confusion-matrix counts:
+ * ```
+ *  MCC = (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+ * ```
+ * @param true_positive Count of true positive predictions
+ * @param true_negative Count of true negative predictions
+ * @param false_positive Count of false positive predictions
+ * @param false_negative Count of false negative predictions
+ * @returns The MCC, or undefined when the denominator is zero
+ */
+export function calculateMCC(
+  true_positive: number,
+  true_negative: number,
+  false_positive: number,
+  false_negative: number,
+): number | undefined {
+  // The four sums that make up the product under the square root
+  const tpPlusFp = true_positive + false_positive;
+  const tpPlusFn = true_positive + false_negative;
+  const tnPlusFp = true_negative + false_positive;
+  const tnPlusFn = true_negative + false_negative;
+
+  const denominator = Math.sqrt(tpPlusFp * tpPlusFn * tnPlusFp * tnPlusFn);
+  const numerator =
+    true_positive * true_negative - false_positive * false_negative;
+
+  // A zero denominator means the coefficient is undefined
+  return denominator === 0 ? undefined : numerator / denominator;
+}
+
+/**
+ * Computes Cohen's Kappa from confusion-matrix counts.
+ * ```
+ *  Kappa = (Po - Pe) / (1 - Pe)
+ * ```
+ * where Po is the observed agreement and Pe is the expected agreement.
+ * The implementation uses the count-based form
+ * `2 * (TP * TN - FP * FN) / ((TP + FP) * (FP + TN) + (TP + FN) * (FN + TN))`.
+ * @param TP Count of true positive predictions
+ * @param TN Count of true negative predictions
+ * @param FP Count of false positive predictions
+ * @param FN Count of false negative predictions
+ * @returns Cohen's Kappa, or undefined when the denominator is zero
+ */
+export function calculateCohensKappa(
+  TP: number,
+  TN: number,
+  FP: number,
+  FN: number,
+): number | undefined {
+  const denominator = (TP + FP) * (FP + TN) + (TP + FN) * (FN + TN);
+  // Bail out before dividing by zero
+  if (denominator === 0) return undefined;
+
+  const numerator = 2 * (TP * TN - FP * FN);
+  return numerator / denominator;
+}
diff --git a/chainforge/react-server/src/backend/typing.ts b/chainforge/react-server/src/backend/typing.ts
index a3af8d4ce..2559782db 100644
--- a/chainforge/react-server/src/backend/typing.ts
+++ b/chainforge/react-server/src/backend/typing.ts
@@ -330,6 +330,8 @@ export type TabularDataColType = {
 
 export type PythonInterpreter = "flask" | "pyodide";
 
+// Ratings keyed by number; each rating is a boolean, a string, or absent (null/undefined).
+// NOTE(review): the key is presumably the index of a response within a response object — confirm.
+export type RatingDict = Record<number, boolean | string | null | undefined>;
+
 export interface FileWithContent extends FileWithPath {
   content?: string;
 }
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index 59f909eeb..887f8b22c 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -54,6 +54,7 @@ import {
 } from "@mirai73/bedrock-fm";
 import StorageCache, { StringLookup, MediaLookup } from "./cache";
 import Compressor from "compressorjs";
+import { Annotations } from "plotly.js";
 // import { Models } from "@mirai73/bedrock-fm/lib/bedrock";
 
 const ANTHROPIC_HUMAN_PROMPT = "\n\nHuman:";
@@ -2004,14 +2005,14 @@ function _extract_anthropic_chat_responses(
 function _extract_anthropic_text_responses(
   response: Array<Dict>,
 ): Array<string> {
-  return response.map((r: Dict) => r.completion.trim());
+  return response.map((r: Dict) => r.completion?.trim());
 }
 
 /**
  * Extracts the text part of a HuggingFace text completion.
  */
 function _extract_huggingface_responses(response: Array<Dict>): Array<string> {
-  return response.map((r: Dict) => r.generated_text.trim());
+  return response.map((r: Dict) => r.generated_text?.trim());
 }
 
 /**
@@ -2027,7 +2028,7 @@ function _extract_alephalpha_responses(response: Dict): Array<string> {
 function _extract_ollama_responses(
   response: Array<Dict>,
 ): Array<LLMResponseData> {
-  return response.map((r: any) => r.generated_text.trim());
+  return response.map((r: any) => r.generated_text?.trim());
 }
 
 /**
@@ -2843,3 +2844,87 @@ export const __http_url_to_base64 = (url: string) => {
     xhr.send();
   });
 };
+
+/**
+ * Removes one pair of double quotes wrapping a string, if present.
+ * Anything that is not a string of at least two characters that both starts
+ * and ends with `"` is returned as-is.
+ */
+export const stripWrappingQuotes = (str: string) => {
+  const isWrapped =
+    typeof str === "string" &&
+    str.length >= 2 &&
+    str.startsWith('"') &&
+    str.endsWith('"');
+  return isWrapped ? str.slice(1, -1) : str;
+};
+
+/**
+ * Maps an accuracy value to a color name:
+ * above 0.9 is "green", above 0.7 "yellow", above 0.5 "orange", anything else "red".
+ */
+export const accuracyToColor = (acc: number) => {
+  // Thresholds are checked from highest to lowest; the first one exceeded wins
+  if (acc > 0.9) return "green";
+  if (acc > 0.7) return "yellow";
+  if (acc > 0.5) return "orange";
+  return "red";
+};
+
+/**
+ * Builds one text annotation per cell of a matrix, for use as plot annotations.
+ *
+ * @param x Column labels; `x[j]` is the x position of column j
+ * @param y Row labels; `y[i]` is the y position of row i
+ * @param z Cell values, indexed as `z[i][j]` (row i, column j)
+ * @returns A flat list of annotations in row-major order, each showing the cell's value as text
+ */
+export const cmatrixTextAnnotations = (
+  x: string[],
+  y: string[],
+  z: number[][],
+) => {
+  const annotations = [];
+  // NOTE(review): despite its name, this is the MAXIMUM cell value, so every cell
+  // below the maximum gets white text and only the maximum cell(s) get black.
+  // If a midpoint threshold was intended, this needs changing — confirm.
+  const midVal = Math.max(...z.flat());
+  for (let i = 0; i < y.length; i++) {
+    for (let j = 0; j < x.length; j++) {
+      annotations.push({
+        xref: "x1",
+        yref: "y1",
+        x: x[j],
+        y: y[i],
+        text: z[i][j].toString(),
+        font: {
+          // family: "monospace",
+          // size: 12,
+          color: z[i][j] < midVal ? "white" : "black",
+        },
+        showarrow: false,
+      });
+    }
+  }
+  return annotations as Partial<Annotations>[];
+};
+
+/**
+ * Prefixes template variables in a string with a hashtag:
+ * an unescaped `{template}` becomes `{#template}`.
+ * Escaped braces (`\{` and `\}`) and already-hashtagged templates are left untouched,
+ * as is an opening brace that is the very last character of the string.
+ *
+ * @param input - The input string containing templates
+ * @returns The string with templates converted to hashtagged form
+ */
+export function hashtagTemplateVars(input: string): string {
+  // First alternative: an escaped brace, consumed as a pair and kept verbatim.
+  // Second alternative: an opening brace followed by any character other than "#".
+  const escapedOrOpening = /\\[{}]|\{(?=[^#])/g;
+  return input.replace(escapedOrOpening, (token) =>
+    token === "{" ? "{#" : token,
+  );
+}
diff --git a/chainforge/react-server/src/store.tsx b/chainforge/react-server/src/store.tsx
index f16f6b281..bb381416e 100644
--- a/chainforge/react-server/src/store.tsx
+++ b/chainforge/react-server/src/store.tsx
@@ -16,6 +16,7 @@ import {
   deepcopy,
   transformDict,
   APP_IS_RUNNING_LOCALLY,
+  llmResponseDataToString,
 } from "./backend/utils";
 import { DuplicateVariableNameError } from "./backend/errors";
 import {
@@ -27,12 +28,12 @@ import {
   TabularDataColType,
   TabularDataRowType,
   JSONCompatible,
+  LLMResponse,
 } from "./backend/typing";
 import { TogetherChatSettings } from "./ModelSettingSchemas";
 import { NativeLLM } from "./backend/models";
 import { StringLookup } from "./backend/cache";
 import { saveGlobalConfig } from "./backend/backend";
-import { remove } from "jszip";
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
 // Initial project settings
@@ -462,6 +463,9 @@ export interface StoreHandles {
   state: Dict;
   setState: (key: string, val: any) => void;
   importState: (state: Dict) => void;
+  exportGradesAndNotes: (
+    responses: LLMResponse[],
+  ) => { grade: boolean; note?: string; response: string }[];
 
   // The color to represent a specific LLM, to be globally consistent
   llmColors: Dict<string>;
@@ -664,6 +668,25 @@ const useStore = create<StoreHandles>((set, get) => ({
       state,
     }));
   },
+  /**
+   * Collects the grade and optional note stored in the state for each given response.
+   * Grades and notes are read from the state keys `r.<uid>.grade` and `r.<uid>.note`.
+   * Responses without a uid, without any response text, or without an explicit
+   * boolean grade are left out.
+   *
+   * @param responses The response objects to look up
+   * @returns One entry per graded response: its grade, its note (if any), and its text
+   */
+  exportGradesAndNotes: (responses: LLMResponse[]) => {
+    const state = get().state;
+    const res: { grade: boolean; note?: string; response: string }[] = [];
+    responses.forEach((r) => {
+      const uid = r.uid;
+      // Skip responses that cannot be keyed into the state or have no text to report
+      if (uid === undefined || !r.responses || r.responses.length === 0) return;
+      // TODO: support multiple responses when n>1 (only index 0 is exported for now)
+      const grade = state[`r.${uid}.grade`]?.[0];
+      const note = state[`r.${uid}.note`]?.[0];
+      // Only explicit boolean grades count; a missing or cleared (null) grade
+      // must not be exported, since consumers would read it as a failing grade.
+      if (typeof grade !== "boolean") return;
+      res.push({
+        grade,
+        // Drop non-string and blank notes so that they are treated as "no note"
+        note: typeof note === "string" && note.trim() !== "" ? note : undefined,
+        response: llmResponseDataToString(r.responses[0]),
+      });
+    });
+    return res;
+  },
 
   // Keep track of LLM colors, to ensure color consistency across various plots and displays
   llmColors: initialLLMColors,
diff --git a/chainforge/react-server/src/styles.css b/chainforge/react-server/src/styles.css
index a1c8edc03..b4a478f9e 100644
--- a/chainforge/react-server/src/styles.css
+++ b/chainforge/react-server/src/styles.css
@@ -331,6 +331,41 @@ html[data-mantine-color-scheme="dark"] .multi-eval-node {
   cursor: zoom-in;
 }
 
+/* Eval score and its voting icons. The voting icons are currently hidden in all states. */
+.eval-score {
+  /* background-color: #eee; */
+  /* padding: 8px; */
+}
+
+.eval-vote-icon {
+  color: #999;
+}
+
+.eval-vote-icon:hover {
+  color: #333;
+}
+
+.eval-vote-icons {
+  display: none;
+  position: relative;
+  /* opacity: 0.3; */
+  margin-left: 6px;
+  margin-top: -6px;
+}
+
+.eval-vote-chosen {
+  display: none; /* Disable the eval voting for the initial EvalGen release */
+  /* display: inline-flex; */
+  position: relative;
+  margin-left: 6px;
+  margin-top: -6px;
+}
+
+.eval-score:hover .eval-vote-icons {
+  display: none; /* Disable the eval voting for the initial EvalGen release */
+  /* display: inline-flex; */
+  /* opacity: 1.0; */
+}
+
+
 html[data-mantine-color-scheme="dark"] .eval-inspect-response-footer {
   background-color: #333;
   color: #ccc;
@@ -1460,6 +1495,37 @@ html[data-mantine-color-scheme="dark"] .chat-bubble textarea {
   margin: 0;
 }
 
+/* Positioned container; presumably the anchor for the absolutely-positioned counts below */
+.gradeContainer {
+  position: relative;
+  overflow: visible;
+  /* width: 20px; */
+}
+
+/* Extra-small count pinned to the top-right corner of its positioned ancestor */
+.gradeUpCount {
+  position: absolute;
+  right: 0px;
+  top: -3px;
+  font-size: x-small;
+}
+
+/* Extra-small count pinned to the bottom-right corner of its positioned ancestor */
+.gradeDownCount {
+  position: absolute;
+  right: 0px;
+  bottom: 0px;
+  font-size: x-small;
+}
+
+/* Flex row that spreads its children apart, with 50px of padding on each side */
+.criteriaButtons {
+  text-align: center;
+  display: flex;
+  justify-content: space-between;
+  padding-left: 50px;
+  padding-right: 50px;
+  /* gap: 100px;
+  padding: 10px;
+  column-gap: normal;
+  -moz-column-gap: 100px; */
+}
 html[data-mantine-color-scheme="dark"] .react-flow__controls-button {
   background-color: #777 !important;
   color: #ddd;
diff --git a/setup.py b/setup.py
index 43ce61067..32dfbec16 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ def readme():
 
 setup(
     name="chainforge",
-    version="0.3.6.0",
+    version="0.3.6.1",
     packages=find_packages(),
     author="Ian Arawjo",
     description="A Visual Programming Environment for Prompt Engineering",