From 3d9c82352dd31595aa90d892bc03d9dbf2c3e0c9 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 12 Mar 2025 22:35:02 -0400
Subject: [PATCH 01/35] Add prompt variants feature to Prompt Node

---
 .../react-server/src/LLMListComponent.tsx     |  20 +-
 chainforge/react-server/src/PromptNode.tsx    | 236 +++++++++++++++---
 .../react-server/src/backend/backend.ts       |  40 +--
 chainforge/react-server/src/backend/utils.ts  |  49 ++++
 4 files changed, 273 insertions(+), 72 deletions(-)
diff --git a/chainforge/react-server/src/LLMListComponent.tsx b/chainforge/react-server/src/LLMListComponent.tsx
index 900870fd1..4aac22daa 100644
--- a/chainforge/react-server/src/LLMListComponent.tsx
+++ b/chainforge/react-server/src/LLMListComponent.tsx
@@ -31,31 +31,13 @@ import useStore, { initLLMProviders, initLLMProviderMenu } from "./store";
 import { Dict, JSONCompatible, LLMGroup, LLMSpec } from "./backend/typing";
 import { useContextMenu } from "mantine-contextmenu";
 import { ContextMenuItemOptions } from "mantine-contextmenu/dist/types";
+import { ensureUniqueName } from "./backend/utils";
 
 // The LLM(s) to include by default on a PromptNode whenever one is created.
 // Defaults to ChatGPT (GPT3.5) when running locally, and HF-hosted falcon-7b for online version since it's free.
 const DEFAULT_INIT_LLMS = [initLLMProviders[0]];
 
 // Helper funcs
-// Ensure that a name is 'unique'; if not, return an amended version with a count tacked on (e.g. "GPT-4 (2)")
-const ensureUniqueName = (_name: string, _prev_names: string[]) => {
-  // Strip whitespace around names
-  const prev_names = _prev_names.map((n) => n.trim());
-  const name = _name.trim();
-
-  // Check if name is unique
-  if (!prev_names.includes(name)) return name;
-
-  // Name isn't unique; find a unique one:
-  let i = 2;
-  let new_name = `${name} (${i})`;
-  while (prev_names.includes(new_name)) {
-    i += 1;
-    new_name = `${name} (${i})`;
-  }
-  return new_name;
-};
-
 /** Get position CSS style below and left-aligned to the input element */
 const getPositionCSSStyle = (
   elem: HTMLButtonElement,
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index 93af5a8dd..194f1e8d2 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -18,9 +18,20 @@ import {
   Modal,
   Box,
   Tooltip,
+  Group,
+  Flex,
+  Button,
+  ActionIcon,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
-import { IconEraser, IconList } from "@tabler/icons-react";
+import {
+  IconArrowLeft,
+  IconArrowRight,
+  IconEraser,
+  IconList,
+  IconPlus,
+  IconTrash,
+} from "@tabler/icons-react";
 import useStore from "./store";
 import BaseNode from "./BaseNode";
 import NodeLabel from "./NodeLabelComponent";
@@ -41,6 +52,7 @@ import {
   extractSettingsVars,
   truncStr,
   genDebounceFunc,
+  ensureUniqueName,
 } from "./backend/utils";
 import LLMResponseInspectorDrawer from "./LLMResponseInspectorDrawer";
 import CancelTracker from "./backend/canceler";
@@ -64,6 +76,7 @@ import {
   queryLLM,
 } from "./backend/backend";
 import { StringLookup } from "./backend/cache";
+import { union } from "./backend/setUtils";
 
 const getUniqueLLMMetavarKey = (responses: LLMResponse[]) => {
   const metakeys = new Set(
@@ -221,6 +234,7 @@ export interface PromptNodeProps {
     contChat: boolean;
     refresh: boolean;
     refreshLLMList: boolean;
+    idxPromptVariantShown?: number;
   };
   id: string;
   type: string;
@@ -257,10 +271,15 @@ const PromptNode: React.FC<PromptNodeProps> = ({
     null,
   );
   const [templateVars, setTemplateVars] = useState<string[]>(data.vars ?? []);
-  const [promptText, setPromptText] = useState<string>(data.prompt ?? "");
-  const [promptTextOnLastRun, setPromptTextOnLastRun] = useState<string | null>(
-    null,
+  const [promptText, setPromptText] = useState<string | string[]>(
+    data.prompt ?? "",
+  );
+  const [idxPromptVariantShown, setIdxPromptVariantShown] = useState<number>(
+    data.idxPromptVariantShown ?? 0,
   );
+  const [promptTextOnLastRun, setPromptTextOnLastRun] = useState<
+    string | string[] | null
+  >(null);
   const [status, setStatus] = useState(Status.NONE);
   const [numGenerations, setNumGenerations] = useState<number>(data.n ?? 1);
   const [numGenerationsLastRun, setNumGenerationsLastRun] = useState<number>(
@@ -391,10 +410,17 @@ const PromptNode: React.FC<PromptNodeProps> = ({
   }, [templateVars, id, pullInputData, updateShowContToggle]);
 
   const refreshTemplateHooks = useCallback(
-    (text: string) => {
-      // Update template var fields + handles
-      const found_template_vars = new Set(extractBracketedSubstrings(text)); // gets all strs within braces {} that aren't escaped; e.g., ignores \{this\} but captures {this}
+    (text: string | string[]) => {
+      const texts = typeof text === "string" ? [text] : text;
+
+      // Get all template vars in the prompt(s)
+      let found_template_vars = new Set<string>();
+      for (const t of texts) {
+        const substrs = extractBracketedSubstrings(t); // gets all strs within braces {} that aren't escaped; e.g., ignores \{this\} but captures {this}
+        found_template_vars = union(found_template_vars, new Set(substrs));
+      }
 
+      // Update template var fields + handles
       if (!setsAreEqual(found_template_vars, new Set(templateVars))) {
         if (node_type !== "chat") {
           try {
@@ -413,27 +439,29 @@ const PromptNode: React.FC<PromptNodeProps> = ({
 
   const handleInputChange = useCallback(
     (event: React.ChangeEvent<HTMLTextAreaElement>) => {
-      const value = event.target.value;
+      const value = event.target.value as string;
       const updateStatus =
         promptTextOnLastRun !== null &&
         status !== Status.WARNING &&
         value !== promptTextOnLastRun;
 
-      // Store prompt text
-      data.prompt = value;
-
       // Debounce the global state change to happen only after 500ms, as it forces a costly rerender:
-      debounce((_value, _updateStatus) => {
-        setPromptText(_value);
-        setDataPropsForNode(id, { prompt: _value });
-        refreshTemplateHooks(_value);
+      debounce((_value: string, _updateStatus, _idxPromptVariantShown) => {
+        setPromptText((prompts) => {
+          if (typeof prompts === "string") prompts = _value;
+          else prompts[_idxPromptVariantShown] = _value;
+          setDataPropsForNode(id, { prompt: prompts });
+          refreshTemplateHooks(prompts);
+          return prompts;
+        });
         if (_updateStatus) setStatus(Status.WARNING);
-      }, 300)(value, updateStatus);
+      }, 300)(value, updateStatus, idxPromptVariantShown);
 
       // Debounce refreshing the template hooks so we don't annoy the user
       // debounce((_value) => refreshTemplateHooks(_value), 500)(value);
     },
     [
+      idxPromptVariantShown,
       promptTextOnLastRun,
       status,
       refreshTemplateHooks,
@@ -552,7 +580,7 @@ const PromptNode: React.FC<PromptNodeProps> = ({
   // Ask the backend how many responses it needs to collect, given the input data:
   const fetchResponseCounts = useCallback(
     (
-      prompt: string,
+      prompt: string | string[],
       vars: Dict,
       llms: (StringOrHash | LLMSpec)[],
       chat_histories?:
@@ -592,14 +620,24 @@ const PromptNode: React.FC<PromptNodeProps> = ({
       const pulled_vars = pullInputData(templateVars, id);
       updateShowContToggle(pulled_vars);
 
-      generatePrompts(promptText, pulled_vars).then((prompts) => {
-        setPromptPreviews(
-          prompts.map(
-            (p: PromptTemplate) =>
-              new PromptInfo(p.toString(), extractSettingsVars(p.fill_history)),
-          ),
-        );
-      });
+      const prompts =
+        typeof promptText === "string" ? [promptText] : promptText;
+
+      Promise.all(prompts.map((p) => generatePrompts(p, pulled_vars))).then(
+        (results) => {
+          // Handle all the results here
+          const all_concrete_prompts = results.flatMap((ps) =>
+            ps.map(
+              (p: PromptTemplate) =>
+                new PromptInfo(
+                  p.toString(),
+                  extractSettingsVars(p.fill_history),
+                ),
+            ),
+          );
+          setPromptPreviews(all_concrete_prompts);
+        },
+      );
 
       pullInputChats();
     } catch (err) {
@@ -827,9 +865,18 @@ Soft failing by replacing undefined with empty strings.`,
 
     // Pull the data to fill in template input variables, if any
     let pulled_data: Dict<(string | TemplateVarInfo)[]> = {};
+    let var_for_prompt_templates: string | undefined;
     try {
       // Try to pull inputs
       pulled_data = pullInputData(templateVars, id);
+
+      // Add a special new variable for the root prompt template(s)
+      var_for_prompt_templates = ensureUniqueName(
+        "prompt",
+        Object.keys(pulled_data),
+      );
+      if (typeof promptText !== "string")
+        pulled_data[var_for_prompt_templates] = promptText; // this will be filled in when calling queryLLMs
     } catch (err) {
       if (showAlert) showAlert((err as Error)?.message ?? err);
       console.error(err);
@@ -873,7 +920,9 @@ Soft failing by replacing undefined with empty strings.`,
     // Fetch info about the number of queries we'll need to make
     const fetch_resp_count = () =>
       fetchResponseCounts(
-        prompt_template,
+        typeof prompt_template === "string"
+          ? prompt_template
+          : `{${var_for_prompt_templates}}`, // Use special root prompt if there's multiple prompt variants
         pulled_data,
         _llmItemsCurrState,
         pulled_chats as ChatHistoryInfo[],
@@ -951,9 +1000,11 @@ Soft failing by replacing undefined with empty strings.`,
     const query_llms = () => {
       return queryLLM(
         id,
-        _llmItemsCurrState, // deep clone it first
+        _llmItemsCurrState,
         numGenerations,
-        prompt_template,
+        typeof prompt_template === "string"
+          ? prompt_template
+          : `{${var_for_prompt_templates}}`, // Use special root prompt if there's multiple prompt variants
         pulled_data,
         chat_hist_by_llm,
         apiKeys || {},
@@ -1015,7 +1066,7 @@ Soft failing by replacing undefined with empty strings.`,
                   o.metavars = resp_obj.metavars ?? {};
 
                   // Add a metavar for the prompt *template* in this PromptNode
-                  o.metavars.__pt = prompt_template;
+                  // o.metavars.__pt = prompt_template;
 
                   // Carry over any chat history
                   if (resp_obj.chat_history)
@@ -1156,6 +1207,48 @@ Soft failing by replacing undefined with empty strings.`,
     [numGenerationsLastRun, status],
   );
 
+  const handleAddPromptVariant = useCallback(() => {
+    // Pushes a new prompt variant, updating the prompts list and duplicating the current shown prompt
+    const prompts = typeof promptText === "string" ? [promptText] : promptText;
+    const curIdx = Math.max(
+      0,
+      Math.min(prompts.length - 1, idxPromptVariantShown),
+    ); // clamp
+    const curShownPrompt = prompts[curIdx];
+    setPromptText(prompts.concat([curShownPrompt]));
+    setIdxPromptVariantShown(prompts.length);
+  }, [promptText, idxPromptVariantShown]);
+
+  const gotoPromptVariant = useCallback(
+    (shift: number) => {
+      const prompts =
+        typeof promptText === "string" ? [promptText] : promptText;
+      const newIdx = Math.max(
+        0,
+        Math.min(prompts.length - 1, idxPromptVariantShown + shift),
+      ); // clamp
+      setIdxPromptVariantShown(newIdx);
+    },
+    [promptText, idxPromptVariantShown],
+  );
+
+  const handleRemovePromptVariant = useCallback(() => {
+    setPromptText((prompts) => {
+      if (typeof prompts === "string") return prompts; // cannot remove the last one
+      prompts.splice(idxPromptVariantShown, 1); // remove the indexed variant
+      setIdxPromptVariantShown(Math.max(0, idxPromptVariantShown - 1)); // goto the previous variant, if possible
+      return prompts;
+    });
+  }, [idxPromptVariantShown]);
+
+  // Whenever idx of prompt variant changes, we need to refresh the Textarea:
+  useEffect(() => {
+    if (textAreaRef.current && Array.isArray(promptText)) {
+      // @ts-expect-error Mantine has a 'value' property on Textareas, but TypeScript doesn't know this
+      textAreaRef.current.value = promptText[idxPromptVariantShown];
+    }
+  }, [idxPromptVariantShown]);
+
   const hideStatusIndicator = () => {
     if (status !== Status.NONE) setStatus(Status.NONE);
   };
@@ -1254,7 +1347,12 @@ Soft failing by replacing undefined with empty strings.`,
                 key={0}
                 className="prompt-field-fixed nodrag nowheel"
                 minRows={4}
-                defaultValue={data.prompt}
+                defaultValue={
+                  typeof data.prompt === "string"
+                    ? data.prompt
+                    : data.prompt &&
+                      data.prompt[data.idxPromptVariantShown ?? 0]
+                }
                 onChange={handleInputChange}
                 miw={230}
                 styles={{
@@ -1277,11 +1375,69 @@ Soft failing by replacing undefined with empty strings.`,
           className="prompt-field-fixed nodrag nowheel"
           minRows={4}
           maxRows={12}
-          defaultValue={data.prompt}
+          defaultValue={
+            typeof data.prompt === "string"
+              ? data.prompt
+              : data.prompt && data.prompt[data.idxPromptVariantShown ?? 0]
+          }
           onChange={handleInputChange}
+          // value={typeof promptText === "string" ? promptText : promptText[idxPromptVariantShown]}
         />
       )}
 
+      <Flex justify="right" pos="absolute" right={10}>
+        {typeof promptText === "string" || promptText.length === 1 ? (
+          <Button
+            size="xs"
+            variant="subtle"
+            color="gray"
+            mt={3}
+            mr={3}
+            p={0}
+            fw="normal"
+            h="1.0rem"
+            onClick={handleAddPromptVariant}
+          >
+            + add variant
+          </Button>
+        ) : (
+          <>
+            <ActionIcon
+              size="xs"
+              c="black"
+              onClick={() => gotoPromptVariant(-1)}
+            >
+              <IconArrowLeft size={19} />
+            </ActionIcon>
+
+            <Text size="xs">
+              Variant {idxPromptVariantShown + 1} of{" "}
+              {typeof promptText === "string" ? 1 : promptText.length}
+            </Text>
+
+            <ActionIcon
+              size="xs"
+              c="black"
+              onClick={() => gotoPromptVariant(1)}
+            >
+              <IconArrowRight size={19} />
+            </ActionIcon>
+
+            <Tooltip label="Add prompt variant" position="right" withArrow>
+              <ActionIcon size="xs" c="black" onClick={handleAddPromptVariant}>
+                <IconPlus size={19} />
+              </ActionIcon>
+            </Tooltip>
+
+            <Tooltip label="Remove this variant" position="right" withArrow>
+              <ActionIcon size="xs" c="red" onClick={handleRemovePromptVariant}>
+                <IconTrash size={19} />
+              </ActionIcon>
+            </Tooltip>
+          </>
+        )}
+      </Flex>
+
       <Handle
         type="source"
         position={Position.Right}
@@ -1289,13 +1445,17 @@ Soft failing by replacing undefined with empty strings.`,
         className="grouped-handle"
         style={{ top: "50%" }}
       />
-      <TemplateHooks
-        vars={templateVars}
-        nodeId={id}
-        startY={hooksY}
-        position={Position.Left}
-        ignoreHandles={["__past_chats"]}
-      />
+
+      <Box mih={14}>
+        <TemplateHooks
+          vars={templateVars}
+          nodeId={id}
+          startY={hooksY}
+          position={Position.Left}
+          ignoreHandles={["__past_chats"]}
+        />
+      </Box>
+
       <hr />
       <div>
         <div style={{ marginBottom: "10px", padding: "4px" }}>
diff --git a/chainforge/react-server/src/backend/backend.ts b/chainforge/react-server/src/backend/backend.ts
index 55e3775e6..20d25bac0 100644
--- a/chainforge/react-server/src/backend/backend.ts
+++ b/chainforge/react-server/src/backend/backend.ts
@@ -29,6 +29,8 @@ import {
   repairCachedResponses,
   deepcopy,
   llmResponseDataToString,
+  extendArray,
+  extendArrayDict,
 } from "./utils";
 import StorageCache, { StringLookup } from "./cache";
 import { PromptPipeline } from "./query";
@@ -520,7 +522,7 @@ export async function generatePrompts(
 /**
  * Calculates how many queries we need to make, given the passed prompt and vars.
  *
- * @param prompt the prompt template, with any {{}} vars
+ * @param prompt the prompt template, with any {} vars; or alternatively, an array of such templates
  * @param vars a dict of the template variables to fill the prompt template with, by name.
  *             For each var value, can be single values or a list; in the latter, all permutations are passed. (Pass empty dict if no vars.)
  * @param llms the list of LLMs you will query
@@ -531,7 +533,7 @@ export async function generatePrompts(
  *          If there was an error, returns a dict with a single key, 'error'.
  */
 export async function countQueries(
-  prompt: string,
+  prompt: string | string[],
   vars: PromptVarsDict,
   llms: Array<StringOrHash | LLMSpec>,
   n: number,
@@ -545,19 +547,27 @@ export async function countQueries(
   vars = deepcopy(vars);
   llms = deepcopy(llms);
 
-  let all_prompt_permutations: PromptTemplate[] | Dict<PromptTemplate[]>;
-
-  const gen_prompts = new PromptPermutationGenerator(prompt);
-  if (cont_only_w_prior_llms && Array.isArray(llms)) {
-    all_prompt_permutations = {};
-    llms.forEach((llm_spec) => {
-      const llm_key = extract_llm_key(llm_spec);
-      (all_prompt_permutations as Dict<PromptTemplate[]>)[llm_key] = Array.from(
-        gen_prompts.generate(filterVarsByLLM(vars, llm_key)),
+  const prompt_templates = typeof prompt === "string" ? [prompt] : prompt;
+  const all_prompt_permutations: PromptTemplate[] | Dict<PromptTemplate[]> =
+    cont_only_w_prior_llms && Array.isArray(llms) ? {} : [];
+
+  for (const pt of prompt_templates) {
+    const gen_prompts = new PromptPermutationGenerator(pt);
+    if (cont_only_w_prior_llms && Array.isArray(llms)) {
+      llms.forEach((llm_spec) => {
+        const llm_key = extract_llm_key(llm_spec);
+        extendArrayDict(
+          all_prompt_permutations as Dict<PromptTemplate[]>,
+          llm_key,
+          Array.from(gen_prompts.generate(filterVarsByLLM(vars, llm_key))),
+        );
+      });
+    } else {
+      extendArray(
+        all_prompt_permutations as PromptTemplate[],
+        Array.from(gen_prompts.generate(vars)),
       );
-    });
-  } else {
-    all_prompt_permutations = Array.from(gen_prompts.generate(vars));
+    }
   }
 
   let cache_file_lookup: Dict = {};
@@ -739,7 +749,7 @@ export async function ensureUniqueFlowFilename(
  * @param id a unique ID to refer to this information. Used when cache'ing responses. 
  * @param llm a string, list of strings, or list of LLM spec dicts specifying the LLM(s) to query.
  * @param n the amount of generations for each prompt. All LLMs will be queried the same number of times 'n' per each prompt.
- * @param prompt the prompt template, with any {{}} vars
+ * @param prompt the prompt template, with any {} vars
  * @param vars a dict of the template variables to fill the prompt template with, by name. 
                For each var, can be single values or a list; in the latter, all permutations are passed. (Pass empty dict if no vars.)
  * @param chat_histories Either an array of `ChatHistory` (to use across all LLMs), or a dict indexed by LLM nicknames of `ChatHistory` arrays to use per LLM. 
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index 55f9ff31b..92c0d8d59 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -2398,3 +2398,52 @@ export const compressBase64Image = (b64: string): Promise<string> => {
     )
     .then((compressedBlob) => blobToBase64(compressedBlob as Blob));
 };
+
+/**
+ * Extends array `a` with the values of `b`.
+ * @param a The array to extend (in-place).
+ * @param b The array to add to the end of `a`.
+ * @returns `a`, extended.
+ */
+export const extendArray = <T>(a: Array<T>, b: Array<T>): Array<T> => {
+  for (const i in b) {
+    a.push(b[i]);
+  }
+  return a;
+};
+
+/**
+ * Extends the array `key` in a dict with `values`, creating a new array if the key is missing.
+ * @param dict The dictionary to extend (in-place).
+ * @param key The key of the dictionary.
+ * @param values The new array to append to the end of the dict value for `key`.
+ */
+export const extendArrayDict = <K extends string | number | symbol, V>(
+  dict: Record<K, V[]>,
+  key: K,
+  values: V[],
+): void => {
+  if (!dict[key]) {
+    dict[key] = [];
+  }
+  extendArray(dict[key], values);
+};
+
+/** Ensure that a name is 'unique'; if not, return an amended version with a count tacked on (e.g. "GPT-4 (2)") */
+export const ensureUniqueName = (_name: string, _prev_names: string[]) => {
+  // Strip whitespace around names
+  const prev_names = _prev_names.map((n) => n.trim());
+  const name = _name.trim();
+
+  // Check if name is unique
+  if (!prev_names.includes(name)) return name;
+
+  // Name isn't unique; find a unique one:
+  let i = 2;
+  let new_name = `${name} (${i})`;
+  while (prev_names.includes(new_name)) {
+    i += 1;
+    new_name = `${name} (${i})`;
+  }
+  return new_name;
+};

From d6723459e80d32f969d84d9c3f355b4eee4a98d7 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Fri, 14 Mar 2025 12:34:12 -0400
Subject: [PATCH 02/35] fix countQueries backwards compatibility; add
 alertmodal for deleting prompt variant; add prompt variant to PromptPreview
 screens

---
 chainforge/flask_app.py                       |  48 ++-
 chainforge/react-server/src/App.tsx           |   1 +
 .../react-server/src/AreYouSureModal.tsx      |   7 +-
 chainforge/react-server/src/FlowSidebar.tsx   |  72 ++++-
 .../src/LLMResponseInspectorModal.tsx         |   5 +-
 chainforge/react-server/src/PromptNode.tsx    | 306 ++++++++++++------
 chainforge/react-server/src/backend/models.ts |   2 +
 chainforge/react-server/src/backend/utils.ts  |  10 +-
 8 files changed, 324 insertions(+), 127 deletions(-)

diff --git a/chainforge/flask_app.py b/chainforge/flask_app.py
index e2876bf4f..8ec97925e 100644
--- a/chainforge/flask_app.py
+++ b/chainforge/flask_app.py
@@ -1,4 +1,4 @@
-import json, os, sys, asyncio, time
+import json, os, sys, asyncio, time, shutil
 from dataclasses import dataclass
 from enum import Enum
 from typing import List
@@ -772,7 +772,7 @@ def delete_flow(filename):
 
 @app.route('/api/flows/<filename>', methods=['PUT'])
 def save_or_rename_flow(filename):
-    """Save or rename a flow"""
+    """Save, rename, or duplicate a flow"""
     data = request.json
 
     if not filename.endswith('.cforge'):
@@ -805,6 +805,36 @@ def save_or_rename_flow(filename):
             return jsonify({"message": f"Flow renamed from {filename} to {new_name}"})
         except Exception as error:
             return jsonify({"error": str(error)}), 404
+    
+    elif data.get('duplicate'):
+        # Duplicate flow
+        try:
+            # Check for name clashes (if a flow already exists with the new name)
+            copy_name = _get_unique_flow_name(filename, "Copy of ") 
+            # Copy the file to the new (safe) path, and copy metadata too:
+            shutil.copy2(os.path.join(FLOWS_DIR, filename), os.path.join(FLOWS_DIR, f"{copy_name}.cforge"))
+            # Return the new filename
+            return jsonify({"copyName": copy_name})
+        except Exception as error:
+            return jsonify({"error": str(error)}), 404
+
+def _get_unique_flow_name(filename: str, prefix: str = None) -> str: 
+    base, ext = os.path.splitext(filename)
+    if ext is None or len(ext) == 0: 
+        ext = ".cforge"
+    unique_filename = base + ext
+    if prefix is not None:
+        unique_filename = prefix + unique_filename
+    i = 1
+
+    # Find the first non-clashing filename of the form <filename>(i).cforge where i=1,2,3 etc
+    while os.path.isfile(os.path.join(FLOWS_DIR, unique_filename)):
+        unique_filename = f"{base}({i}){ext}"
+        if prefix is not None:
+            unique_filename = prefix + unique_filename
+        i += 1
+    
+    return unique_filename.replace(".cforge", "")
 
 @app.route('/api/getUniqueFlowFilename', methods=['PUT'])
 def get_unique_flow_name():
@@ -813,18 +843,8 @@ def get_unique_flow_name():
     filename = data.get("name")
     
     try:
-        base, ext = os.path.splitext(filename)
-        if ext is None or len(ext) == 0: 
-            ext = ".cforge"
-        unique_filename = base + ext
-        i = 1
-
-        # Find the first non-clashing filename of the form <filename>(i).cforge where i=1,2,3 etc
-        while os.path.isfile(os.path.join(FLOWS_DIR, unique_filename)):
-            unique_filename = f"{base}({i}){ext}"
-            i += 1
-        
-        return jsonify(unique_filename.replace(".cforge", ""))
+        new_name = _get_unique_flow_name(filename)
+        return jsonify(new_name)
     except Exception as e:
         return jsonify({"error": str(e)}), 404
 
diff --git a/chainforge/react-server/src/App.tsx b/chainforge/react-server/src/App.tsx
index ba203e372..309f2f9dd 100644
--- a/chainforge/react-server/src/App.tsx
+++ b/chainforge/react-server/src/App.tsx
@@ -1334,6 +1334,7 @@ const App = () => {
               ml="sm"
               size="1.625rem"
               onClick={() => saveFlow()}
+              bg="#eee"
               loading={isSaving}
               disabled={isLoading || isSaving}
             >
diff --git a/chainforge/react-server/src/AreYouSureModal.tsx b/chainforge/react-server/src/AreYouSureModal.tsx
index b0ac59cd9..957817526 100644
--- a/chainforge/react-server/src/AreYouSureModal.tsx
+++ b/chainforge/react-server/src/AreYouSureModal.tsx
@@ -5,6 +5,7 @@ import { useDisclosure } from "@mantine/hooks";
 export interface AreYouSureModalProps {
   title: string;
   message: string;
+  color?: string;
   onConfirm?: () => void;
 }
 
@@ -14,7 +15,7 @@ export interface AreYouSureModalRef {
 
 /** Modal that lets user rename a single value, using a TextInput field. */
 const AreYouSureModal = forwardRef<AreYouSureModalRef, AreYouSureModalProps>(
-  function AreYouSureModal({ title, message, onConfirm }, ref) {
+  function AreYouSureModal({ title, message, color, onConfirm }, ref) {
     const [opened, { open, close }] = useDisclosure(false);
     const description = message || "Are you sure?";
 
@@ -37,7 +38,7 @@ const AreYouSureModal = forwardRef<AreYouSureModalRef, AreYouSureModalProps>(
         onClose={close}
         title={title}
         styles={{
-          header: { backgroundColor: "orange", color: "white" },
+          header: { backgroundColor: color ?? "orange", color: "white" },
           root: { position: "relative", left: "-5%" },
         }}
       >
@@ -54,7 +55,7 @@ const AreYouSureModal = forwardRef<AreYouSureModalRef, AreYouSureModalProps>(
         >
           <Button
             variant="light"
-            color="orange"
+            color={color ?? "orange"}
             type="submit"
             w="40%"
             onClick={close}
diff --git a/chainforge/react-server/src/FlowSidebar.tsx b/chainforge/react-server/src/FlowSidebar.tsx
index c7566bcee..c95170851 100644
--- a/chainforge/react-server/src/FlowSidebar.tsx
+++ b/chainforge/react-server/src/FlowSidebar.tsx
@@ -5,6 +5,7 @@ import {
   IconMenu2,
   IconX,
   IconCheck,
+  IconCopy,
 } from "@tabler/icons-react";
 import axios from "axios";
 import { AlertModalContext } from "./AlertModal";
@@ -20,8 +21,10 @@ import {
   Flex,
   Divider,
   ScrollArea,
+  Tooltip,
 } from "@mantine/core";
 import { FLASK_BASE_URL } from "./backend/utils";
+import { ensureUniqueFlowFilename } from "./backend/backend";
 
 interface FlowFile {
   name: string;
@@ -112,6 +115,26 @@ const FlowSidebar: React.FC<FlowSidebarProps> = ({
     setNewEditName(flowFile);
   };
 
+  // 'Duplicate' the flow
+  const handleDuplicateFlow = async (
+    flowFile: string,
+    event: React.MouseEvent<HTMLButtonElement, MouseEvent>,
+  ) => {
+    event.stopPropagation(); // Prevent triggering the parent click
+    await axios
+      .put(`${FLASK_BASE_URL}api/flows/${flowFile}`, {
+        duplicate: true,
+      })
+      .then((resp) => {
+        onLoadFlow(undefined, resp.data.copyName as string); // Tell the parent that the filename has changed. This won't replace the flow.
+        fetchSavedFlowList(); // Refresh the list
+      })
+      .catch((err) => {
+        console.error(err);
+        if (showAlert) showAlert(err);
+      });
+  };
+
   // Cancel editing
   const handleCancelEdit = (
     event: React.MouseEvent<HTMLButtonElement, MouseEvent>,
@@ -191,7 +214,7 @@ const FlowSidebar: React.FC<FlowSidebarProps> = ({
         onClose={() => setIsOpen(false)}
         title="Saved Flows"
         position="left"
-        size="250px" // Adjust sidebar width
+        size="350px" // Adjust sidebar width
         padding="md"
         withCloseButton={true}
         scrollAreaComponent={ScrollArea.Autosize}
@@ -261,18 +284,45 @@ const FlowSidebar: React.FC<FlowSidebarProps> = ({
                         {flow.name}
                       </Text>
                       <Flex gap="0px">
-                        <ActionIcon
-                          color="blue"
-                          onClick={(e) => handleEditClick(flow.name, e)}
+                        <Tooltip
+                          label="Edit name"
+                          withArrow
+                          arrowPosition="center"
+                          withinPortal
+                        >
+                          <ActionIcon
+                            color="blue"
+                            onClick={(e) => handleEditClick(flow.name, e)}
+                          >
+                            <IconEdit size={18} />
+                          </ActionIcon>
+                        </Tooltip>
+                        <Tooltip
+                          label="Duplicate this flow"
+                          withArrow
+                          arrowPosition="center"
+                          withinPortal
                         >
-                          <IconEdit size={18} />
-                        </ActionIcon>
-                        <ActionIcon
-                          color="red"
-                          onClick={(e) => handleDeleteFlow(flow.name, e)}
+                          <ActionIcon
+                            color="blue"
+                            onClick={(e) => handleDuplicateFlow(flow.name, e)}
+                          >
+                            <IconCopy size={18} />
+                          </ActionIcon>
+                        </Tooltip>
+                        <Tooltip
+                          label="Delete this flow"
+                          withArrow
+                          arrowPosition="center"
+                          withinPortal
                         >
-                          <IconTrash size={18} />
-                        </ActionIcon>
+                          <ActionIcon
+                            color="red"
+                            onClick={(e) => handleDeleteFlow(flow.name, e)}
+                          >
+                            <IconTrash size={18} />
+                          </ActionIcon>
+                        </Tooltip>
                       </Flex>
                     </Flex>
                     <Text size="xs" color="gray">
diff --git a/chainforge/react-server/src/LLMResponseInspectorModal.tsx b/chainforge/react-server/src/LLMResponseInspectorModal.tsx
index bfc3b3d3e..54d6f0003 100644
--- a/chainforge/react-server/src/LLMResponseInspectorModal.tsx
+++ b/chainforge/react-server/src/LLMResponseInspectorModal.tsx
@@ -71,7 +71,10 @@ const LLMResponseInspectorModal = forwardRef<
           </button>
         </div>
       }
-      styles={{ title: { justifyContent: "space-between", width: "100%" } }}
+      styles={{
+        title: { justifyContent: "space-between", width: "100%" },
+        header: { paddingBottom: "0px" },
+      }}
     >
       <div
         className="inspect-modal-response-container"
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index 194f1e8d2..bbfa49873 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -22,6 +22,8 @@ import {
   Flex,
   Button,
   ActionIcon,
+  Divider,
+  Title,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import {
@@ -77,6 +79,7 @@ import {
 } from "./backend/backend";
 import { StringLookup } from "./backend/cache";
 import { union } from "./backend/setUtils";
+import AreYouSureModal, { AreYouSureModalRef } from "./AreYouSureModal";
 
 const getUniqueLLMMetavarKey = (responses: LLMResponse[]) => {
   const metakeys = new Set(
@@ -98,19 +101,33 @@ const bucketChatHistoryInfosByLLM = (chat_hist_infos: ChatHistoryInfo[]) => {
 
 export class PromptInfo {
   prompt: string;
-  settings: Dict;
+  settings?: Dict;
+  label?: string;
 
-  constructor(prompt: string, settings: Dict) {
+  constructor(prompt: string, settings?: Dict, label?: string) {
     this.prompt = prompt;
     this.settings = settings;
+    this.label = label;
   }
 }
 
-const displayPromptInfos = (promptInfos: PromptInfo[], wideFormat: boolean) =>
+const displayPromptInfos = (
+  promptInfos: PromptInfo[],
+  wideFormat: boolean,
+  bgColor?: string,
+) =>
   promptInfos.map((info, idx) => (
     <div key={idx}>
-      <div className="prompt-preview">{info.prompt}</div>
-      {info.settings ? (
+      <div className="prompt-preview" style={{ backgroundColor: bgColor }}>
+        {info.label && (
+          <Text c="black" size="xs" fw="bold" mb={0}>
+            {info.label}
+            <hr />
+          </Text>
+        )}
+        {info.prompt}
+      </div>
+      {info.settings &&
         Object.entries(info.settings).map(([key, val]) => {
           return (
             <div key={key} className="settings-var-inline response-var-inline">
@@ -120,10 +137,7 @@ const displayPromptInfos = (promptInfos: PromptInfo[], wideFormat: boolean) =>
               </span>
             </div>
           );
-        })
-      ) : (
-        <></>
-      )}
+        })}
     </div>
   ));
 
@@ -131,12 +145,14 @@ export interface PromptListPopoverProps {
   promptInfos: PromptInfo[];
   onHover: () => void;
   onClick: () => void;
+  promptTemplates?: string[] | string;
 }
 
 export const PromptListPopover: React.FC<PromptListPopoverProps> = ({
   promptInfos,
   onHover,
   onClick,
+  promptTemplates,
 }) => {
   const [opened, { close, open }] = useDisclosure(false);
 
@@ -185,6 +201,29 @@ export const PromptListPopover: React.FC<PromptListPopoverProps> = ({
             Preview of generated prompts ({promptInfos.length} total)
           </Text>
         </Center>
+        {Array.isArray(promptTemplates) && promptTemplates.length > 1 && (
+          <Box>
+            <Divider
+              my="xs"
+              label="Prompt variants"
+              fw="bold"
+              labelPosition="center"
+            />
+            {displayPromptInfos(
+              promptTemplates.map(
+                (t, i) => new PromptInfo(t, undefined, `Variant ${i + 1}`),
+              ),
+              false,
+              "#ddf1f8",
+            )}
+            <Divider
+              my="xs"
+              label="Concrete prompts"
+              fw="bold"
+              labelPosition="center"
+            />
+          </Box>
+        )}
         {displayPromptInfos(promptInfos, false)}
       </Popover.Dropdown>
     </Popover>
@@ -195,12 +234,14 @@ export interface PromptListModalProps {
   promptPreviews: PromptInfo[];
   infoModalOpened: boolean;
   closeInfoModal: () => void;
+  promptTemplates?: string[] | string;
 }
 
 export const PromptListModal: React.FC<PromptListModalProps> = ({
   promptPreviews,
   infoModalOpened,
   closeInfoModal,
+  promptTemplates,
 }) => {
   return (
     <Modal
@@ -218,6 +259,29 @@ export const PromptListModal: React.FC<PromptListModalProps> = ({
       }}
     >
       <Box m="lg" mt="xl">
+        {Array.isArray(promptTemplates) && promptTemplates.length > 1 && (
+          <Box>
+            <Divider
+              my="xs"
+              label="Prompt variants"
+              fw="bold"
+              labelPosition="center"
+            />
+            {displayPromptInfos(
+              promptTemplates.map(
+                (t, i) => new PromptInfo(t, undefined, `Variant ${i + 1}`),
+              ),
+              true,
+              "#ddf1f8",
+            )}
+            <Divider
+              my="xs"
+              label="Concrete prompts (filled in)"
+              fw="bold"
+              labelPosition="center"
+            />
+          </Box>
+        )}
         {displayPromptInfos(promptPreviews, true)}
       </Box>
     </Modal>
@@ -1207,6 +1271,39 @@ Soft failing by replacing undefined with empty strings.`,
     [numGenerationsLastRun, status],
   );
 
+  const hideStatusIndicator = () => {
+    if (status !== Status.NONE) setStatus(Status.NONE);
+  };
+
+  // Dynamically update the textareas and position of the template hooks
+  const textAreaRef = useRef<HTMLTextAreaElement | HTMLDivElement | null>(null);
+  const [hooksY, setHooksY] = useState(138);
+  const setRef = useCallback(
+    (elem: HTMLDivElement | HTMLTextAreaElement | null) => {
+      if (!elem) return;
+      // To listen for resize events of the textarea, we need to use a ResizeObserver.
+      // We initialize the ResizeObserver only once, when the 'ref' is first set, and only on the div wrapping textfields.
+      // NOTE: This won't work on older browsers, but there's no alternative solution.
+      if (!textAreaRef.current && elem && window.ResizeObserver) {
+        let past_hooks_y = 138;
+        const incr = 68 + (node_type === "chat" ? -6 : 0);
+        const observer = new window.ResizeObserver(() => {
+          if (!textAreaRef || !textAreaRef.current) return;
+          const new_hooks_y = textAreaRef.current.clientHeight + incr;
+          if (past_hooks_y !== new_hooks_y) {
+            setHooksY(new_hooks_y);
+            past_hooks_y = new_hooks_y;
+          }
+        });
+
+        observer.observe(elem);
+        textAreaRef.current = elem;
+      }
+    },
+    [textAreaRef],
+  );
+
+  const deleteVariantConfirmModal = useRef<AreYouSureModalRef>(null);
   const handleAddPromptVariant = useCallback(() => {
     // Pushes a new prompt variant, updating the prompts list and duplicating the current shown prompt
     const prompts = typeof promptText === "string" ? [promptText] : promptText;
@@ -1234,12 +1331,20 @@ Soft failing by replacing undefined with empty strings.`,
 
   const handleRemovePromptVariant = useCallback(() => {
     setPromptText((prompts) => {
-      if (typeof prompts === "string") return prompts; // cannot remove the last one
+      if (typeof prompts === "string" || prompts.length === 1) return prompts; // cannot remove the last one
       prompts.splice(idxPromptVariantShown, 1); // remove the indexed variant
-      setIdxPromptVariantShown(Math.max(0, idxPromptVariantShown - 1)); // goto the previous variant, if possible
-      return prompts;
+      const newIdx = Math.max(0, idxPromptVariantShown - 1);
+      setIdxPromptVariantShown(newIdx); // goto the previous variant, if possible
+
+      if (textAreaRef.current) {
+        // We have to force an update here since idxPromptVariantShown might've not changed
+        // @ts-expect-error Mantine has a 'value' property on Textareas, but TypeScript doesn't know this
+        textAreaRef.current.value = prompts[newIdx];
+      }
+
+      return [...prompts];
     });
-  }, [idxPromptVariantShown]);
+  }, [idxPromptVariantShown, textAreaRef]);
 
   // Whenever idx of prompt variant changes, we need to refresh the Textarea:
   useEffect(() => {
@@ -1249,37 +1354,92 @@ Soft failing by replacing undefined with empty strings.`,
     }
   }, [idxPromptVariantShown]);
 
-  const hideStatusIndicator = () => {
-    if (status !== Status.NONE) setStatus(Status.NONE);
-  };
+  const promptVariantControls = useMemo(() => {
+    return (
+      <Flex justify="right" pos="absolute" right={10}>
+        {typeof promptText === "string" || promptText.length === 1 ? (
+          <Tooltip
+            label="Add prompt variant. This duplicates the current prompt, allowing you to tweak it to test variations. (You can also accomplish the same thing by template chaining.)"
+            multiline
+            position="right"
+            withArrow
+            arrowSize={8}
+            w={220}
+            withinPortal
+          >
+            <Button
+              size="xs"
+              variant="subtle"
+              color="gray"
+              mt={3}
+              mr={3}
+              p={0}
+              fw="normal"
+              h="1.0rem"
+              onClick={handleAddPromptVariant}
+            >
+              + add variant
+            </Button>
+          </Tooltip>
+        ) : (
+          <>
+            <ActionIcon
+              size="xs"
+              c="black"
+              onClick={() => gotoPromptVariant(-1)}
+            >
+              <IconArrowLeft size={19} />
+            </ActionIcon>
 
-  // Dynamically update the textareas and position of the template hooks
-  const textAreaRef = useRef<HTMLTextAreaElement | HTMLDivElement | null>(null);
-  const [hooksY, setHooksY] = useState(138);
-  const setRef = useCallback(
-    (elem: HTMLDivElement | HTMLTextAreaElement | null) => {
-      if (!elem) return;
-      // To listen for resize events of the textarea, we need to use a ResizeObserver.
-      // We initialize the ResizeObserver only once, when the 'ref' is first set, and only on the div wrapping textfields.
-      // NOTE: This won't work on older browsers, but there's no alternative solution.
-      if (!textAreaRef.current && elem && window.ResizeObserver) {
-        let past_hooks_y = 138;
-        const incr = 68 + (node_type === "chat" ? -6 : 0);
-        const observer = new window.ResizeObserver(() => {
-          if (!textAreaRef || !textAreaRef.current) return;
-          const new_hooks_y = textAreaRef.current.clientHeight + incr;
-          if (past_hooks_y !== new_hooks_y) {
-            setHooksY(new_hooks_y);
-            past_hooks_y = new_hooks_y;
-          }
-        });
+            <Text size="xs">
+              Variant {idxPromptVariantShown + 1} of{" "}
+              {typeof promptText === "string" ? 1 : promptText.length}
+            </Text>
 
-        observer.observe(elem);
-        textAreaRef.current = elem;
-      }
-    },
-    [textAreaRef],
-  );
+            <ActionIcon
+              size="xs"
+              c="black"
+              mr={2}
+              onClick={() => gotoPromptVariant(1)}
+            >
+              <IconArrowRight size={19} />
+            </ActionIcon>
+
+            <Tooltip
+              label="Add prompt variant"
+              position="right"
+              withArrow
+              withinPortal
+            >
+              <ActionIcon
+                size="xs"
+                c="black"
+                mr={2}
+                onClick={handleAddPromptVariant}
+              >
+                <IconPlus size={19} />
+              </ActionIcon>
+            </Tooltip>
+
+            <Tooltip
+              label="Remove this variant"
+              position="right"
+              withArrow
+              withinPortal
+            >
+              <ActionIcon
+                size="xs"
+                c="black"
+                onClick={() => deleteVariantConfirmModal?.current?.trigger()}
+              >
+                <IconTrash size={19} />
+              </ActionIcon>
+            </Tooltip>
+          </>
+        )}
+      </Flex>
+    );
+  }, [idxPromptVariantShown, promptText, deleteVariantConfirmModal]);
 
   // Add custom context menu options on right-click.
   // 1. Convert TextFields to Items Node, for convenience.
@@ -1322,6 +1482,7 @@ Soft failing by replacing undefined with empty strings.`,
           <PromptListPopover
             key="prompt-previews"
             promptInfos={promptPreviews}
+            promptTemplates={promptText}
             onHover={handlePreviewHover}
             onClick={openInfoModal}
           />,
@@ -1333,9 +1494,17 @@ Soft failing by replacing undefined with empty strings.`,
       />
       <PromptListModal
         promptPreviews={promptPreviews}
+        promptTemplates={promptText}
         infoModalOpened={infoModalOpened}
         closeInfoModal={closeInfoModal}
       />
+      <AreYouSureModal
+        ref={deleteVariantConfirmModal}
+        title="Delete prompt variant"
+        message="Are you sure you want to delete this prompt variant? This action is irreversible."
+        color="red"
+        onConfirm={handleRemovePromptVariant}
+      />
 
       {node_type === "chat" ? (
         <div ref={setRef}>
@@ -1385,58 +1554,7 @@ Soft failing by replacing undefined with empty strings.`,
         />
       )}
 
-      <Flex justify="right" pos="absolute" right={10}>
-        {typeof promptText === "string" || promptText.length === 1 ? (
-          <Button
-            size="xs"
-            variant="subtle"
-            color="gray"
-            mt={3}
-            mr={3}
-            p={0}
-            fw="normal"
-            h="1.0rem"
-            onClick={handleAddPromptVariant}
-          >
-            + add variant
-          </Button>
-        ) : (
-          <>
-            <ActionIcon
-              size="xs"
-              c="black"
-              onClick={() => gotoPromptVariant(-1)}
-            >
-              <IconArrowLeft size={19} />
-            </ActionIcon>
-
-            <Text size="xs">
-              Variant {idxPromptVariantShown + 1} of{" "}
-              {typeof promptText === "string" ? 1 : promptText.length}
-            </Text>
-
-            <ActionIcon
-              size="xs"
-              c="black"
-              onClick={() => gotoPromptVariant(1)}
-            >
-              <IconArrowRight size={19} />
-            </ActionIcon>
-
-            <Tooltip label="Add prompt variant" position="right" withArrow>
-              <ActionIcon size="xs" c="black" onClick={handleAddPromptVariant}>
-                <IconPlus size={19} />
-              </ActionIcon>
-            </Tooltip>
-
-            <Tooltip label="Remove this variant" position="right" withArrow>
-              <ActionIcon size="xs" c="red" onClick={handleRemovePromptVariant}>
-                <IconTrash size={19} />
-              </ActionIcon>
-            </Tooltip>
-          </>
-        )}
-      </Flex>
+      {promptVariantControls}
 
       <Handle
         type="source"
diff --git a/chainforge/react-server/src/backend/models.ts b/chainforge/react-server/src/backend/models.ts
index 09110f6ec..7f13863b1 100644
--- a/chainforge/react-server/src/backend/models.ts
+++ b/chainforge/react-server/src/backend/models.ts
@@ -313,6 +313,8 @@ export const RATE_LIMIT_BY_MODEL: { [key in LLM]?: number } = {
 };
 
 export const RATE_LIMIT_BY_PROVIDER: { [key in LLMProvider]?: number } = {
+  [LLMProvider.OpenAI]: 1000, // Tier 3 pricing limit is 5000 per minute, across most models; we use 1000 to be safe.
+  [LLMProvider.Azure_OpenAI]: 1000, // Tier 3 pricing limit is 5000 per minute, across most models; we use 1000 to be safe.
   [LLMProvider.Anthropic]: 25, // Tier 1 pricing limit is 50 per minute, across all models; we halve this, to be safe.
   [LLMProvider.Together]: 30, // Paid tier limit is 60 per minute, across all models; we halve this, to be safe.
   [LLMProvider.Google]: 1000, // RPM for Google Gemini models 1.5 is quite generous; at base it is 1000 RPM. If you are using the free version it's 15 RPM, but we can expect most CF users to be using paid (and anyway you can just re-run prompt node until satisfied).
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index 92c0d8d59..ce55d6008 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -1967,10 +1967,12 @@ export const extractSettingsVars = (vars?: PromptVarsDict) => {
     vars !== undefined &&
     Object.keys(vars).some((k) => k.charAt(0) === "=")
   ) {
-    return transformDict(
-      deepcopy(vars),
-      (k) => k.charAt(0) === "=",
-      (k) => k.substring(1),
+    return StringLookup.concretizeDict(
+      transformDict(
+        deepcopy(vars),
+        (k) => k.charAt(0) === "=",
+        (k) => k.substring(1),
+      ),
     );
   } else return {};
 };

From bdf0b6a0239abb3bff0389b54611f4465a96f3ce Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Fri, 14 Mar 2025 19:48:00 -0400
Subject: [PATCH 03/35] Autoresize textarea when switching prompt variants.
 Ensure auto-templating is only used for variants when length exceeds 1.

---
 chainforge/flask_app.py                       |  18 ++
 chainforge/react-server/src/App.tsx           | 189 ++++++++++++++----
 chainforge/react-server/src/FlowSidebar.tsx   |   1 -
 chainforge/react-server/src/ItemsNode.tsx     |   2 +-
 chainforge/react-server/src/PromptNode.tsx    |  35 +++-
 .../react-server/src/backend/backend.ts       |   2 +
 6 files changed, 193 insertions(+), 54 deletions(-)

diff --git a/chainforge/flask_app.py b/chainforge/flask_app.py
index 8ec97925e..c954fbc70 100644
--- a/chainforge/flask_app.py
+++ b/chainforge/flask_app.py
@@ -759,6 +759,17 @@ def get_flow(filename):
     except FileNotFoundError:
         return jsonify({"error": "Flow not found"}), 404
 
+@app.route('/api/flowExists/<filename>', methods=['GET'])
+def get_flow_exists(filename):
+    """Return whether a flow file with the given name exists"""
+    if not filename.endswith('.cforge'):
+        filename += '.cforge'
+    try:
+        is_file = os.path.isfile(os.path.join(FLOWS_DIR, filename))
+        return jsonify({"exists": is_file})
+    except FileNotFoundError:
+        return jsonify({"error": "Flow not found"}), 404
+
 @app.route('/api/flows/<filename>', methods=['DELETE'])
 def delete_flow(filename):
     """Delete a flow"""
@@ -781,11 +792,18 @@ def save_or_rename_flow(filename):
     if data.get('flow'):
         # Save flow (overwriting any existing flow file with the same name)
         flow_data = data.get('flow')
+        also_autosave = data.get('alsoAutosave')
         
         try:
             filepath = os.path.join(FLOWS_DIR, filename)
             with open(filepath, 'w') as f:
                 json.dump(flow_data, f)
+
+            # If we should also autosave, then attempt to overwrite the autosave cache file:
+            if also_autosave:
+                autosave_filepath = os.path.join(FLOWS_DIR, '__autosave.cforge')
+                shutil.copy2(filepath, autosave_filepath)  # copy the file to __autosave
+
             return jsonify({"message": f"Flow '{filename}' saved!"})
         except FileNotFoundError:
             return jsonify({"error": f"Could not save flow '{filename}' to local filesystem. See terminal for more details."}), 404
diff --git a/chainforge/react-server/src/App.tsx b/chainforge/react-server/src/App.tsx
index 309f2f9dd..c41f451c4 100644
--- a/chainforge/react-server/src/App.tsx
+++ b/chainforge/react-server/src/App.tsx
@@ -63,6 +63,7 @@ import {
   getDefaultModelSettings,
 } from "./ModelSettingSchemas";
 import { v4 as uuid } from "uuid";
+import axios from "axios";
 import LZString from "lz-string";
 import { EXAMPLEFLOW_1 } from "./example_flows";
 
@@ -78,7 +79,11 @@ import "lazysizes/plugins/attrchange/ls.attrchange";
 import { shallow } from "zustand/shallow";
 import useStore, { StoreHandles } from "./store";
 import StorageCache, { StringLookup } from "./backend/cache";
-import { APP_IS_RUNNING_LOCALLY, browserTabIsActive } from "./backend/utils";
+import {
+  APP_IS_RUNNING_LOCALLY,
+  browserTabIsActive,
+  FLASK_BASE_URL,
+} from "./backend/utils";
 import { Dict, JSONCompatible, LLMSpec } from "./backend/typing";
 import {
   ensureUniqueFlowFilename,
@@ -113,6 +118,14 @@ const IS_ACCEPTED_BROWSER =
 // we have access to the Flask backend for, e.g., Python code evaluation.
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
+const SAVE_FLOW_FILENAME_TO_BROWSER_CACHE = (name: string) => {
+  console.log("Saving flow filename", name);
+  // Save the current filename of the user's working flow
+  StorageCache.saveToLocalStorage("chainforge-cur-file", {
+    flowFileName: name,
+  });
+};
+
 const selector = (state: StoreHandles) => ({
   nodes: state.nodes,
   edges: state.edges,
@@ -266,6 +279,11 @@ const App = () => {
   const safeSetFlowFileName = useCallback(async (newName: string) => {
     const uniqueName = await ensureUniqueFlowFilename(newName);
     setFlowFileName(uniqueName);
+    SAVE_FLOW_FILENAME_TO_BROWSER_CACHE(uniqueName);
+  }, []);
+  const setFlowFileNameAndCache = useCallback((newName: string) => {
+    setFlowFileName(newName);
+    SAVE_FLOW_FILENAME_TO_BROWSER_CACHE(newName);
   }, []);
 
   // For 'share' button
@@ -387,6 +405,7 @@ const App = () => {
       flowData?: unknown,
       saveToLocalFilesystem?: string,
       hideErrorAlert?: boolean,
+      onError?: () => void,
     ) => {
       if (!rfInstance && !flowData) return;
 
@@ -406,11 +425,16 @@ const App = () => {
           // Save!
           const flowFile = `${saveToLocalFilesystem ?? flowFileName}.cforge`;
           if (saveToLocalFilesystem !== undefined)
-            return saveFlowToLocalFilesystem(flow_and_cache, flowFile);
+            return saveFlowToLocalFilesystem(
+              flow_and_cache,
+              flowFile,
+              saveToLocalFilesystem !== "__autosave",
+            );
           // @ts-expect-error The exported RF instance is JSON compatible but TypeScript won't read it as such.
           else downloadJSON(flow_and_cache, flowFile);
         })
         .catch((err) => {
+          if (onError) onError();
           if (hideErrorAlert) console.error(err);
           else handleError(err);
         });
@@ -432,14 +456,18 @@ const App = () => {
       setShowSaveSuccess(false);
 
       startSaveTransition(() => {
-        // NOTE: This currently only saves the front-end state. Cache files
-        // are not pulled or overwritten upon loading from localStorage.
+        // Get current flow state
         const flow = rf.toObject();
-        StorageCache.saveToLocalStorage("chainforge-flow", flow);
 
-        // Attempt to save the current state of the back-end state,
-        // the StorageCache. (This does LZ compression to save space.)
-        StorageCache.saveToLocalStorage("chainforge-state");
+        const saveToLocalStorage = () => {
+          // This line only saves the front-end state. Cache files
+          // are not pulled or overwritten upon loading from localStorage.
+          StorageCache.saveToLocalStorage("chainforge-flow", flow);
+
+          // Attempt to save the current back-end state,
+          // in the StorageCache. (This does LZ compression to save space.)
+          StorageCache.saveToLocalStorage("chainforge-state");
+        };
 
         const onFlowSaved = () => {
           console.log("Flow saved!");
@@ -452,10 +480,18 @@ const App = () => {
         // If running locally, aattempt to save a copy of the flow to the lcoal filesystem,
         // so it shows up in the list of saved flows.
         if (IS_RUNNING_LOCALLY)
-          exportFlow(flow, fileName ?? flowFileName, hideErrorAlert)?.then(
-            onFlowSaved,
-          );
-        else onFlowSaved();
+          // SAVE TO LOCAL FILESYSTEM (only), and if that fails, try to save to localStorage
+          exportFlow(
+            flow,
+            fileName ?? flowFileName,
+            hideErrorAlert,
+            saveToLocalStorage,
+          )?.then(onFlowSaved);
+        else {
+          // SAVE TO BROWSER LOCALSTORAGE
+          saveToLocalStorage();
+          onFlowSaved();
+        }
       });
     },
     [rfInstance, exportFlow, flowFileName],
@@ -475,8 +511,13 @@ const App = () => {
 
   // Initialize auto-saving
   const initAutosaving = useCallback(
-    (rf_inst: ReactFlowInstance) => {
-      if (autosavingInterval !== undefined) return; // autosaving interval already set
+    (rf_inst: ReactFlowInstance, reinit?: boolean) => {
+      if (autosavingInterval !== undefined) {
+        // Autosaving interval already set
+        if (reinit)
+          clearInterval(autosavingInterval); // reinitialize interval, clearing the current one
+        else return; // do nothing
+      }
       console.log("Init autosaving");
 
       // Autosave the flow to localStorage every minute:
@@ -539,7 +580,9 @@ const App = () => {
     StorageCache.clear();
 
     // New flow filename
-    setFlowFileName(`flow-${Date.now()}`);
+    const new_filename = `flow-${Date.now()}`;
+    setFlowFileNameAndCache(new_filename);
+
     if (rfInstance) rfInstance.setViewport({ x: 200, y: 80, zoom: 1 });
   }, [setNodes, setEdges, resetLLMColors, rfInstance]);
 
@@ -575,7 +618,7 @@ const App = () => {
       }, 10);
 
       // Start auto-saving, if it's not already enabled
-      if (rf_inst) initAutosaving(rf_inst);
+      if (rf_inst) initAutosaving(rf_inst, true);
     },
     [resetLLMColors, setNodes, setEdges, initAutosaving],
   );
@@ -584,23 +627,28 @@ const App = () => {
     importState(StorageCache.getAllMatching((key) => key.startsWith("r.")));
   }, [importState]);
 
-  const autosavedFlowExists = useCallback(() => {
-    return window.localStorage.getItem("chainforge-flow") !== null;
-  }, []);
-  const loadFlowFromAutosave = useCallback(
-    async (rf_inst: ReactFlowInstance) => {
-      const saved_flow = StorageCache.loadFromLocalStorage(
-        "chainforge-flow",
-        false,
-      ) as Dict;
-      if (saved_flow) {
-        StorageCache.loadFromLocalStorage("chainforge-state", true);
-        importGlobalStateFromCache();
-        loadFlow(saved_flow, rf_inst);
+  // Find the autosaved flow, if it exists, returning
+  // whether it exists and the location ("browser" or "filesystem") that it exists at.
+  const autosavedFlowExists = useCallback(async () => {
+    if (IS_RUNNING_LOCALLY) {
+      // If running locally, we try to fetch a flow autosaved on the user's local machine first:
+      try {
+        const response = await axios.get(
+          `${FLASK_BASE_URL}api/flowExists/__autosave`,
+        );
+        const autosave_file_exists = response.data.exists as boolean;
+        if (autosave_file_exists)
+          return { exists: autosave_file_exists, location: "filesystem" };
+      } catch (error) {
+        // Soft fail, continuing onwards to checking localStorage instead
       }
-    },
-    [importGlobalStateFromCache, loadFlow],
-  );
+    }
+
+    return {
+      exists: window.localStorage.getItem("chainforge-flow") !== null,
+      location: "browser",
+    };
+  }, []);
 
   // Import data to the cache stored on the local filesystem (in backend)
   const handleImportCache = useCallback(
@@ -715,6 +763,38 @@ const App = () => {
     fetchOpenAIEval(evalname).then(importFlowFromJSON).catch(handleError);
   };
 
+  const loadFlowFromAutosave = useCallback(
+    async (rf_inst: ReactFlowInstance, fromFilesystem?: boolean) => {
+      if (fromFilesystem) {
+        // From local filesystem
+        // Fetch the flow
+        const response = await axios.get(
+          `${FLASK_BASE_URL}api/flows/__autosave`,
+        );
+
+        // Attempt to load flow into the UI
+        try {
+          importFlowFromJSON(response.data, rf_inst);
+          console.log("Loaded flow from autosave on local machine.");
+        } catch (error) {
+          handleError(error as Error);
+        }
+      } else {
+        // From browser localStorage
+        const saved_flow = StorageCache.loadFromLocalStorage(
+          "chainforge-flow",
+          false,
+        ) as Dict;
+        if (saved_flow) {
+          StorageCache.loadFromLocalStorage("chainforge-state", true);
+          importGlobalStateFromCache();
+          loadFlow(saved_flow, rf_inst);
+        }
+      }
+    },
+    [importGlobalStateFromCache, loadFlow, importFlowFromJSON, handleError],
+  );
+
   // Load flow from examples modal
   const onSelectExampleFlow = (name: string, example_category?: string) => {
     // Trigger the 'loading' modal
@@ -723,7 +803,7 @@ const App = () => {
     // Detect a special category of the example flow, and use the right loader for it:
     if (example_category === "openai-eval") {
       importFlowFromOpenAIEval(name);
-      setFlowFileName(`flow-${Date.now()}`);
+      setFlowFileNameAndCache(`flow-${Date.now()}`);
       return;
     }
 
@@ -732,7 +812,7 @@ const App = () => {
       .then(function (flowJSON) {
         // We have the data, import it:
         importFlowFromJSON(flowJSON);
-        setFlowFileName(`flow-${Date.now()}`);
+        setFlowFileNameAndCache(`flow-${Date.now()}`);
       })
       .catch(handleError);
   };
@@ -871,6 +951,20 @@ const App = () => {
               err.message,
             );
           });
+
+        // We also need to fetch the current flowFileName
+        // Attempt to get the last working filename on component mount
+        const last_working_flow_filename = StorageCache.loadFromLocalStorage(
+          "chainforge-cur-file",
+        );
+        if (
+          last_working_flow_filename &&
+          typeof last_working_flow_filename === "object" &&
+          "flowFileName" in last_working_flow_filename
+        ) {
+          // Use last working flow name
+          setFlowFileName(last_working_flow_filename.flowFileName as string);
+        }
       } else {
         // Check if there's a shared flow UID in the URL as a GET param
         // If so, we need to look it up in the database and attempt to load it:
@@ -910,14 +1004,19 @@ const App = () => {
       }
 
       // Attempt to load an autosaved flow, if one exists:
-      if (autosavedFlowExists()) loadFlowFromAutosave(rf_inst);
-      else {
-        // Load an interesting default starting flow for new users
-        importFlowFromJSON(EXAMPLEFLOW_1, rf_inst);
-
-        // Open a welcome pop-up
-        // openWelcomeModal();
-      }
+      autosavedFlowExists().then(({ exists, location }) => {
+        if (!exists) {
+          // Load an interesting default starting flow for new users
+          importFlowFromJSON(EXAMPLEFLOW_1, rf_inst);
+
+          // Open a welcome pop-up
+          // openWelcomeModal();
+        } else if (location === "browser") {
+          loadFlowFromAutosave(rf_inst, false);
+        } else if (location === "filesystem") {
+          loadFlowFromAutosave(rf_inst, true);
+        }
+      });
 
       // Turn off loading wheel
       setIsLoading(false);
@@ -1218,7 +1317,9 @@ const App = () => {
       <FlowSidebar
         currentFlow={flowFileName}
         onLoadFlow={(flowData, name) => {
-          if (name !== undefined) setFlowFileName(name);
+          if (name !== undefined) {
+            setFlowFileNameAndCache(name);
+          }
           if (flowData !== undefined) {
             try {
               importFlowFromJSON(flowData);
@@ -1231,7 +1332,7 @@ const App = () => {
         }}
       />
     );
-  }, [flowFileName, importFlowFromJSON, showAlert]);
+  }, [flowFileName, importFlowFromJSON, showAlert, setFlowFileNameAndCache]);
 
   if (!IS_ACCEPTED_BROWSER) {
     return (
diff --git a/chainforge/react-server/src/FlowSidebar.tsx b/chainforge/react-server/src/FlowSidebar.tsx
index c95170851..e9612b358 100644
--- a/chainforge/react-server/src/FlowSidebar.tsx
+++ b/chainforge/react-server/src/FlowSidebar.tsx
@@ -24,7 +24,6 @@ import {
   Tooltip,
 } from "@mantine/core";
 import { FLASK_BASE_URL } from "./backend/utils";
-import { ensureUniqueFlowFilename } from "./backend/backend";
 
 interface FlowFile {
   name: string;
diff --git a/chainforge/react-server/src/ItemsNode.tsx b/chainforge/react-server/src/ItemsNode.tsx
index 16e6b3db4..0f0198ed9 100644
--- a/chainforge/react-server/src/ItemsNode.tsx
+++ b/chainforge/react-server/src/ItemsNode.tsx
@@ -55,7 +55,7 @@ const ItemsNode: React.FC<ItemsNodeProps> = ({ data, id }) => {
   const flags = useStore((state) => state.flags);
 
   const [contentDiv, setContentDiv] = useState<React.ReactNode | null>(null);
-  const [isEditing, setIsEditing] = useState(true);
+  const [isEditing, setIsEditing] = useState(false);
   const [csvInput, setCsvInput] = useState<React.ReactNode | null>(null);
   const [countText, setCountText] = useState<React.ReactNode | null>(null);
 
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index bbfa49873..a4f533106 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -98,6 +98,14 @@ const bucketChatHistoryInfosByLLM = (chat_hist_infos: ChatHistoryInfo[]) => {
   });
   return chats_by_llm;
 };
+const getRootPromptFor = (
+  promptTexts: string | string[],
+  varNameForRootTemplate: string,
+) => {
+  if (typeof promptTexts === "string") return promptTexts;
+  else if (promptTexts.length === 1) return promptTexts[0];
+  else return `{${varNameForRootTemplate}}`;
+};
 
 export class PromptInfo {
   prompt: string;
@@ -929,7 +937,7 @@ Soft failing by replacing undefined with empty strings.`,
 
     // Pull the data to fill in template input variables, if any
     let pulled_data: Dict<(string | TemplateVarInfo)[]> = {};
-    let var_for_prompt_templates: string | undefined;
+    let var_for_prompt_templates: string;
     try {
       // Try to pull inputs
       pulled_data = pullInputData(templateVars, id);
@@ -939,7 +947,7 @@ Soft failing by replacing undefined with empty strings.`,
         "prompt",
         Object.keys(pulled_data),
       );
-      if (typeof promptText !== "string")
+      if (typeof promptText !== "string" && promptText.length > 1)
         pulled_data[var_for_prompt_templates] = promptText; // this will be filled in when calling queryLLMs
     } catch (err) {
       if (showAlert) showAlert((err as Error)?.message ?? err);
@@ -1066,9 +1074,7 @@ Soft failing by replacing undefined with empty strings.`,
         id,
         _llmItemsCurrState,
         numGenerations,
-        typeof prompt_template === "string"
-          ? prompt_template
-          : `{${var_for_prompt_templates}}`, // Use special root prompt if there's multiple prompt variants
+        getRootPromptFor(prompt_template, var_for_prompt_templates), // Use special root prompt if there's multiple prompt variants
         pulled_data,
         chat_hist_by_llm,
         apiKeys || {},
@@ -1277,6 +1283,16 @@ Soft failing by replacing undefined with empty strings.`,
 
   // Dynamically update the textareas and position of the template hooks
   const textAreaRef = useRef<HTMLTextAreaElement | HTMLDivElement | null>(null);
+  const resizeTextarea = () => {
+    const textarea = textAreaRef.current;
+
+    if (textarea) {
+      textarea.style.height = "auto"; // Reset height to shrink if needed
+      const newHeight = Math.min(textarea.scrollHeight, 600);
+      textarea.style.height = `${newHeight}px`;
+    }
+  };
+
   const [hooksY, setHooksY] = useState(138);
   const setRef = useCallback(
     (elem: HTMLDivElement | HTMLTextAreaElement | null) => {
@@ -1325,6 +1341,7 @@ Soft failing by replacing undefined with empty strings.`,
         Math.min(prompts.length - 1, idxPromptVariantShown + shift),
       ); // clamp
       setIdxPromptVariantShown(newIdx);
+      resizeTextarea();
     },
     [promptText, idxPromptVariantShown],
   );
@@ -1340,6 +1357,7 @@ Soft failing by replacing undefined with empty strings.`,
         // We have to force an update here since idxPromptVariantShown might've not changed
         // @ts-expect-error Mantine has a 'value' property on Textareas, but TypeScript doesn't know this
         textAreaRef.current.value = prompts[newIdx];
+        resizeTextarea();
       }
 
       return [...prompts];
@@ -1351,6 +1369,7 @@ Soft failing by replacing undefined with empty strings.`,
     if (textAreaRef.current && Array.isArray(promptText)) {
       // @ts-expect-error Mantine has a 'value' property on Textareas, but TypeScript doesn't know this
       textAreaRef.current.value = promptText[idxPromptVariantShown];
+      resizeTextarea();
     }
   }, [idxPromptVariantShown]);
 
@@ -1399,7 +1418,7 @@ Soft failing by replacing undefined with empty strings.`,
             <ActionIcon
               size="xs"
               c="black"
-              mr={2}
+              mr={3}
               onClick={() => gotoPromptVariant(1)}
             >
               <IconArrowRight size={19} />
@@ -1540,9 +1559,9 @@ Soft failing by replacing undefined with empty strings.`,
       ) : (
         <Textarea
           ref={setRef}
-          autosize
+          // autosize
           className="prompt-field-fixed nodrag nowheel"
-          minRows={4}
+          minRows={5}
           maxRows={12}
           defaultValue={
             typeof data.prompt === "string"
diff --git a/chainforge/react-server/src/backend/backend.ts b/chainforge/react-server/src/backend/backend.ts
index 20d25bac0..988c837fa 100644
--- a/chainforge/react-server/src/backend/backend.ts
+++ b/chainforge/react-server/src/backend/backend.ts
@@ -711,10 +711,12 @@ export async function fetchEnvironAPIKeys(): Promise<Dict<string>> {
 export async function saveFlowToLocalFilesystem(
   flowJSON: Dict,
   filename: string,
+  alsoAutosave: boolean,
 ): Promise<void> {
   try {
     await axios.put(`${FLASK_BASE_URL}api/flows/${filename}`, {
       flow: flowJSON,
+      alsoAutosave: alsoAutosave,
     });
   } catch (error) {
     throw new Error(

From cb9f3bf9365937c5b67be836dc40c303a65c3ab4 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Fri, 14 Mar 2025 19:57:41 -0400
Subject: [PATCH 04/35] cleanup unused imports

---
 chainforge/react-server/src/App.tsx              | 1 -
 chainforge/react-server/src/LLMListComponent.tsx | 5 +----
 chainforge/react-server/src/MultiEvalNode.tsx    | 8 ++++----
 chainforge/react-server/src/PromptNode.tsx       | 2 --
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/chainforge/react-server/src/App.tsx b/chainforge/react-server/src/App.tsx
index c41f451c4..9ca353cd4 100644
--- a/chainforge/react-server/src/App.tsx
+++ b/chainforge/react-server/src/App.tsx
@@ -6,7 +6,6 @@ import React, {
   useContext,
   useMemo,
   useTransition,
-  KeyboardEventHandler,
   KeyboardEvent,
 } from "react";
 import ReactFlow, { Controls, Background, ReactFlowInstance } from "reactflow";
diff --git a/chainforge/react-server/src/LLMListComponent.tsx b/chainforge/react-server/src/LLMListComponent.tsx
index 4aac22daa..f111443a4 100644
--- a/chainforge/react-server/src/LLMListComponent.tsx
+++ b/chainforge/react-server/src/LLMListComponent.tsx
@@ -23,10 +23,7 @@ import { StrictModeDroppable } from "./StrictModeDroppable";
 import ModelSettingsModal, {
   ModelSettingsModalRef,
 } from "./ModelSettingsModal";
-import {
-  getDefaultModelFormData,
-  getDefaultModelSettings,
-} from "./ModelSettingSchemas";
+import { getDefaultModelSettings } from "./ModelSettingSchemas";
 import useStore, { initLLMProviders, initLLMProviderMenu } from "./store";
 import { Dict, JSONCompatible, LLMGroup, LLMSpec } from "./backend/typing";
 import { useContextMenu } from "mantine-contextmenu";
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 10db55e4f..9f15cea7d 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -221,10 +221,10 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   const bringNodeToFront = useStore((state) => state.bringNodeToFront);
   const inputEdgesForNode = useStore((state) => state.inputEdgesForNode);
 
-  const flags = useStore((state) => state.flags);
-  const AI_SUPPORT_ENABLED = useMemo(() => {
-    return flags.aiSupport;
-  }, [flags]);
+  // const flags = useStore((state) => state.flags);
+  // const AI_SUPPORT_ENABLED = useMemo(() => {
+  //   return flags.aiSupport;
+  // }, [flags]);
 
   const [status, setStatus] = useState<Status>(Status.NONE);
   // For displaying error messages to user
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index a4f533106..ce811a0c0 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -18,12 +18,10 @@ import {
   Modal,
   Box,
   Tooltip,
-  Group,
   Flex,
   Button,
   ActionIcon,
   Divider,
-  Title,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import {

From 66d067b1e9f8e01f1ed677f72bfce43f71546893 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Fri, 14 Mar 2025 22:50:40 -0400
Subject: [PATCH 05/35] Add 'human verification' icons beside eval scores in
 Response Inspector

---
 .../react-server/src/LLMResponseInspector.tsx |  3 +-
 chainforge/react-server/src/ResponseBoxes.tsx | 74 ++++++++++++++++++-
 .../react-server/src/text-fields-node.css     | 34 +++++++++
 3 files changed, 107 insertions(+), 4 deletions(-)

diff --git a/chainforge/react-server/src/LLMResponseInspector.tsx b/chainforge/react-server/src/LLMResponseInspector.tsx
index 944736f5b..10994b83a 100644
--- a/chainforge/react-server/src/LLMResponseInspector.tsx
+++ b/chainforge/react-server/src/LLMResponseInspector.tsx
@@ -52,6 +52,7 @@ import {
   genDebounceFunc,
 } from "./backend/utils";
 import {
+  EvalResultDisplay,
   ResponseBox,
   ResponseGroup,
   genResponseTextsDisplay,
@@ -839,7 +840,7 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
                 <Stack spacing={0}>
                   {(val.data as [string | JSX.Element, string][]).map(
                     (e, i) => (
-                      <div key={i}>{e[0]}</div>
+                      <EvalResultDisplay evalResultDivOrStr={e[0]} key={i} />
                     ),
                   )}
                 </Stack>
diff --git a/chainforge/react-server/src/ResponseBoxes.tsx b/chainforge/react-server/src/ResponseBoxes.tsx
index 2c23eae53..9f2093545 100644
--- a/chainforge/react-server/src/ResponseBoxes.tsx
+++ b/chainforge/react-server/src/ResponseBoxes.tsx
@@ -1,5 +1,5 @@
-import React, { Suspense, useMemo, lazy } from "react";
-import { Collapse, Flex, Stack } from "@mantine/core";
+import React, { Suspense, useMemo, lazy, useState } from "react";
+import { ActionIcon, Collapse, Flex, Stack, Tooltip } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import { llmResponseDataToString, truncStr } from "./backend/utils";
 import {
@@ -10,6 +10,7 @@ import {
   StringOrHash,
 } from "./backend/typing";
 import { StringLookup } from "./backend/cache";
+import { IconCheck, IconChecks, IconX } from "@tabler/icons-react";
 
 // Lazy load the response toolbars
 const ResponseRatingToolbar = lazy(() => import("./ResponseRatingToolbar"));
@@ -83,6 +84,69 @@ export const getEvalResultStr = (
   }
 };
 
+export const EvalResultDisplay = ({
+  evalResultDivOrStr,
+}: {
+  evalResultDivOrStr: JSX.Element | string;
+}) => {
+  const [grade, setGrade] = useState<boolean | null>(null);
+
+  return (
+    <div className="eval-score">
+      {evalResultDivOrStr}
+      {grade === null && (
+        <Flex className="eval-vote-icons">
+          <ActionIcon variant="transparent" onClick={() => setGrade(true)}>
+            <IconCheck className="eval-vote-icon" size={20} />
+          </ActionIcon>
+          <ActionIcon variant="transparent" onClick={() => setGrade(false)}>
+            <IconX className="eval-vote-icon" size={20} />
+          </ActionIcon>
+        </Flex>
+      )}
+      {grade !== null && (
+        <Flex className="eval-vote-chosen">
+          {grade === true && (
+            <Tooltip
+              label="Human-verified eval score"
+              withArrow
+              arrowSize={8}
+              withinPortal
+            >
+              <ActionIcon variant="transparent" onClick={() => setGrade(null)}>
+                <IconChecks
+                  color="#666"
+                  stroke={2}
+                  className="eval-vote-icon"
+                  size={20}
+                />
+              </ActionIcon>
+            </Tooltip>
+          )}
+          {grade === false && (
+            <Tooltip
+              label="Human marked this eval score as incorrect"
+              multiline
+              withArrow
+              arrowSize={8}
+              withinPortal
+            >
+              <ActionIcon variant="transparent" onClick={() => setGrade(null)}>
+                <IconX
+                  color="red"
+                  stroke={4}
+                  className="eval-vote-icon"
+                  size={20}
+                />
+              </ActionIcon>
+            </Tooltip>
+          )}
+        </Flex>
+      )}
+    </div>
+  );
+};
+
 const countResponsesBy = (
   responses: LLMResponseData[],
   keyFunc: (item: LLMResponseData) => string,
@@ -295,7 +359,11 @@ export const genResponseTextsDisplay = (
         )}
         {eval_res_items ? (
           <p className="small-response-metrics">
-            {getEvalResultStr(resp_str_to_eval_res[r], true)[0]}
+            <EvalResultDisplay
+              evalResultDivOrStr={
+                getEvalResultStr(resp_str_to_eval_res[r], true)[0]
+              }
+            />
           </p>
         ) : (
           <></>
diff --git a/chainforge/react-server/src/text-fields-node.css b/chainforge/react-server/src/text-fields-node.css
index 8274f31ab..719cf0a96 100644
--- a/chainforge/react-server/src/text-fields-node.css
+++ b/chainforge/react-server/src/text-fields-node.css
@@ -240,6 +240,40 @@ button.remove-edge-btn:active {
 .eval-inspect-response-footer button {
   cursor: zoom-in;
 }
+
+.eval-score {
+  /* background-color: #eee; */
+  /* padding: 8px; */
+}
+
+.eval-vote-icon {
+  color: #999;
+}
+
+.eval-vote-icon:hover {
+  color: #333;
+}
+
+.eval-vote-icons {
+  display: none;
+  position: absolute;
+  /* opacity: 0.3; */
+  margin-left: 6px;
+  margin-top: -6px;
+}
+
+.eval-vote-chosen {
+  display: inline-flex;
+  position: absolute;
+  margin-left: 6px;
+  margin-top: -6px;
+}
+
+.eval-score:hover .eval-vote-icons {
+  display: inline-flex;
+  /* opacity: 1.0; */
+}
+
 /* .blink-inspect-response-footer {
     animation: blink-footer 0.7s ease-in 1;
   }

From 7f77b73fc1f6087f8252b97d8f56cdee6af9855c Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 12:43:44 -0400
Subject: [PATCH 06/35] Eval score assessments save. Boolean assessments flip
 when the eval flips.

---
 .../react-server/src/LLMResponseInspector.tsx |  71 ++++++++---
 chainforge/react-server/src/ResponseBoxes.tsx | 110 +++++++++++++++---
 .../src/ResponseRatingToolbar.tsx             |   5 +-
 3 files changed, 156 insertions(+), 30 deletions(-)

diff --git a/chainforge/react-server/src/LLMResponseInspector.tsx b/chainforge/react-server/src/LLMResponseInspector.tsx
index 10994b83a..ec1cd2f3e 100644
--- a/chainforge/react-server/src/LLMResponseInspector.tsx
+++ b/chainforge/react-server/src/LLMResponseInspector.tsx
@@ -61,6 +61,7 @@ import {
 import { getLabelForResponse } from "./ResponseRatingToolbar";
 import {
   Dict,
+  EvaluationScore,
   LLMResponse,
   LLMResponseData,
   isImageResponseData,
@@ -696,24 +697,46 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               const val = resp_objs[0].metavars[v];
               return val !== undefined ? val : "(unspecified)";
             });
-            let eval_cols_vals: [string | JSX.Element, string][][] = [];
+            let eval_cols_vals: [
+              string | JSX.Element,
+              string,
+              string,
+              EvaluationScore | undefined,
+            ][][] = [];
             if (eval_res_cols && eval_res_cols.length > 0) {
               // We can assume that there's only one response object, since to
               // if eval_res_cols is set, there must be only one LLM.
               eval_cols_vals = eval_res_cols.map((metric_name, metric_idx) => {
                 const items = resp_objs[0].eval_res?.items;
-                if (!items) return [["(no result)", "(no result)"]];
+                const uid = resp_objs[0].uid;
+                if (!items)
+                  return [["(no result)", "(no result)", uid, undefined]];
                 return items.map((item) => {
-                  if (item === undefined) return ["(undefined)", "(undefined)"];
+                  if (item === undefined)
+                    return ["(undefined)", "(undefined)", uid, item];
                   if (
                     typeof item !== "object" &&
                     metric_idx === 0 &&
                     metric_name === "Score"
                   )
-                    return getEvalResultStr(item, true);
+                    return [...getEvalResultStr(item, true), uid, item] as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore,
+                    ];
                   else if (typeof item === "object" && metric_name in item)
-                    return getEvalResultStr(item[metric_name], true);
-                  else return ["(unspecified)", "(unspecified)"];
+                    return [
+                      ...getEvalResultStr(item[metric_name], true),
+                      uid,
+                      item[metric_name],
+                    ] as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore,
+                    ];
+                  else return ["(unspecified)", "(unspecified)", uid, item];
                 }); // treat n>1 resps per prompt as multi-line results in the column
               });
             }
@@ -750,7 +773,15 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               | undefined
               | LLMResponse[]
               | LLMResponseData[]
-              | { type: "eval"; data: (string | JSX.Element)[][] }
+              | {
+                  type: "eval";
+                  data: (
+                    | string
+                    | JSX.Element
+                    | EvaluationScore
+                    | undefined
+                  )[][];
+                }
             > = {};
             let vals_arr_start_idx = 0;
             var_cols_vals.forEach((v, i) => {
@@ -838,11 +869,22 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
             else if ("type" in val && val.type === "eval") {
               return (
                 <Stack spacing={0}>
-                  {(val.data as [string | JSX.Element, string][]).map(
-                    (e, i) => (
-                      <EvalResultDisplay evalResultDivOrStr={e[0]} key={i} />
-                    ),
-                  )}
+                  {(
+                    val.data as [
+                      string | JSX.Element,
+                      string,
+                      string,
+                      EvaluationScore | undefined,
+                    ][]
+                  ).map((e, i) => (
+                    <EvalResultDisplay
+                      uid={e[2]}
+                      evalRes={e[3]}
+                      evalResIdx={i}
+                      evalResultDivOrStr={e[0]}
+                      key={i}
+                    />
+                  ))}
                 </Stack>
               );
             } else
@@ -947,7 +989,10 @@ const LLMResponseInspector: React.FC<LLMResponseInspectorProps> = ({
               <div
                 key={"l" + leaf_id}
                 className={className}
-                style={{ backgroundColor: rgroup_color(eatenvars.length) }}
+                style={{
+                  backgroundColor: rgroup_color(eatenvars.length),
+                  position: "relative",
+                }}
               >
                 <ResponseGroup
                   header={header}
diff --git a/chainforge/react-server/src/ResponseBoxes.tsx b/chainforge/react-server/src/ResponseBoxes.tsx
index 9f2093545..2b1e06504 100644
--- a/chainforge/react-server/src/ResponseBoxes.tsx
+++ b/chainforge/react-server/src/ResponseBoxes.tsx
@@ -1,7 +1,14 @@
-import React, { Suspense, useMemo, lazy, useState } from "react";
+import React, {
+  Suspense,
+  useMemo,
+  lazy,
+  useState,
+  useCallback,
+  useEffect,
+} from "react";
 import { ActionIcon, Collapse, Flex, Stack, Tooltip } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
-import { llmResponseDataToString, truncStr } from "./backend/utils";
+import { deepcopy, llmResponseDataToString, truncStr } from "./backend/utils";
 import {
   Dict,
   EvaluationScore,
@@ -9,8 +16,10 @@ import {
   LLMResponseData,
   StringOrHash,
 } from "./backend/typing";
-import { StringLookup } from "./backend/cache";
+import StorageCache, { StringLookup } from "./backend/cache";
 import { IconCheck, IconChecks, IconX } from "@tabler/icons-react";
+import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
+import useStore from "./store";
 
 // Lazy load the response toolbars
 const ResponseRatingToolbar = lazy(() => import("./ResponseRatingToolbar"));
@@ -84,36 +93,101 @@ export const getEvalResultStr = (
   }
 };
 
+interface EvalResultAssessment {
+  correct: boolean | null;
+  // The original eval score that the user gave feedback on.
+  // If the underlying score changes, i.e. on subsequent runs after changing the evaluator,
+  // we need to be able to invalidate the user's assessment (or flip it automatically, in the case of boolean values).
+  orig_score?: EvaluationScore;
+  feedback?: string | null;
+}
+
 export const EvalResultDisplay = ({
+  uid, // the response uid
+  evalResIdx, // the index of the eval result in the array
+  evalRes, // the score of the eval result
   evalResultDivOrStr,
 }: {
+  uid: string;
+  evalResIdx: number;
+  evalRes?: EvaluationScore;
   evalResultDivOrStr: JSX.Element | string;
 }) => {
-  const [grade, setGrade] = useState<boolean | null>(null);
+  // The cache key storing the ratings for this user score
+  const evalResultAssessmentKey = useMemo(
+    () => getRatingKeyForResponse(uid, "metaeval") + `.${evalResIdx}`,
+    [uid, evalResIdx],
+  );
+
+  // The current rating states, reading from the global store.
+  // :: This ensures refreshes will occur only on this component, only when the rating
+  // :: for this component changes.
+  // const state = useStore((store) => store.state);
+  const setState = useStore((store) => store.setState);
+  const userRating = useStore<EvalResultAssessment | undefined>(
+    (store) => store.state[evalResultAssessmentKey],
+  );
+  const setRating = useCallback(
+    (correct: boolean | null, feedback?: string | null) => {
+      const safe_payload = deepcopy({
+        correct,
+        orig_score: evalRes,
+        feedback,
+      } as EvalResultAssessment);
+      setState(evalResultAssessmentKey, safe_payload);
+      StorageCache.store(evalResultAssessmentKey, safe_payload);
+    },
+    [evalResultAssessmentKey, setState, evalRes],
+  );
+
+  // The internal user assessment of this eval result
+  const rating = useMemo(() => userRating?.correct, [userRating]);
+
+  // Upon load, detect if the eval result has changed, if the user had previously assessed it.
+  // If so, either a) invalidate the user's rating or b) if it's a boolean, flip it.
+  useEffect(() => {
+    // If the original eval score wasn't saved, or the user has no rating, continue
+    if (userRating?.orig_score == null || userRating.correct == null) return;
+    const orig_eval_score = userRating.orig_score;
+    if (orig_eval_score !== evalRes) {
+      // The eval score has changed since the user last rated it!
+      if (
+        typeof evalRes === "boolean" &&
+        typeof orig_eval_score === "boolean"
+      ) {
+        // If the eval type was boolean, we can safely flip the user's rating:
+        setRating(!userRating.correct, userRating?.feedback);
+      } else {
+        // We don't know what to do if the score fundamentally changes type or is categorical.
+        // Simply invalidate the user's assessment:
+        setRating(null, null);
+      }
+    }
+  }, [userRating, evalRes]);
 
   return (
     <div className="eval-score">
       {evalResultDivOrStr}
-      {grade === null && (
+      {rating == null && (
         <Flex className="eval-vote-icons">
-          <ActionIcon variant="transparent" onClick={() => setGrade(true)}>
+          <ActionIcon variant="transparent" onClick={() => setRating(true)}>
             <IconCheck className="eval-vote-icon" size={20} />
           </ActionIcon>
-          <ActionIcon variant="transparent" onClick={() => setGrade(false)}>
+          <ActionIcon variant="transparent" onClick={() => setRating(false)}>
             <IconX className="eval-vote-icon" size={20} />
           </ActionIcon>
         </Flex>
       )}
-      {grade !== null && (
+      {rating != null && (
         <Flex className="eval-vote-chosen">
-          {grade === true && (
+          {rating === true && (
             <Tooltip
               label="Human-verified eval score"
               withArrow
               arrowSize={8}
               withinPortal
             >
-              <ActionIcon variant="transparent" onClick={() => setGrade(null)}>
+              <ActionIcon variant="transparent" onClick={() => setRating(null)}>
                 <IconChecks
                   color="#666"
                   stroke={2}
@@ -123,7 +197,7 @@ export const EvalResultDisplay = ({
               </ActionIcon>
             </Tooltip>
           )}
-          {grade === false && (
+          {rating === false && (
             <Tooltip
               label="Human marked this eval score as incorrect"
               multiline
@@ -131,7 +205,7 @@ export const EvalResultDisplay = ({
               arrowSize={8}
               withinPortal
             >
-              <ActionIcon variant="transparent" onClick={() => setGrade(null)}>
+              <ActionIcon variant="transparent" onClick={() => setRating(null)}>
                 <IconX
                   color="red"
                   stroke={4}
@@ -288,10 +362,13 @@ export const genResponseTextsDisplay = (
 
   // Collapse responses with the same texts.
   // We need to keep track of the original evaluation result per response str:
-  const resp_str_to_eval_res: Dict<EvaluationScore> = {};
+  const resp_str_to_eval_res: Dict<[EvaluationScore, number]> = {};
   if (eval_res_items)
     responses.forEach((r, idx) => {
-      resp_str_to_eval_res[llmResponseDataToString(r)] = eval_res_items[idx];
+      resp_str_to_eval_res[llmResponseDataToString(r)] = [
+        eval_res_items[idx],
+        idx,
+      ];
     });
 
   const same_resp_text_counts = countResponsesBy(responses, (r) =>
@@ -360,8 +437,11 @@ export const genResponseTextsDisplay = (
         {eval_res_items ? (
           <p className="small-response-metrics">
             <EvalResultDisplay
+              uid={res_obj.uid}
+              evalRes={resp_str_to_eval_res[r][0]}
+              evalResIdx={resp_str_to_eval_res[r][1]}
               evalResultDivOrStr={
-                getEvalResultStr(resp_str_to_eval_res[r], true)[0]
+                getEvalResultStr(resp_str_to_eval_res[r][0], true)[0]
               }
             />
           </p>
diff --git a/chainforge/react-server/src/ResponseRatingToolbar.tsx b/chainforge/react-server/src/ResponseRatingToolbar.tsx
index a0315ce5a..aa50880d5 100644
--- a/chainforge/react-server/src/ResponseRatingToolbar.tsx
+++ b/chainforge/react-server/src/ResponseRatingToolbar.tsx
@@ -15,12 +15,12 @@ import {
 import StorageCache from "./backend/cache";
 import useStore from "./store";
 import { deepcopy } from "./backend/utils";
-import { LLMResponseData } from "./backend/typing";
 
 type RatingDict = Record<number, boolean | string | undefined>;
 
-const getRatingKeyForResponse = (uid: string, label_name: string) =>
+export const getRatingKeyForResponse = (uid: string, label_name: string) =>
   `r.${uid}.${label_name}`;
+
 const collapse_ratings = (rating_dict: RatingDict, idxs: number[]) => {
   if (rating_dict === undefined) return undefined;
   for (let j = 0; j < idxs.length; j++) {
@@ -33,6 +33,7 @@ const collapse_ratings = (rating_dict: RatingDict, idxs: number[]) => {
 export const getLabelForResponse = (uid: string, label_name: string) => {
   return StorageCache.get(getRatingKeyForResponse(uid, label_name));
 };
+
 export const setLabelForResponse = (
   uid: string,
   label_name: string,

From 9350218f4bc8450f55362cc632ffa63094eed2c2 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:19:12 -0400
Subject: [PATCH 07/35] wip transfer

---
 chainforge/react-server/src/MultiEvalNode.tsx | 162 ++++++++----------
 1 file changed, 69 insertions(+), 93 deletions(-)

diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 9f15cea7d..53fdafd82 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -262,9 +262,18 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
 
   // Add an evaluator to the end of the list
   const addEvaluator = useCallback(
-    (name: string, type: EvaluatorContainerDesc["type"], state: Dict) => {
+    (
+      name: string,
+      type: EvaluatorContainerDesc["type"],
+      state: Dict,
+      initiallyOpen = true,
+    ) => {
       setEvaluators(
-        evaluators.concat({ name, uid: uuid(), type, state, justAdded: true }),
+        // evaluators.concat({ name, uid: uuid(), type, state, justAdded: true }),
+        (e) => [
+          ...e,
+          { name, uid: uuid(), type, state, justAdded: initiallyOpen },
+        ],
       );
     },
     [evaluators],
@@ -299,89 +308,57 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     );
   };
 
-  // const evaluatorComponents = useMemo(() => {
-  //   // evaluatorComponentRefs.current = [];
-
-  //   return evaluators.map((e, idx) => {
-  //     let component: React.ReactNode;
-  //     if (e.type === "python" || e.type === "javascript") {
-  //       component = (
-  //         <CodeEvaluatorComponent
-  //           ref={(el) =>
-  //             (evaluatorComponentRefs.current[idx] = {
-  //               type: "code",
-  //               name: e.name,
-  //               ref: el,
-  //             })
-  //           }
-  //           code={e.state?.code}
-  //           progLang={e.type}
-  //           type="evaluator"
-  //           id={id}
-  //           onCodeEdit={(code) =>
-  //             updateEvalState(idx, (e) => (e.state.code = code))
-  //           }
-  //           showUserInstruction={false}
-  //         />
-  //       );
-  //     } else if (e.type === "llm") {
-  //       component = (
-  //         <LLMEvaluatorComponent
-  //           ref={(el) =>
-  //             (evaluatorComponentRefs.current[idx] = {
-  //               type: "llm",
-  //               name: e.name,
-  //               ref: el,
-  //             })
-  //           }
-  //           prompt={e.state?.prompt}
-  //           grader={e.state?.grader}
-  //           format={e.state?.format}
-  //           id={id}
-  //           showUserInstruction={false}
-  //           onPromptEdit={(prompt) =>
-  //             updateEvalState(idx, (e) => (e.state.prompt = prompt))
-  //           }
-  //           onLLMGraderChange={(grader) =>
-  //             updateEvalState(idx, (e) => (e.state.grader = grader))
-  //           }
-  //           onFormatChange={(format) =>
-  //             updateEvalState(idx, (e) => (e.state.format = format))
-  //           }
-  //         />
-  //       );
-  //     } else {
-  //       console.error(
-  //         `Unknown evaluator type ${e.type} inside multi-evaluator node. Cannot display evaluator UI.`,
-  //       );
-  //       component = <Alert>Error: Unknown evaluator type {e.type}</Alert>;
-  //     }
-  //     return (
-  //       <EvaluatorContainer
-  //         name={e.name}
-  //         key={`${e.name}-${idx}`}
-  //         type={EVAL_TYPE_PRETTY_NAME[e.type]}
-  //         progress={e.progress}
-  //         onDelete={() => {
-  //           delete evaluatorComponentRefs.current[idx];
-  //           setEvaluators(evaluators.filter((_, i) => i !== idx));
-  //         }}
-  //         onChangeTitle={(newTitle) =>
-  //           setEvaluators(
-  //             evaluators.map((e, i) => {
-  //               if (i === idx) e.name = newTitle;
-  //               console.log(e);
-  //               return e;
-  //             }),
-  //           )
-  //         }
-  //         padding={e.type === "llm" ? "8px" : undefined}
-  //       >
-  //         {component}
-  //       </EvaluatorContainer>
-  //     );
-  //   });
-  // }, [evaluators, id]);
+  const evalGenModalRef = useRef<EvalGenModalRef>(null);
+  const openEvalGen = () => {
+    const resps = handlePullInputs();
+    evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
+  };
+
+  const onFinalReportsReady = (reports: EvalGenReport) => {
+    // Placeholder for processing the final reports returned from EvalGenModal
+    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final reports", reports);
+    // let kkk = 1;
+    for (const crit of reports.criteria) {
+      // setTimeout(() => {
+      // console.log("crit", crit);
+      if (crit.eval_method === "code") {
+        // Python
+        addEvaluator(
+          crit.shortname,
+          "python",
+          {
+            code: "def evaluate(r):\n\treturn len(r.text)", // to be populated once python code is implemented for the criteria
+            sandbox: true,
+          },
+          false,
+        );
+      } else if (crit.eval_method === "expert") {
+        // LLM
+        addEvaluator(
+          crit.shortname,
+          "llm",
+          {
+            // to be populated once LLM code is implemented for the criteria
+            prompt: "",
+            format: "bin",
+          },
+          false,
+        );
+      } else {
+        // JavaScript
+        addEvaluator(
+          crit.shortname,
+          "javascript",
+          {
+            code: "function evaluate(r) {\n\treturn r.text.length;\n}", // to be populated once javascript code is implemented for the criteria
+          },
+          false,
+        );
+      }
+      // }, kkk * 5000);
+      // kkk++;
+    }
+  };
 
   const handleError = useCallback(
     (err: Error | string) => {
@@ -823,7 +800,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         </Menu>
       </div>
 
-      {/* EvalGen {evaluators && evaluators.length === 0 ? (
+      {evaluators && evaluators.length === 0 ? (
         <Flex justify="center" gap={12} mt="md">
           <Tooltip
             label="Let an AI help you generate criteria and implement evaluation functions."
@@ -831,16 +808,15 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             position="bottom"
             withArrow
           >
-            <Button onClick={onClickPickCriteria} variant="outline" size="xs">
+            <Button onClick={openEvalGen} variant="outline" size="xs">
               <IconSparkles size="11pt" />
-              &nbsp;Generate criteria
+              &nbsp;Generate evals with EvalGen
             </Button>
-          </Tooltip> */}
-      {/* <Button disabled variant='gradient' gradient={{ from: 'teal', to: 'lime', deg: 105 }}><IconSparkles />&nbsp;Validate</Button> */}
-      {/* </Flex>
+          </Tooltip>
+        </Flex>
       ) : (
         <></>
-      )} */}
+      )}
 
       {lastRunSuccess && lastResponses && lastResponses.length > 0 ? (
         <InspectFooter
@@ -872,4 +848,4 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   );
 };
 
-export default MultiEvalNode;
+export default MultiEvalNode;
\ No newline at end of file

From ca72eed3056f1dda2e23caf5a87e8f6c65bc7e31 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:23:55 -0400
Subject: [PATCH 08/35] wip transfer

---
 chainforge/react-server/src/EvalGenModal.tsx  | 1734 +++++++++++++++++
 .../src/backend/evalgen/README.md             |   27 +
 .../src/backend/evalgen/executor.ts           | 1088 +++++++++++
 .../src/backend/evalgen/oai_utils.ts          |  339 ++++
 .../react-server/src/backend/evalgen/test.ts  |  142 ++
 .../src/backend/evalgen/typing.ts             |   78 +
 .../react-server/src/backend/evalgen/utils.ts |  366 ++++
 7 files changed, 3774 insertions(+)
 create mode 100644 chainforge/react-server/src/EvalGenModal.tsx
 create mode 100644 chainforge/react-server/src/backend/evalgen/README.md
 create mode 100644 chainforge/react-server/src/backend/evalgen/executor.ts
 create mode 100644 chainforge/react-server/src/backend/evalgen/oai_utils.ts
 create mode 100644 chainforge/react-server/src/backend/evalgen/test.ts
 create mode 100644 chainforge/react-server/src/backend/evalgen/typing.ts
 create mode 100644 chainforge/react-server/src/backend/evalgen/utils.ts

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
new file mode 100644
index 000000000..ae8b47126
--- /dev/null
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -0,0 +1,1734 @@
+/**
+ * EvalGen 2.0
+ *
+ * Ian Arawjo, Shreya Shankar, J.D. Zamf., Helen Weixu Chen
+ *
+ * This file concerns the front-end to evaluation generator, EvalGen.
+ * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
+ *
+ * Specifically, the modal lets users:
+ *  - make and refine criteria to grade on (on the left)
+ *  - grade responses (on the right)
+ *  - while in the backend, an LLM is generating candidate assertions and selecting the ones most aligned with user grades
+ * As the user grades responses, they add/refine existing criteria.
+ * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
+ * This is because of **criteria drift,** a phenomenon identified while observing users in EvalGen 1.0 (unreleased).
+ *
+ * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
+ * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
+ */
+import React, {
+  ReactNode,
+  forwardRef,
+  useCallback,
+  useEffect,
+  useImperativeHandle,
+  useMemo,
+  useState,
+} from "react";
+import { v4 as uuid } from "uuid";
+import {
+  Accordion,
+  ActionIcon,
+  Box,
+  Button,
+  Card,
+  Center,
+  Checkbox,
+  Code,
+  Collapse,
+  Divider,
+  Flex,
+  Grid,
+  Group,
+  Menu,
+  Modal,
+  Radio,
+  RingProgress,
+  ScrollArea,
+  SimpleGrid,
+  Skeleton,
+  Stack,
+  Text,
+  TextInput,
+  Textarea,
+  Title,
+  Tooltip,
+  rem,
+} from "@mantine/core";
+import { useDisclosure } from "@mantine/hooks";
+import {
+  // CriteriaGradeCount,
+  Dict,
+  LLMResponse,
+  PromptVarsDict,
+  RatingDict,
+  ResponseUID,
+} from "./backend/typing";
+import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
+import {
+  IconChevronDown,
+  IconChevronLeft,
+  IconChevronRight,
+  IconDots,
+  IconRobot,
+  IconStarFilled,
+  IconTerminal2,
+  IconThumbDown,
+  IconThumbUp,
+  IconTrash,
+  IconFlagFilled,
+  IconPencil,
+  IconSparkles,
+} from "@tabler/icons-react";
+import {
+  cleanMetavarsFilterFunc,
+  deepcopy,
+  sampleRandomElements,
+  transformDict,
+} from "./backend/utils";
+import {
+  extractUIDFromRatingKey,
+  getRatingKeyForResponse,
+} from "./ResponseRatingToolbar";
+import useStore from "./store";
+import StorageCache from "./backend/cache";
+import EvaluationFunctionExecutor from "./backend/evalgen/executor";
+import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
+import { escapeBraces } from "./backend/template";
+import { update } from "lodash";
+// import "./EvalGenModel.css";
+
+const INIT_CRITERIA: EvalCriteria[] = [
+  {
+    shortname: "Grammatical",
+    criteria: "The text is grammatically correct.",
+    eval_method: "expert",
+    uid: uuid(),
+    priority: 0,
+  },
+  {
+    shortname: "Tweet-length",
+    criteria: "The text is less than 144 characters.",
+    eval_method: "code",
+    uid: uuid(),
+    priority: 0,
+  },
+  {
+    shortname: "Bombastic",
+    criteria: "The message will drive views because it's controversial.",
+    eval_method: "expert",
+    uid: uuid(),
+    priority: 0,
+  },
+];
+
+const Contributor = ({
+  getStateValue,
+  style = { size: 22, thickness: 4 },
+}: {
+  getStateValue: (id: number) => number;
+  style: { size: number; thickness: number };
+}) => {
+  return (
+    <RingProgress
+      size={style.size}
+      thickness={style.thickness}
+      // label=""
+      sections={[
+        {
+          value: getStateValue(1),
+          color: "cyan",
+          tooltip: "You have successfully contributed 7 responses.",
+        },
+        {
+          value: getStateValue(2),
+          color: "orange",
+          tooltip: "You have successfully contributed 20 responses.",
+        },
+        {
+          value: getStateValue(3),
+          color: "green",
+          tooltip: "You have gone to buffet 100 times.",
+        },
+        {
+          value: getStateValue(4),
+          color: "grape",
+          tooltip: "You have made 21 nightmare",
+        },
+      ]}
+    />
+  );
+};
+
+const ThumbUpDownButtons = ({
+  grade,
+  onChangeGrade,
+  getGradeCount,
+}: {
+  grade: boolean | undefined;
+  onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (grade: boolean | undefined) => number;
+}) => {
+  // console.log(
+  //   "getGradeCount",
+  //   getGradeCount(true),
+  //   getGradeCount(false),
+  //   getGradeCount(undefined),
+  // );
+  return (
+    <>
+      {/* Thumbs up/down buttons */}
+      <Button
+        color={grade === true ? "green" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => {
+          // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
+          if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
+        }}
+      >
+        <div className="gradeContainer">
+          <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
+          <div className="gradeUpCount">{getGradeCount(true)}</div>
+        </div>
+      </Button>
+      <Button
+        color={grade === false ? "red" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => {
+          // Toggle grade: if on (false), turn 'off' (undefined, for neutral).
+          if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
+        }}
+      >
+        <div className="gradeContainer">
+          <IconThumbDown
+            size="14pt"
+            fill={grade === false ? "pink" : "white"}
+          />
+          <div className="gradeDownCount">{getGradeCount(false)}</div>
+        </div>
+      </Button>
+    </>
+  );
+};
+
+export interface CriteriaCardProps {
+  criterion: EvalCriteria;
+  onChange: (changedCriteria: EvalCriteria) => void;
+  onDelete: () => void;
+  initiallyOpen?: boolean;
+  grade: boolean | undefined;
+  onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (grade: boolean | undefined) => number;
+  getStateValue: (stateId: number) => number;
+}
+
+const CriteriaCard: React.FC<CriteriaCardProps> = ({
+  criterion,
+  onChange,
+  onDelete,
+  initiallyOpen,
+  grade,
+  getGradeCount,
+  onChangeGrade,
+  getStateValue,
+}) => {
+  const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
+  const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
+
+  return (
+    <Card withBorder mb={4} radius="md" style={{ cursor: "default" }}>
+      <Card.Section withBorder pl="8px">
+        <Flex align="center">
+          <Group spacing="0px">
+            {/* The arrow chevron user can click to collapse/expand */}
+            <Button
+              color="gray"
+              p={0}
+              m={0}
+              variant="subtle"
+              mr="4px"
+              onClick={toggle}
+            >
+              {opened ? (
+                <IconChevronDown size="14pt" />
+              ) : (
+                <IconChevronRight size="14pt" />
+              )}
+            </Button>
+
+            {/* Thumbs up/down buttons */}
+            <ThumbUpDownButtons
+              grade={grade}
+              onChangeGrade={onChangeGrade}
+              getGradeCount={getGradeCount}
+            />
+            <Contributor getStateValue={getStateValue} />
+
+            {/* Title of the criteria */}
+            <TextInput
+              value={title}
+              onChange={(e) => setTitle(e.target.value)}
+              onBlur={(e) => {
+                criterion.shortname = e.target.value;
+                if (onChange) onChange(criterion);
+              }}
+              placeholder="Criteria name"
+              variant="unstyled"
+              size="sm"
+              ml="xs"
+              className="nodrag nowheel"
+              styles={{
+                input: {
+                  padding: "0px",
+                  height: "14pt",
+                  minHeight: "0pt",
+                  fontWeight: 500,
+                },
+              }}
+            />
+          </Group>
+
+          <Group spacing="4px" ml="auto">
+            {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+            <Tooltip
+              label={
+                criterion.eval_method === "code"
+                  ? "Change to an LLM evaluator"
+                  : "Change to a code evaluator"
+              }
+              withinPortal
+              withArrow
+            >
+              <Text
+                color="#999"
+                size="sm"
+                mr="6px"
+                onClick={() => {
+                  criterion.eval_method =
+                    criterion.eval_method === "code" ? "expert" : "code";
+                  if (onChange) onChange(criterion);
+                }}
+              >
+                {criterion.eval_method === "code" ? (
+                  <Flex style={{ userSelect: "none" }}>
+                    <IconTerminal2 size="14pt" />
+                    &nbsp;Python
+                  </Flex>
+                ) : (
+                  <Flex style={{ userSelect: "none" }}>
+                    <IconRobot size="14pt" />
+                    &nbsp;LLM
+                  </Flex>
+                )}
+              </Text>
+            </Tooltip>
+
+            {/* Deal-breaker flag toggle (priority > 0 marks a deal-breaker) */}
+            <Tooltip
+              label={
+                criterion.priority <= 0
+                  ? "Make this a deal-breaker"
+                  : "It's a deal-breaker"
+              }
+              withinPortal
+              withArrow
+            >
+              <Button
+                color={criterion.priority <= 0 ? "gray" : "red"}
+                m={0}
+                p={0}
+                variant="subtle"
+                onClick={() => {
+                  criterion.priority = criterion.priority <= 0 ? 1 : 0;
+                  if (onChange) onChange(criterion);
+                }}
+              >
+                {/* <IconStarFilled size="14pt" /> */}
+                <IconFlagFilled size="14pt" />
+              </Button>
+            </Tooltip>
+
+            {/* Delete button (and any other criterion-specific changes in the future) */}
+            <Menu withinPortal position="right-start" shadow="sm">
+              <Menu.Target>
+                <ActionIcon variant="subtle" color="gray">
+                  <IconDots style={{ width: rem(16), height: rem(16) }} />
+                </ActionIcon>
+              </Menu.Target>
+
+              <Menu.Dropdown>
+                <Menu.Item
+                  icon={<IconTrash size="14px" />}
+                  color="red"
+                  onClick={onDelete}
+                >
+                  Delete
+                </Menu.Item>
+              </Menu.Dropdown>
+            </Menu>
+          </Group>
+        </Flex>
+      </Card.Section>
+
+      {/* Description of the criteria */}
+      <Card.Section p="0px">
+        <Collapse in={opened}>
+          <Textarea
+            value={criterion.criteria}
+            placeholder="Describe here."
+            onChange={(e) => {
+              criterion.criteria = e.target.value;
+              if (onChange) onChange(criterion);
+            }}
+            onClickCapture={(e) => e.stopPropagation()}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                margin: "0px",
+                color: "#444",
+                background: "transparent",
+                lineHeight: 1.1,
+              },
+            }}
+            autosize
+            minRows={2}
+            maxRows={5}
+            fz="sm"
+            mb="xs"
+            c="dimmed"
+          />
+        </Collapse>
+      </Card.Section>
+    </Card>
+  );
+};
+
+export interface EvalGenModalRef {
+  trigger: (
+    resps: LLMResponse[],
+    setFinalReports: (reports: EvalGenReport) => void,
+  ) => void;
+}
+
+const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
+  function EvalGenModal(props, ref) {
+    const [opened, { open, close }] = useDisclosure(false);
+    const apiKeys = useStore((state) => state.apiKeys);
+    const globalState = useStore((store) => store.state);
+    const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+    const [criteriaForDisplay, setCriteriaForDisplay] = useState<
+      EvalCriteria[]
+    >([]);
+
+    const [responses, setResponses] = useState<LLMResponse[]>([]);
+    const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+      undefined,
+    );
+    const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+      [],
+    );
+    const [shownResponseIdx, setShownResponseIdx] = useState(0);
+    // const [shownResponseUniqueIdx, setShownResponseUniqueIdx] = useState(0);
+
+    const [annotation, setAnnotation] = useState<string | undefined>(undefined);
+    const [holisticGrade, setHolisticGrade] = useState<
+      "good" | "bad" | undefined
+    >(undefined);
+
+    // Per-criteria grades (indexed by uid of response, then uid of criteria)
+    const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
+    const setPerCriteriaGrade = (
+      responseUID: string,
+      criteriaUID: string,
+      newGrade: boolean | undefined,
+    ) => {
+      setGrades((grades) => {
+        if (!grades[responseUID]) grades[responseUID] = {};
+        grades[responseUID][criteriaUID] = newGrade;
+        // grades[responseUID] = { ...grades[responseUID] };
+        // console.error("grades-2", grades);
+        return { ...grades };
+      });
+      updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
+    };
+    const getStateValue = (stateId: number) => {
+      return Math.floor(Math.random() * 30 + 6);
+    };
+    const getGradeCount = (
+      // responseUID: string,
+      criteriaUID: string,
+      grade: boolean | undefined,
+    ) => {
+      // console.log("getGradeCount", responseUID, criteriaUID, grade);
+      // console.log("getGradeCount", grades);
+
+      let count = 0;
+      for (const respUid in grades) {
+        count += grade === grades[respUid][criteriaUID] ? 1 : 0;
+      }
+      return count;
+
+      // if (grades[responseUID]) {
+      //   let count = 0;
+      //   for (const critUid in grades[responseUID]) {
+      //     count += grades[responseUID][critUid] ? 1 : 0;
+      //   }
+      //   // return grade === grades[responseUID][criteriaUID] ? 1 : 0; // this needs to be changed after the grading feature is fully implemented on server side.
+      //   return count;
+      //   // return 10;
+      // }
+
+      // if (grades[responseUID]) {
+      //   return grade === grades[responseUID][criteriaUID] ? 1 : 0; // this needs to be changed after the grading feature is fully implemented on server side.
+      // }
+      // return 0;
+    };
+
+    // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
+    const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
+      null,
+    );
+
+    const [execProgress, setExecProgress] = useState(0);
+
+    // State variables to keep track of GPT call counts
+    const [numGPT4Calls, setNumGPT4Calls] = useState(0);
+    const [numGPT35Calls, setNumGPT35Calls] = useState(0);
+    const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
+
+    // For updating the global human ratings state
+    const setState = useStore((store) => store.setState);
+    const updateGlobalRating = useCallback(
+      (uid: string, label: string, payload: RatingDict) => {
+        const key = getRatingKeyForResponse(uid, label);
+        const safe_payload = deepcopy(payload);
+        setState(key, safe_payload);
+        StorageCache.store(key, safe_payload);
+      },
+      [setState],
+    );
+
+    // console.error("criteria", criteria);
+
+    // Create the executor once criteria exist, else update its criteria (runs only when criteria change)
+    React.useEffect(() => {
+      if (criteria.length > 0 && !executor) {
+        const existingGrades = transformDict(
+          globalState,
+          (key) => key.startsWith("r.") && key.endsWith(".grade"),
+          extractUIDFromRatingKey,
+          (_, val) => {
+            // The grades are in { idx: grade } format. Take only the first,
+            // as we only take the first response in this iteration of EvalGen:
+            if (typeof val !== "object") return undefined;
+            const gs = Object.values(val);
+            if (gs.length === 0) return undefined;
+            return gs[0];
+          },
+        );
+
+        const addLog = (message: string) => {
+          setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
+        };
+
+        const ex = new EvaluationFunctionExecutor(
+          getLikelyPromptTemplateAsContext(responses),
+          responses,
+          criteria,
+          (gpt4Calls, gpt35Calls) => {
+            // Callback to update GPT call counts
+            setNumGPT4Calls((num) => num + gpt4Calls);
+            setNumGPT35Calls((num) => num + gpt35Calls);
+          },
+          addLog,
+          existingGrades,
+          grades,
+        );
+        setExecutor(ex);
+
+        setExecProgress(0);
+
+        // ex.start((progress) => {
+        //   setExecProgress(progress?.success ?? 0);
+        // });
+      } else if (executor) {
+        // Update criteria in executor
+        executor.addCriteria(criteria);
+      }
+
+      updateCriteriaForDisplay();
+    }, [criteria]);
+
+    const generateCriteria = (resps) => {
+      // Create criteria
+      // setIsLoadingCriteria((num) => num + 3);
+      genCriteriaFromContext(resps)
+        .then((crits) => {
+          console.log("crits #1", crits);
+          crits = [...criteria, ...crits];
+          console.log("crits #2", crits);
+          setCriteria(crits.map((c) => ({ ...c, uid: uuid() })));
+        })
+        .catch((err) => {
+          console.error(err);
+        })
+        .finally(() => {
+          setIsLoadingCriteria((num) => num - 3);
+          setNumGPT4Calls((num) => num + 1);
+        });
+    };
+
+    // const defaultOnFinish = (reports: string) => {};
+    const [onFinish, setOnFinish] = useState({
+      setFinalRpts: (reports: EvalGenReport) => {
+        // console.log("");
+      },
+    });
+
+    // Open the EvalGen wizard
+    const trigger = (
+      resps: LLMResponse[],
+      setFinalReports: (reports: EvalGenReport) => void,
+    ) => {
+      // We pass the responses here manually to ensure they remain the same
+      // for the duration of one EvalGen operation.
+      setResponses(resps);
+      gotoNextScreen("response");
+      // setFinalReports("A plenty response");
+      setOnFinish({
+        setFinalRpts: (reports: EvalGenReport) => {
+          close();
+          setFinalReports(reports);
+        },
+      });
+
+      const firstGrades = resps.reduce(
+        (acc: Dict<Dict<boolean | undefined>>, curr) => {
+          if (!(curr.uid in acc)) acc[curr.uid] = {};
+          return acc;
+        },
+        grades,
+      );
+      setGrades(firstGrades);
+
+      console.log("*****************************resps", resps);
+      if (criteria && criteria.length === 0) {
+        generateCriteria(resps);
+      }
+
+      setShownResponseIdx(0);
+      if (resps.length > 0) {
+        const first_resp = sampleRandomElements(resps, 1)[0];
+        // setShownResponse(first_resp);
+        setPastShownResponses([first_resp]);
+      } else {
+        // setShownResponse(undefined);
+        setPastShownResponses([]);
+      }
+      setShownResponse(resps[shownResponseIdx]);
+      open();
+    };
+    useImperativeHandle(ref, () => ({
+      trigger,
+    }));
+
+    // Add a criterion
+    const handleAddCriteria = (newCrit: EvalCriteria) => {
+      setCriteria((cs) => {
+        if (!newCrit.uid) newCrit.uid = uuid();
+        return [...cs, newCrit];
+      });
+    };
+
+    const getLikelyPromptTemplateAsContext = (resps) => {
+      // Attempt to infer the prompt template used to generate the responses:
+      const prompts = new Set();
+      for (const resp_obj of resps) {
+        if (resp_obj?.metavars?.__pt !== undefined) {
+          prompts.add(resp_obj.metavars.__pt);
+        }
+      }
+
+      if (prompts.size === 0) return null;
+
+      // Use the first prompt template found (in insertion order) as context.
+      return escapeBraces(prompts.values().next().value);
+    };
+
+    async function genCriteriaFromContext(responses) {
+      // Get the context from the input responses
+      const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
+
+      if (inputPromptTemplate === null) {
+        console.error("No context found. Cannot proceed.");
+        return;
+      }
+
+      // Attempt to generate criteria using an LLM
+      return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
+    }
+
+    // Modify an existing criterion
+    const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
+      setCriteria((cs) => {
+        const idx = cs.findIndex((c) => c.uid === uid);
+        if (idx === -1) {
+          console.error("Could not find criteria with uid", uid);
+          return cs;
+        }
+        cs[idx] = newCrit;
+        return [...cs];
+      });
+    };
+
+    // Delete a criterion
+    const handleDeleteCriteria = (uid: string) => {
+      setCriteria((cs) => {
+        return cs.filter((c) => c.uid !== uid);
+      });
+    };
+
+    // Synthesize a new criteria according to the feedback given for the shown response
+    const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+    const synthNewCriteriaWithLLM = (
+      response: string,
+      feedback: string,
+      grade: "good" | "bad" | "unknown",
+    ) => {
+      // Add a loading Skeleton
+      setIsLoadingCriteria((num) => num + 1);
+      // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
+      const prettyCriteria = criteria
+        .map((crit) => {
+          return `${crit.shortname}: ${crit.criteria}`;
+        })
+        .join("\n");
+
+      generateLLMEvaluationCriteria(
+        "",
+        apiKeys,
+        `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below. 
+
+TEXT OUTPUT: 
+\`\`\`
+${response}
+\`\`\`
+
+EXISTING CRITERIA:
+\`\`\`
+${prettyCriteria}
+\`\`\`
+
+GRADE (whether text was good or bad):
+\`\`\`
+${grade}
+\`\`\`
+
+FEEDBACK: 
+\`\`\`
+${feedback}
+\`\`\`
+
+If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
+        "gpt-4o", // llm
+      )
+        .then((evalCrits) => {
+          // Take only the first if evalCrits has a nonempty list
+          if (evalCrits[0]) {
+            setCriteria((crit) =>
+              crit.concat([
+                {
+                  ...evalCrits[0],
+                  uid: uuid(),
+                },
+              ]),
+            );
+          }
+          // Remove a loading Skeleton
+          setIsLoadingCriteria((num) => num - 1);
+
+          setNumGPT4Calls((num) => num + 1);
+        })
+        .catch((err) => {
+          console.error(err);
+          setIsLoadingCriteria((num) => num - 1);
+        });
+    };
+
+    // Goto next response in the queue (skipping grading the current one)
+    const nextResponse = () => {
+      if (responses.length === 0) return;
+
+      // Update annotation for current response (if any)
+      // TODO: Fix this for generate case when num resp per prompt > 1
+
+      if (
+        grades[shownResponse.uid] ||
+        holisticGrade ||
+        (annotation && annotation.trim())
+      ) {
+        executor?.setGradeForExample(
+          shownResponse.uid,
+          grades[shownResponse.uid],
+          holisticGrade,
+          annotation ? annotation.trim() : null,
+        );
+      }
+
+      if (
+        shownResponse &&
+        annotation &&
+        typeof annotation === "string" &&
+        annotation.trim().length > 0
+      ) {
+        console.log(
+          "setting annotation for resp",
+          shownResponse.uid,
+          annotation,
+        );
+        updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
+        setAnnotation("");
+      }
+
+      if (shownResponse && holisticGrade) {
+        updateGlobalRating(shownResponse.uid, "grade", {
+          0: holisticGrade === "good",
+        });
+      }
+
+      if (shownResponse && grades[shownResponse.uid]) {
+        updateGlobalRating(
+          shownResponse.uid,
+          "perCriteriaGrades",
+          grades[shownResponse.uid],
+        );
+      }
+
+      // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
+      setHolisticGrade(null);
+
+      if (shownResponseIdx < pastShownResponses.length - 1) {
+        // If we are not at the end of the history of shown responses, then show the next response:
+        setShownResponse(pastShownResponses[shownResponseIdx + 1]);
+        setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
+      } else {
+        // We are at the end of the history; pick the next response off the stack:
+        // TODO: Make this unique (maybe by removing picked responses from the list!)
+        let num_tries = 3;
+        let next_resp = executor?.getNextExampleToGrade();
+        while (
+          num_tries > 0 &&
+          (!next_resp ||
+            pastShownResponses.some((r) => r.uid === next_resp?.uid))
+        ) {
+          // We're presenting a response that's already been shown. Try again.
+          // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
+          if (next_resp && num_tries === 3)
+            next_resp =
+              executor?.getNextExampleToGrade() ??
+              sampleRandomElements(responses, 1)[0];
+          // Otherwise we just choose a response at random:
+          else next_resp = sampleRandomElements(responses, 1)[0];
+          num_tries -= 1;
+        }
+        // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
+        // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
+        setShownResponse(next_resp ?? undefined);
+        if (next_resp)
+          setPastShownResponses(pastShownResponses.concat(next_resp));
+        setShownResponseIdx(pastShownResponses.length);
+      }
+      updateShownResponseUniqueIndex();
+    };
+
+    // Go back to previously shown response
+    const prevResponse = () => {
+      if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
+      setShownResponse(pastShownResponses[shownResponseIdx - 1]);
+      setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
+      updateShownResponseUniqueIndex();
+    };
+
+    const updateShownResponseUniqueIndex = () => {
+      let idx = 0;
+      for (const resp of responses) {
+        if (resp === shownResponse) {
+          setShownResponseUniqueIdx(idx);
+          break;
+        }
+        idx++;
+      }
+    };
+
+    const nextResponse2 = () => {
+      if (responses.length === 0) return;
+      if (shownResponseIdx < responses.length - 1) {
+        // setShownResponse(responses[shownResponseIdx + 1]);
+        setShownResponseIdx(shownResponseIdx + 1);
+      }
+    };
+
+    const prevResponse2 = () => {
+      if (shownResponseIdx > 0) {
+        // setShownResponse(responses[shownResponseIdx - 1]);
+        setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
+      }
+    };
+
+    React.useEffect(() => {
+      setShownResponse(responses[shownResponseIdx]);
+    }, [shownResponseIdx]);
+
+    const estimateGPTCalls = () => {
+      return executor
+        ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
+        : "# estimated GPT calls not available.";
+    };
+
+    const updateCriteriaForDisplay = () => {
+      const highCriteria = criteria.filter((c) => c.priority === 1);
+      const lowCriteria = criteria.filter((c) => c.priority === 0);
+      setCriteriaForDisplay(highCriteria.concat(lowCriteria));
+    };
+    useEffect(() => {
+      const highCriteria = criteria.filter((c) => c.priority === 1);
+      const lowCriteria = criteria.filter((c) => c.priority === 0);
+      setCriteriaForDisplay(highCriteria.concat(lowCriteria));
+    }, [criteria]);
+
+    const [screen, setScreen] = useState("");
+    const gotoNextScreen = (screenName: string) => {
+      setScreen(screenName);
+    };
+
+    // const [onFinish, setOnFinish] = useState(null);
+
+    return (
+      <Modal
+        size="90%"
+        keepMounted
+        opened={opened}
+        onClose={close}
+        closeOnClickOutside={true}
+        style={{ position: "relative", left: "-5%" }}
+      >
+        {screen === "response" && (
+          <Grid h={window?.innerHeight * 0.8}>
+            <Grid.Col span={8}>
+              <Stack justify="space-between">
+                {/* View showing the response the user is currently grading */}
+                <GradingView
+                  shownResponse={shownResponse}
+                  shownResponseIdx={shownResponseIdx}
+                  // shownResponseIdx={shownResponseUniqueIdx}
+                  responseCount={responses.length}
+                  numGPT4Calls={numGPT4Calls}
+                  numGPT35Calls={numGPT35Calls}
+                  logs={logs}
+                  gotoNextResponse={nextResponse2}
+                  gotoPrevResponse={prevResponse2}
+                  estimateGPTCalls={estimateGPTCalls}
+                  gotoNextScreen={gotoNextScreen}
+                />
+
+                {/* Progress bar */}
+                {/* <Flex justify="left" align="center" gap="md">
+                <Stack w="100%" spacing={4}>
+                  <Text color="#aaa" size="sm">
+                    {bottomBar.progressLabel}
+                  </Text>
+                  <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
+                </Stack>
+
+                <Button
+                  onClick={handleDone}
+                  variant={bottomBar.buttonStyle}
+                  disabled={bottomBar.buttonDisabled}
+                >
+                  {bottomBar.buttonLabel}
+                </Button>
+              </Flex> */}
+              </Stack>
+            </Grid.Col>
+            <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
+              <div
+                style={{
+                  display: "flex",
+                  flexDirection: "column",
+                  height: "100%",
+                }}
+              >
+                <div style={{ flex: 2, overflowY: "auto" }}>
+                  {criteriaForDisplay.map((e) => (
+                    <CriteriaCard
+                      criterion={e}
+                      key={e.uid}
+                      onChange={(newCrit) =>
+                        handleChangeCriteria(newCrit, e.uid)
+                      }
+                      onDelete={() => handleDeleteCriteria(e.uid)}
+                      grade={
+                        shownResponse
+                          ? grades[shownResponse.uid][e.uid]
+                          : undefined
+                      }
+                      getGradeCount={(grade) => {
+                        return shownResponse
+                          ? getGradeCount(
+                              // shownResponse.uid,
+                              e.uid,
+                              grade,
+                            )
+                          : 0;
+                      }}
+                      onChangeGrade={(newGrade) => {
+                        if (shownResponse)
+                          setPerCriteriaGrade(
+                            shownResponse.uid,
+                            e.uid,
+                            newGrade,
+                          );
+                      }}
+                      initiallyOpen={true}
+                      getStateValue={(stateId) => getStateValue(stateId)}
+                    />
+                  ))}
+                  {isLoadingCriteria > 0 ? (
+                    Array.from(
+                      { length: isLoadingCriteria },
+                      (v: unknown, idx: number) => (
+                        <Skeleton key={idx} h={80} mb={4} />
+                      ),
+                    )
+                  ) : (
+                    <></>
+                  )}
+                  {/* <Center> */}
+                  <div className="criteriaButtons">
+                    {/* <button
+                    onClick={() => {
+                      handleAddCriteria({
+                        shortname: "New Criteria",
+                        criteria: "",
+                        eval_method: "code",
+                        priority: 0,
+                        uid: uuid(),
+                      });
+                    }}
+                  >
+                    +
+                  </button> */}
+                    <Button
+                      leftIcon={<IconPencil size={14} />}
+                      variant="filled"
+                      // gradient={{ from: "blue", to: "green", deg: 90 }}
+                      onClick={() => {
+                        handleAddCriteria({
+                          shortname: "New Criteria",
+                          criteria: "",
+                          eval_method: "code",
+                          priority: 0,
+                          uid: uuid(),
+                        });
+                      }}
+                    >
+                      New Criteria
+                    </Button>
+                    {/* </Center>
+                <Center> */}
+                    <Button
+                      leftIcon={<IconSparkles size={14} />}
+                      variant="filled"
+                      // gradient={{ from: "blue", to: "green", deg: 90 }}
+                      onClick={() => {
+                        generateCriteria(responses);
+                      }}
+                    >
+                      Suggest Criteria
+                    </Button>
+                    {/* </Center> */}
+                  </div>
+                </div>
+
+                <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+                  <Divider mt="lg" />
+                  <Title mb="0px" order={4}>
+                    Suggest New Criteria Based on the Feedback
+                  </Title>
+                  <Textarea
+                    value={annotation}
+                    onChange={(e) => setAnnotation(e.target.value)}
+                    description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
+                    mb="sm"
+                  />
+                  <Radio.Group
+                    name="favoriteFramework"
+                    label="Rate the response holistically:"
+                    value={holisticGrade}
+                    onChange={(v) => setHolisticGrade(v as "good" | "bad")}
+                    withAsterisk
+                    mb="md"
+                  >
+                    <Group mt="xs">
+                      <Radio value="good" label="Good" />
+                      <Radio value="bad" label="Bad" />
+                      <span>
+                        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                      </span>
+                      <Button
+                        color="green"
+                        variant="filled"
+                        disabled={
+                          !holisticGrade ||
+                          annotation === undefined ||
+                          annotation.length === 0
+                        }
+                        onClick={() => {
+                          synthNewCriteriaWithLLM(
+                            shownResponse?.responses[0].toString() ?? "",
+                            annotation ?? "",
+                            holisticGrade ?? "unknown",
+                          );
+
+                          nextResponse();
+                        }}
+                      >
+                        + Submit Feedback
+                      </Button>
+                    </Group>
+                  </Radio.Group>
+                </Stack>
+              </div>
+            </Grid.Col>
+          </Grid>
+        )}
+        {screen === "report" && (
+          <Grid>
+            <ReportCardView
+              report={{
+                criteria: criteria,
+                failureCoverage: 99.2,
+                falseFailureRate: 66.7,
+              }}
+              onFinish={(reports: EvalGenReport) => {
+                onFinish.setFinalRpts(reports);
+              }}
+              getGradeCount={(crit: EvalCriteria, grade: boolean) => {
+                return shownResponse
+                  ? getGradeCount(
+                      // shownResponse.uid,
+                      crit.uid,
+                      grade,
+                    )
+                  : 0;
+              }}
+              getStateValue={(stateId) => getStateValue(stateId)}
+            />
+          </Grid>
+        )}
+      </Modal>
+    );
+  },
+);
+
+// Heading text for a screen: renders its children inside an extra-large,
+// medium-weight <Text> with left padding and a bottom margin.
+const HeaderText = ({ children }: { children: ReactNode }) => (
+  <Text size="xl" fw={500} pl="sm" mb="lg">
+    {children}
+  </Text>
+);
+
+interface GradingViewProps { // props accepted by the GradingView component
+  shownResponse: LLMResponse | undefined; // response being graded; undefined when none is shown
+  shownResponseIdx: number; // zero-based; the header displays it as idx + 1
+  responseCount: number; // total shown in the "of N" part of the header
+  numGPT4Calls: number; // tally shown in the "Executed ... GPT-4o calls" line
+  numGPT35Calls: number; // tally shown for GPT-3.5-Turbo-16k calls
+  logs: { date: Date; message: string }[]; // entries listed in the "LLM Activity" panel
+  gotoPrevResponse: () => void; // click handler of the left chevron button
+  gotoNextResponse: () => void; // click handler of the right chevron button
+  estimateGPTCalls: () => string; // text for the tooltip on the right chevron
+  gotoNextScreen: (screenName: string) => void; // called with "report" by the done button
+}
+
+/**
+ * Grading screen of the EvalGen modal. Renders, top to bottom: a header with
+ * the response's position ("#i of N"), the response text between previous/
+ * next chevron buttons, the vars and prompt that produced the response, an
+ * "LLM Activity" log with GPT call tallies, and a button that requests the
+ * "report" screen.
+ *
+ * The component holds no state of its own; navigation, the tooltip text and
+ * the screen switch are all delegated to the callbacks received via props.
+ *
+ * @param shownResponse Response to display; an empty view when undefined.
+ * @param shownResponseIdx Zero-based index, displayed as idx + 1.
+ * @param estimateGPTCalls Called on every render for the tooltip label.
+ */
+const GradingView: React.FC<GradingViewProps> = ({
+  shownResponse,
+  shownResponseIdx,
+  responseCount,
+  numGPT4Calls,
+  numGPT35Calls,
+  logs,
+  gotoPrevResponse,
+  gotoNextResponse,
+  estimateGPTCalls,
+  gotoNextScreen,
+}) => {
+  // Text of the response under review. Only the first entry of `responses` is
+  // shown; recomputed only when shownResponse changes.
+  const responseText = useMemo(
+    () =>
+      shownResponse && shownResponse.responses?.length > 0
+        ? shownResponse.responses[0].toString()
+        : "",
+    [shownResponse],
+  );
+
+  // The response's prompt ("" when no response is shown)
+  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
+  // One "name = value" row per entry of the response's vars merged with its
+  // metavars (the latter passed through cleanMetavarsFilterFunc first).
+  const varsDivs = useMemo(() => {
+    const combined_vars_metavars = shownResponse
+      ? {
+          ...shownResponse.vars,
+          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
+        }
+      : {};
+
+    // Keys are unique within the merged object, so varname is a safe React key.
+    return Object.entries(combined_vars_metavars).map(([varname, val]) => (
+      <div key={varname} className="grade-resp-var-container">
+        <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
+        <span className="response-var-value linebreaks">{val}</span>
+      </div>
+    ));
+  }, [shownResponse]);
+
+  return (
+    <Stack justify="space-between" mih={500}>
+      <Box>
+        {/* Top header */}
+        <Flex justify="center">
+          <HeaderText>
+            {/* What do you think of this response? */}
+            What do you think of response #{shownResponseIdx + 1} of{" "}
+            {responseCount}?
+          </HeaderText>
+        </Flex>
+        {/* Middle response box with chevron buttons < and > for going back and forward a response */}
+        <Flex justify="center" align="center" mb="sm">
+          {/* Go back to previous response */}
+          <Button variant="white" color="dark" onClick={gotoPrevResponse}>
+            <IconChevronLeft />
+          </Button>
+
+          {/* The response one is currently grading */}
+          <div
+            className="response-box"
+            style={{
+              backgroundColor: "#eee",
+              width: "80%",
+              maxHeight: "340px",
+              overflowY: "scroll",
+              borderColor: "black",
+              borderStyle: "solid",
+            }}
+          >
+            <div className="response-item-llm-name-wrapper">
+              <div
+                className="small-response"
+                style={{ fontSize: "11pt", padding: "12pt" }}
+              >
+                {responseText}
+              </div>
+            </div>
+          </div>
+
+          {/* Go forward to the next response */}
+          <Tooltip label={estimateGPTCalls()} withArrow>
+            <Button variant="white" color="dark" onClick={gotoNextResponse}>
+              <IconChevronRight />
+            </Button>
+          </Tooltip>
+        </Flex>
+        {/* Views for the vars (inputs) that generated this response, and the concrete prompt */}
+        <Flex justify="center" mb="xl" gap="lg">
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "31%",
+              borderRadius: "12px",
+              borderWidth: "1px",
+              borderStyle: "solid",
+            }}
+          >
+            Vars
+            <hr />
+            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
+              {varsDivs}
+            </div>
+          </div>
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "41%",
+              borderRadius: "2px",
+            }}
+          >
+            Prompt
+            <hr />
+            <div
+              className="monofont linebreaks"
+              style={{
+                maxHeight: "160px",
+                overflowY: "scroll",
+                fontSize: "10pt",
+                lineHeight: "1.2",
+              }}
+            >
+              {prompt}
+            </div>
+          </div>
+        </Flex>
+        <Flex direction="column">
+          <Flex justify="space-between" align="center">
+            <Text size="lg" weight={500} mb="sm">
+              LLM Activity
+            </Text>
+            {/* GPT Call Tally */}
+            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
+              GPT-3.5-Turbo-16k calls.
+            </Text>
+          </Flex>
+          <div
+            style={{
+              backgroundColor: "#f0f0f0",
+              color: "#333",
+              fontFamily: "monospace",
+              padding: "12px",
+              width: "calc(100% - 30px)",
+              height: "200px",
+              overflowY: "auto",
+              borderRadius: "8px",
+              border: "1px solid #ddd",
+              marginRight: "20px", // Space on the right
+            }}
+            ref={(el) => {
+              // Pin the scroll position to the bottom each time this callback
+              // receives the mounted element.
+              if (el) {
+                el.scrollTop = el.scrollHeight;
+              }
+            }}
+          >
+            {logs.map((log, index) => (
+              <div key={index}>
+                <span style={{ color: "#4A90E2" }}>
+                  {log.date.toLocaleString()} -{" "}
+                </span>
+                <span>{log.message}</span>
+              </div>
+            ))}
+          </div>
+        </Flex>
+      </Box>
+      <div>
+        <Center>
+          <Button
+            leftIcon={<IconSparkles size={14} />}
+            variant="gradient"
+            gradient={{ from: "blue", to: "green", deg: 45 }}
+            onClick={() => {
+              // Request the "report" screen via the callback prop
+              gotoNextScreen("report");
+            }}
+          >
+            I&apos;m done. Access EvalGen Report!
+          </Button>
+        </Center>
+      </div>
+    </Stack>
+  );
+};
+
+interface ReportCardViewProps { // props accepted by the ReportCardView component
+  report: EvalGenReport; // report to display; also seeds the view's selection state
+  // recomputeAlignment,
+  onFinish: (reports: EvalGenReport) => void; // receives the final report when the finish button is clicked
+  getGradeCount: (crit: EvalCriteria, grade: boolean) => number; // forwarded to each ReportCriteriaCard
+  getStateValue: (stateId: number) => number; // forwarded to each ReportCriteriaCard
+}
+
+// const ReportCardScreen = () => {
+/**
+ * Report screen of the EvalGen modal. Shows the report's coverage and
+ * false-failure percentages plus one ReportCriteriaCard per criterion, and
+ * lets the user (de)select criteria before finishing.
+ *
+ * @param report Report to display. Also the initial value of the final
+ *   report, so every criterion starts out selected.
+ * @param onFinish Called with the final report (selected criteria only) when
+ *   the finish button is clicked.
+ * @param getGradeCount Forwarded unchanged to every criteria card.
+ * @param getStateValue Forwarded unchanged to every criteria card.
+ */
+const ReportCardView: React.FC<ReportCardViewProps> = ({
+  report,
+  // recomputeAlignment,
+  onFinish,
+  getGradeCount,
+  getStateValue,
+}) => {
+  // Working copy of the report that is handed to onFinish; its `criteria`
+  // list tracks which cards are currently checked.
+  const [finalReport, setFinalReport] = useState(report);
+
+  // Add (isSelected) or remove a criterion from the final report, matching by
+  // uid. Always builds a new report object through a functional update:
+  // mutating `finalReport` in place would also mutate the `report` prop it was
+  // seeded from, and handing the same reference back to the setter would not
+  // schedule a re-render.
+  const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
+    setFinalReport((prev) => {
+      // Drop any existing entry first so re-selecting never duplicates it
+      const others = prev.criteria.filter((c) => c.uid !== criterion.uid);
+      return {
+        ...prev,
+        criteria: isSelected ? others.concat(criterion) : others,
+      };
+    });
+  };
+
+  // One card per criterion of the incoming report. All cards start out
+  // selected, matching the initial value of finalReport. onSelect may be a
+  // stale closure here; that is safe because it only uses a functional update.
+  const cards = useMemo(
+    () =>
+      report.criteria.map((crit) => (
+        <ReportCriteriaCard
+          criterion={crit}
+          key={crit.uid}
+          getGradeCount={getGradeCount}
+          getStateValue={getStateValue}
+          onSelect={onSelect}
+        />
+      )),
+    [report],
+  );
+
+  return (
+    report && (
+      <div>
+        <Text align="center" size="lg" pl="sm" mb="lg">
+          Chosen Functions and Alignment
+        </Text>
+
+        {/* Show coverage and false failure rate numbers */}
+        <Flex justify="center" gap="md" mb="lg">
+          <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
+            <Card
+              shadow="sm"
+              padding="md"
+              radius="md"
+              style={{ backgroundColor: "#f0f0f0" }}
+            >
+              <Text weight={500} size="md">
+                Coverage of Bad Responses
+              </Text>
+              <Text color="blue" weight={700} size="md">
+                {report.failureCoverage.toFixed(2)}%
+              </Text>
+            </Card>
+            <Card
+              shadow="sm"
+              padding="md"
+              radius="md"
+              style={{ backgroundColor: "#f0f0f0" }}
+            >
+              <Text weight={500} size="md">
+                False Failure Rate
+              </Text>
+              <Text color="red" weight={700} size="md">
+                {report.falseFailureRate.toFixed(2)}%
+              </Text>
+            </Card>
+          </Group>
+        </Flex>
+
+        <ScrollArea mih={300} h={500} mah={500}>
+          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+            {cards}
+          </SimpleGrid>
+        </ScrollArea>
+
+        <Flex justify="center" gap={12} mt="xs">
+          <Button
+            onClick={() => {
+              // Hand the selected criteria back to the caller
+              onFinish(finalReport);
+            }}
+          >
+            Finish with selected evaluators
+          </Button>
+        </Flex>
+      </div>
+    )
+  );
+};
+
+interface ReportCriteriaCardProps { // props accepted by the ReportCriteriaCard component
+  criterion: EvalCriteria; // criterion whose name, description and eval method the card shows
+  // onChange: (changedCriteria: EvalCriteria) => void;
+  // onDelete: () => void;
+  // initiallyOpen?: boolean;
+  // grade: boolean | undefined;
+  // onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (crit: EvalCriteria, grade: boolean) => number; // currently unused by the card (its only use is commented out)
+  getStateValue: (stateId: number) => number; // passed through to the Contributor widget
+  onSelect: (criterion: EvalCriteria, isChecked: boolean) => void; // called with the new checked state when the checkbox toggles
+}
+
+/**
+ * Read-only card for one criterion on the report screen: a checkbox to
+ * include/exclude it, its name, description and eval method, an alignment
+ * indicator, and a collapsible list of (placeholder) rejected implementations.
+ */
+const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
+  criterion,
+  // onChange,
+  // onDelete,
+  // initiallyOpen,
+  // grade,
+  getGradeCount,
+  // onChangeGrade,
+  getStateValue,
+  onSelect,
+}) => {
+  // Whether this criterion is selected for the final report (default: yes)
+  const [checked, setChecked] = useState(true);
+
+  // Simulates eval functions that are expected to be passed in later on (TODO)
+  const evalFuncs = [
+    { evalFunction: { code: "To be provided (1) ..." } },
+    { evalFunction: { code: "To be provided (2) ..." } },
+    { evalFunction: { code: "To be provided (3) ..." } },
+  ];
+  // Index keys are fine here since the list is fixed; a fresh uuid() per render
+  // would make React remount every row on each render.
+  const unselectedImplementations = evalFuncs.map((item, idx) => (
+    <div key={idx}>
+      <Code style={{ whiteSpace: "pre-wrap" }}>{item.evalFunction.code}</Code>
+      <Divider />
+    </div>
+  ));
+
+  return (
+    <Card
+      shadow="sm"
+      padding="sm"
+      pl="md"
+      pb="xl"
+      radius="md"
+      withBorder
+      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
+    >
+      <div
+        // onClick={() => setChecked(!checked)}
+        onKeyUp={(e) => e.preventDefault()}
+        className="checkcard"
+      >
+        {/* <Card.Section withBorder pl="8px">
+          <Flex align="center">
+            <Group spacing="0px"> */}
+        {/* The arrow chevron user can click to collapse/expand */}
+        {/* <Button
+                color="gray"
+                p={0}
+                m={0}
+                variant="subtle"
+                mr="4px"
+                onClick={toggle}
+              >
+                {opened ? (
+                  <IconChevronDown size="14pt" />
+                ) : (
+                  <IconChevronRight size="14pt" />
+                )}
+              </Button> */}
+
+        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
+          <Checkbox
+            checked={checked}
+            onChange={() => {
+              setChecked(!checked);
+              if (onSelect) onSelect(criterion, !checked);
+            }}
+            tabIndex={-1}
+            size="xs"
+            mr="sm"
+            mt="xs"
+            styles={{ input: { cursor: "pointer" } }}
+            aria-hidden
+          />
+        </Tooltip>
+
+        {/* Thumbs up/down buttons - disable for now */}
+        {/* <ReadOnlyThumbUpDownButtons
+                  upCount={getGradeCount(criterion, true)}
+                  downCount={getGradeCount(criterion, false)}
+                /> */}
+
+        <div style={{ width: "100%" }}>
+          {/* Title of the criteria */}
+          <TextInput
+            value={criterion.shortname}
+            readOnly
+            variant="unstyled"
+            size="sm"
+            ml="xs"
+            className="nodrag nowheel"
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                padding: "0px",
+                background: "transparent",
+                fontWeight: 500,
+                fontSize: "12pt",
+                margin: "0px",
+                height: "auto",
+                minHeight: "auto",
+              },
+            }}
+          />
+          {/* </Group> */}
+
+          {/* <Group spacing="4px" ml="auto"> */}
+
+          {/* <Button
+                  color={criterion.priority <= 0 ? "gray" : "red"}
+                  m={0}
+                  p={0}
+                  variant="subtle"
+                >
+                  <IconFlagFilled size="14pt" />
+                </Button> */}
+          {/* </Group>
+            </Flex>
+          </Card.Section> */}
+
+          {/* Description of the criteria */}
+          {/* <Card.Section p="0px"> */}
+          {/* <Collapse in={opened}> */}
+          <Textarea
+            value={criterion.criteria}
+            readOnly
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                paddingTop: "0px !important",
+                paddingLeft: "0px",
+                margin: "0px",
+                color: "#444",
+                background: "transparent",
+                lineHeight: 1.1,
+              },
+            }}
+            autosize
+            minRows={2}
+            maxRows={5}
+            fz="sm"
+            mb="xs"
+            c="dimmed"
+          />
+
+          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+          <Text color="#999" size="sm" mr="6px">
+            {criterion.eval_method === "code" ? (
+              <Flex style={{ userSelect: "none" }}>
+                <IconTerminal2 size="14pt" />
+                &nbsp;Python
+              </Flex>
+            ) : (
+              <Flex style={{ userSelect: "none" }}>
+                <IconRobot size="14pt" />
+                &nbsp;LLM
+              </Flex>
+            )}
+          </Text>
+        </div>
+        <Stack spacing={0}>
+          <Contributor
+            getStateValue={getStateValue}
+            style={{ size: 90, thickness: 12 }}
+          />
+          <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
+            Alignment with your grades
+          </Text>
+        </Stack>
+      </div>
+      {/* </Collapse> */}
+      {/* </Card.Section> */}
+      <div>
+        <Accordion>
+          <Accordion.Item
+            key={"Show Bad Implementations"}
+            value={"Show Bad Implementations"}
+          >
+            <Accordion.Control>
+              <Text size="sm"> Show Bad Implementations </Text>
+            </Accordion.Control>
+            <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
+          </Accordion.Item>
+        </Accordion>
+      </div>
+    </Card>
+  );
+};
+
+/**
+ * Pair of thumbs up/down buttons that only display the given tallies
+ * (`upCount`, `downCount`); neither button has a click handler.
+ */
+const ReadOnlyThumbUpDownButtons = ({
+  upCount,
+  downCount,
+}: {
+  upCount: number;
+  downCount: number;
+}) => (
+  <>
+    {/* Thumbs up, with its tally */}
+    <Button color="green" m={0} p={0} variant="subtle">
+      <div className="gradeContainer">
+        <IconThumbUp size="14pt" fill="#aea" />
+        <div className="gradeUpCount">{upCount}</div>
+      </div>
+    </Button>
+    {/* Thumbs down, with its tally */}
+    <Button color="red" m={0} p={0} variant="subtle">
+      <div className="gradeContainer">
+        <IconThumbDown size="14pt" fill="pink" />
+        <div className="gradeDownCount">{downCount}</div>
+      </div>
+    </Button>
+  </>
+);
+
+// export default { EvalGenModal, ReportCardScreen };
+export default EvalGenModal;
diff --git a/chainforge/react-server/src/backend/evalgen/README.md b/chainforge/react-server/src/backend/evalgen/README.md
new file mode 100644
index 000000000..e4cda83b9
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/README.md
@@ -0,0 +1,27 @@
+# `evalgen`: Grading and Evaluation Function Selection Module
+
+This module takes a developer's prompt and a set of examples (variables, prompts, responses), suggests evaluation criteria (to be confirmed or extended by the developer), generates and evaluates multiple functions per criterion on each of the examples, and returns the best function per criterion (the one most aligned with the developer's grades).
+
+## Execution
+
+An interactive script for trying out the functionality lives in `test.ts`. Run it with `ts-node test.ts` from the `grading` directory of the project. The terminal can be a bit laggy at times.
+
+## Architecture
+
+The module is divided into the following components: `executor`, `utils`, `oai_utils`.
+
+### Utils
+
+This module contains types and prompts for criteria generation, function generation, and function execution.
+
+### OAI Utils
+
+This module contains utilities for interacting with the Azure OpenAI API and streaming partial results (e.g., each evaluation criteria as it is generated).
+
+### Executor
+
+This module contains the main logic for the module. It takes a developer's prompt and set of examples, as well as a list of evaluation criteria (which can be generated by the utils module). It has a background process to generate and evaluate functions for each criteria, updating each example's grading priority as function results stream in. There is a method to query the next example to grade, and another method to set the grade for an example. The module also has a method to query the best function per criteria (most aligned with the developer's grades).
+
+# Credits
+
+The `evalgen` module was created by Shreya Shankar. It was adapted to the ChainForge codebase by Ian Arawjo.
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
new file mode 100644
index 000000000..09d9e0d8e
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -0,0 +1,1088 @@
+import {
+  execPyFunc,
+  executeLLMEval,
+  generateFunctionsForCriteria,
+} from "./utils";
+import {
+  EvalCriteria,
+  EvalFunction,
+  EvalFunctionResult,
+  EvalFunctionReport,
+  EvalFunctionSetReport,
+  EvalCriteriaUID,
+} from "./typing";
+import { LLMResponse, ResponseUID, QueryProgress, Dict } from "../typing";
+import { EventEmitter } from "events";
+
+/**
+ * The EvaluationFunctionExecutor class is designed to asynchronously
+ * evaluate a set of examples against specified evaluation criteria using
+ * generated evaluation functions and to prioritize grading based on the
+ * results.
+ *
+ * Usage:
+ *
+ * 1. Initialization:
+ *    Create an instance of the EvaluationFunctionExecutor by providing the
+ *    evaluation criteria, a prompt template for the developer's LLM chain,
+ *    and a set of examples to be evaluated.
+ *
+ *    const executor = new EvaluationFunctionExecutor(
+ *      promptTemplate, examples, evalCriteria, updateGPTCalls, addLog);
+ *
+ *    // Optionally, you can call setEvalCriteria to set the evaluation criteria
+ *    // after the executor has been initialized.
+ *    executor.setEvalCriteria(evalCriteria);
+ *
+ * 2. Start Background Computation:
+ *    Call the `start` method to begin generating and executing evaluation
+ *    functions in the background. This method returns immediately,
+ *    allowing your application to perform other tasks concurrently.
+ *
+ *    executor.start();
+ *
+ * 3. Continue with Other Computations and Interactive Grading:
+ *    You can proceed with other tasks (i.e., grading) immediately after
+ *    starting the background computation. Use `getNextExampleToGrade`
+ *    to determine which example to grade next and `setGradeForExample`
+ *    to assign grades to specific examples. This interactive grading will
+ *    help in filtering out incorrect evaluation functions.
+ *
+ *    // Example of interactive grading loop
+ *    let nextExample = executor.getNextExampleToGrade();
+ *    while (nextExample !== null) {
+ *      const perCriteriaGrades = ...; // Per-criteria grades for the example
+ *      const grade = ...; // Holistic grade ("good" or "bad"), e.g.,
+ *                          // through user input
+ *      executor.setGradeForExample(nextExample.uid, perCriteriaGrades, grade);
+ *      nextExample = executor.getNextExampleToGrade();
+ *    }
+ *
+ * 4. (Optional) Querying Results:
+ *    At any time, you can query the current grading priorities of examples
+ *    or check the grading status by using methods like `getScore`,
+ *    `getAllScores`, or `getNextExampleToGrade`.
+ */
+export default class EvaluationFunctionExecutor {
+  // Grading-priority score for each example, keyed by response uid
+  private scores: Map<ResponseUID, number>;
+  // Cache function results for each example
+  private resultsCache: Map<EvalFunction, Map<ResponseUID, EvalFunctionResult>>;
+  private grades: Map<ResponseUID, boolean>; // Grades for all examples
+  // Grades per criteria. setGradeForExample stores them as
+  // perCriteriaGrades[responseUID][criteriaUID].
+  // NOTE(review): grades passed to the constructor are presumably in the same shape; confirm with callers.
+  private perCriteriaGrades: Dict<Dict<boolean | undefined>>; // Grades per criteria
+  private annotations: Dict<string>; // Annotations for each response
+  private lastPickedHighScore: boolean; // To alternate between highest and lowest scores when sampling examples to grade
+  private examples: LLMResponse[]; // The set of examples being evaluated and graded
+  private evalCriteria: EvalCriteria[]; // The criteria used to generate evaluation functions
+  private evalFunctions: EvalFunction[]; // The set of evaluation functions generated for the developer's LLM chain
+  private promptTemplate: string; // The prompt template for the developer's LLM chain
+  private backgroundTaskPromise: Promise<void> | null = null; // To keep track of the background task for generating and executing evaluation functions
+  private criteriaQueue: EvalCriteria[] = []; // Queue for new criteria to be processed
+  private processing = false; // To keep track of whether we are currently processing a criteria
+  // Callback invoked with the number of additional GPT-4 and GPT-3.5 calls just made
+  private updateGPTCalls: (numGPT4Calls: number, numGPT35Calls: number) => void;
+  // Callback receiving human-readable log messages
+  private logFunction: (logMessage: string) => void;
+
+  /**
+   * Builds an executor over a prompt template and a set of examples.
+   *
+   * Every example starts with a grading-priority score of 0. No evaluation
+   * functions exist yet and nothing runs until `start` is called.
+   *
+   * @param promptTemplate The prompt template for the developer's LLM chain; passed to the function generator.
+   * @param examples The variable-prompt-response triples that the developer grades and that evaluation functions are run against.
+   * @param evalCriteria The criteria used to generate evaluation functions. Defaults to an empty list.
+   * @param updateGPTCalls Callback invoked with the number of additional GPT-4 and GPT-3.5 calls whenever calls are made.
+   * @param addLog Callback receiving human-readable log messages.
+   * @param existingGrades Optional. A dict in format {uid: grade}, containing existing grades. Its entries are copied.
+   * @param existingPerCriteriaGrades Optional. Existing per-criteria grades. The object is stored by reference, not copied.
+   * @param annotations Optional. Existing annotations per response. The object is stored by reference, not copied.
+   */
+  constructor(
+    promptTemplate: string,
+    examples: LLMResponse[],
+    evalCriteria: EvalCriteria[] = [],
+    updateGPTCalls: (numGPT4Calls: number, numGPT35Calls: number) => void,
+    addLog: (log: string) => void,
+    existingGrades?: Record<ResponseUID, boolean>,
+    existingPerCriteriaGrades?: Dict<Dict<boolean | undefined>>,
+    annotations?: Dict<string>,
+  ) {
+    console.log(evalCriteria);
+
+    // Inputs supplied by the caller
+    this.promptTemplate = promptTemplate;
+    this.examples = examples;
+    this.evalCriteria = evalCriteria;
+    this.updateGPTCalls = updateGPTCalls;
+    this.logFunction = addLog;
+
+    // Internal bookkeeping starts out empty
+    this.evalFunctions = [];
+    this.resultsCache = new Map<
+      EvalFunction,
+      Map<ResponseUID, EvalFunctionResult>
+    >();
+    this.criteriaQueue = [];
+    this.processing = false;
+    this.lastPickedHighScore = false; // The first priority pick is the highest score
+
+    // One zero score per example uid
+    this.scores = new Map<ResponseUID, number>(
+      examples.map((example): [ResponseUID, number] => [example.uid, 0]),
+    );
+
+    // Seed grades from any existing ones
+    this.grades = new Map<ResponseUID, boolean>(
+      existingGrades ? Object.entries(existingGrades) : [],
+    );
+
+    // Adopt existing per-criteria grades and annotations when given
+    this.perCriteriaGrades = existingPerCriteriaGrades || {};
+    this.annotations = annotations || {};
+  }
+
+  /**
+   * Kicks off generation and execution of evaluation functions in the
+   * background and returns immediately, without awaiting the work.
+   * Call this after constructing the executor.
+   *
+   * @param onProgress Optional callback forwarded to the background task to receive progress updates.
+   * @throws Error if no evaluation criteria are set, or if a background task handle is already held.
+   */
+  public start(onProgress?: (progress: QueryProgress) => void): void {
+    const hasCriteria = this.evalCriteria.length !== 0;
+    if (!hasCriteria) {
+      throw new Error(
+        "No evaluation criteria provided. Please provide at least one evaluation criterion.",
+      );
+    }
+
+    const alreadyStarted = Boolean(this.backgroundTaskPromise);
+    if (alreadyStarted) {
+      throw new Error(
+        "Background task for generating and executing evaluation functions is already running.",
+      );
+    }
+
+    // Deliberately not awaited: the work continues in the background
+    const task = this.generateAndExecuteEvaluationFunctions(onProgress);
+    this.backgroundTaskPromise = task;
+  }
+
+  /**
+   * Allows the client to explicitly wait for the background task to complete if needed.
+   *
+   * Resolves immediately when no background task is held. Otherwise awaits
+   * the task and then releases the handle, so that `start` can be called again.
+   * The handle is released even when the task rejects; the rejection is
+   * still propagated to the caller.
+   */
+  public async waitForCompletion(): Promise<void> {
+    const task = this.backgroundTaskPromise;
+    if (!task) return;
+
+    try {
+      await task;
+    } finally {
+      // Always release the handle; otherwise a failed run would leave the
+      // executor permanently reporting that it is running.
+      this.backgroundTaskPromise = null;
+    }
+  }
+
+  /**
+   * Reports whether a background task handle is currently held, i.e.
+   * `start` has been called and the handle has not been released yet.
+   * The handle is released by `waitForCompletion`.
+   */
+  public isRunning(): boolean {
+    const noTaskHeld = this.backgroundTaskPromise === null;
+    return !noTaskHeld;
+  }
+
+  /**
+   * Generates evaluation functions for a single criterion and executes each
+   * generated function on every example as soon as it is emitted.
+   *
+   * Results are written to the results cache, and every FAIL result updates
+   * the grading-priority score of the affected example. Resolves once
+   * generation has finished and all executions have completed.
+   *
+   * @param criteria The criterion to generate evaluation functions for.
+   * @param onProgress Optional callback receiving progress updates after each execution.
+   */
+  private async generateAndExecuteFunctionsForCriteria(
+    criteria: EvalCriteria,
+    onProgress?: (progress: QueryProgress) => void,
+  ): Promise<void> {
+    const emitter = new EventEmitter();
+    const functionExecutionPromises: Promise<unknown>[] = [];
+    const criteriaId = criteria.uid;
+
+    // Per-criteria grades are stored as perCriteriaGrades[responseUID][criteriaUID]
+    // (that is how setGradeForExample writes them), so index by example first.
+    const findExampleGradedAs = (grade: boolean) =>
+      this.examples.find(
+        (candidate) =>
+          this.perCriteriaGrades[candidate.uid]?.[criteriaId] === grade,
+      );
+
+    emitter.on("functionGenerated", (evalFunction: EvalFunction) => {
+      this.evalFunctions.push(evalFunction);
+
+      // First example graded good / bad on this criterion, if any. These do
+      // not depend on the example being evaluated, so look them up once.
+      const positiveExample = findExampleGradedAs(true);
+      const negativeExample = findExampleGradedAs(false);
+
+      const funcToExecute =
+        evalFunction.evalCriteria.eval_method === "code"
+          ? execPyFunc
+          : executeLLMEval;
+
+      const executionPromises = this.examples.map(async (example) => {
+        const result = await funcToExecute(
+          evalFunction,
+          example,
+          positiveExample,
+          negativeExample,
+        );
+
+        // Update GPT-3.5 call count by 1 if the eval method is expert
+        if (evalFunction.evalCriteria.eval_method === "expert") {
+          this.updateGPTCalls(0, 1);
+        }
+
+        if (onProgress) {
+          // The queue may already be empty (the criterion being processed has
+          // been removed from it), so guard the denominator and cap at 100.
+          const pendingCriteria = Math.max(this.criteriaQueue.length, 1);
+          onProgress({
+            success: Math.min(
+              100,
+              (100 * functionExecutionPromises.length) / pendingCriteria,
+            ),
+            error: 0,
+          });
+        }
+
+        // Put result in cache
+        let cachedResults = this.resultsCache.get(evalFunction);
+        if (!cachedResults) {
+          cachedResults = new Map();
+          this.resultsCache.set(evalFunction, cachedResults);
+        }
+        cachedResults.set(example.uid, result);
+
+        if (result === EvalFunctionResult.FAIL) {
+          this.updateScore(example.uid, evalFunction);
+        }
+      });
+
+      functionExecutionPromises.push(Promise.all(executionPromises));
+    });
+
+    const badExample = findExampleGradedAs(false);
+
+    await generateFunctionsForCriteria(
+      criteria,
+      this.promptTemplate,
+      this.examples[Math.floor(Math.random() * this.examples.length)],
+      emitter,
+      badExample,
+    );
+    // Update GPT-4o call count by 1
+    this.updateGPTCalls(1, 0);
+
+    console.log(`Generated functions for criteria: ${criteria.shortname}`);
+    console.log(
+      `Number of functions generated: ${functionExecutionPromises.length}`,
+    );
+    this.logFunction(
+      `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
+    );
+
+    await Promise.all(functionExecutionPromises);
+  }
+
+  /**
+   * Generates evaluation functions for every current criterion and executes
+   * each generated function on every example as it is emitted.
+   *
+   * Results are written to the results cache, and every FAIL result updates
+   * the grading-priority score of the affected example. The returned promise
+   * resolves once all criteria have been processed and all executions have
+   * completed. It rejects if generation or any execution fails, and it
+   * resolves right away when there are no criteria.
+   *
+   * @param onProgress Optional callback receiving an estimated completion
+   *   percentage after each execution, and 100 once everything has finished.
+   */
+  public async generateAndExecuteEvaluationFunctions(
+    onProgress?: (progress: QueryProgress) => void,
+  ): Promise<void> {
+    const emitter = new EventEmitter();
+
+    // Work on a snapshot: the criteria list can be changed (e.g. by
+    // addCriteria) while generation is still in flight.
+    const criteriaToProcess = [...this.evalCriteria];
+
+    // Since we don't know how many implementations the LLM will suggest,
+    // we must estimate it here so we can use this information to stream
+    // "progress" updates back to the client:
+    let funcsExecuted = 0;
+    const estimatedFuncsToExecute =
+      criteriaToProcess.length +
+      criteriaToProcess.length * 5 * this.examples.length;
+
+    // Track execution promises for function executions
+    const functionExecutionPromises: Promise<unknown>[] = [];
+
+    // Listen for generated functions and execute them as they come in
+    emitter.on("functionGenerated", (evalFunction: EvalFunction) => {
+      this.logFunction(
+        `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
+      );
+
+      // Add the eval function to the list of functions
+      this.evalFunctions.push(evalFunction);
+
+      // Per-criteria grades are stored as perCriteriaGrades[responseUID][criteriaUID]
+      // (that is how setGradeForExample writes them), so index by example first.
+      const criteriaId = evalFunction.evalCriteria.uid;
+      const findExampleGradedAs = (grade: boolean) =>
+        this.examples.find(
+          (candidate) =>
+            this.perCriteriaGrades[candidate.uid]?.[criteriaId] === grade,
+        );
+
+      // First example graded good / bad on this criterion, if any
+      const positiveExample = findExampleGradedAs(true);
+      const negativeExample = findExampleGradedAs(false);
+
+      const funcToExecute =
+        evalFunction.evalCriteria.eval_method === "code"
+          ? execPyFunc
+          : executeLLMEval;
+
+      const executionPromises = this.examples.map(async (example) => {
+        const result = await funcToExecute(
+          evalFunction,
+          example,
+          positiveExample,
+          negativeExample,
+        );
+
+        // Update GPT-3.5 call count by 1 if the eval method is expert
+        if (evalFunction.evalCriteria.eval_method === "expert") {
+          this.updateGPTCalls(0, 1);
+        }
+
+        funcsExecuted++;
+        if (onProgress) {
+          // The total is only an estimate, so cap the reported value at 100
+          onProgress({
+            success: Math.min(
+              100,
+              (100 * funcsExecuted) / estimatedFuncsToExecute,
+            ),
+            error: 0,
+          });
+        }
+
+        // Put result in cache
+        let cachedResults = this.resultsCache.get(evalFunction);
+        if (!cachedResults) {
+          cachedResults = new Map();
+          this.resultsCache.set(evalFunction, cachedResults);
+        }
+        cachedResults.set(example.uid, result);
+
+        // Update the score if the function failed on this example
+        if (result === EvalFunctionResult.FAIL) {
+          this.updateScore(example.uid, evalFunction);
+        }
+      });
+
+      functionExecutionPromises.push(Promise.all(executionPromises));
+    });
+
+    // Generate functions for each criterion, all requests in flight at once.
+    // Awaiting them here means a failed generation rejects this method
+    // instead of leaving the returned promise pending forever.
+    await Promise.all(
+      criteriaToProcess.map(async (criteria) => {
+        console.log(criteria);
+        await generateFunctionsForCriteria(
+          criteria,
+          this.promptTemplate,
+          this.examples[Math.floor(Math.random() * this.examples.length)],
+          emitter, // Pass the EventEmitter instance
+        );
+        // Update GPT-4o call count by 1
+        this.updateGPTCalls(1, 0);
+      }),
+    );
+
+    // Ensure all function executions have completed before reporting completion
+    await Promise.all(functionExecutionPromises);
+
+    console.log("All evaluation functions have been generated and executed.");
+    this.logFunction(
+      "All initially-generated evaluation functions have been generated and executed.",
+    );
+
+    if (onProgress) {
+      onProgress({
+        success: 100,
+        error: 0,
+      });
+    }
+  }
+
+  /**
+   * Queues an already-registered criterion so that fresh evaluation
+   * functions get generated and executed for it. Starts the queue processor
+   * if it is not already running, and returns without waiting for the work.
+   *
+   * @param criteriaID The uid of the criterion to generate new implementations for.
+   * @throws Error if no registered criterion has the given uid.
+   */
+  public generateNewImplementationsForCriteria(
+    criteriaID: EvalCriteriaUID,
+  ): void {
+    const target = this.evalCriteria.find(({ uid }) => uid === criteriaID);
+    if (!target) {
+      throw new Error(`Criteria with ID ${criteriaID} not found.`);
+    }
+
+    this.criteriaQueue.push(target);
+
+    if (this.processing) return;
+    this.processNextCriteria();
+  }
+
+  /**
+   * Synchronizes the executor's criteria with the given list.
+   *
+   * Criteria in the list that the executor does not hold yet (compared by
+   * object identity) are registered and queued, so that evaluation functions
+   * are generated and executed for them; the queue processor is started if
+   * it is idle. Criteria held by the executor but missing from the list are
+   * dropped. The method returns without waiting for any generation to finish.
+   *
+   * @param criteriaList The full list of criteria the executor should hold afterwards.
+   */
+  public addCriteria(criteriaList: EvalCriteria[]): void {
+    // Register and queue anything we have not seen before
+    criteriaList.forEach((candidate) => {
+      const alreadyKnown = this.evalCriteria.includes(candidate);
+      if (!alreadyKnown) {
+        console.log(`Adding new criteria: ${candidate.shortname}`);
+        this.criteriaQueue.push(candidate);
+        this.evalCriteria.push(candidate);
+
+        // Wake the queue processor if it is idle
+        if (!this.processing) {
+          this.processNextCriteria();
+        }
+      }
+    });
+
+    // Drop anything that is no longer part of the requested list
+    const stale = this.evalCriteria.filter(
+      (existing) => !criteriaList.includes(existing),
+    );
+    stale.forEach((removed) => {
+      console.log(`Removing criteria: ${removed.shortname}`);
+    });
+    if (stale.length > 0) {
+      this.evalCriteria = this.evalCriteria.filter(
+        (existing) => !stale.includes(existing),
+      );
+    }
+  }
+
+  /**
+   * Drains the criteria queue, generating and executing evaluation functions
+   * for one criterion at a time. The `processing` flag is set while the
+   * queue is being drained and is always cleared afterwards, even if a
+   * criterion fails, so that later requests can restart the processor.
+   */
+  private async processNextCriteria() {
+    // TODO: use worker pool to parallelize this
+    this.processing = true;
+    try {
+      while (this.criteriaQueue.length > 0) {
+        const criteria = this.criteriaQueue.shift();
+        if (criteria) {
+          await this.generateAndExecuteFunctionsForCriteria(criteria);
+        }
+      }
+    } finally {
+      // Without this, one failure would leave the flag stuck at true and
+      // no queued criterion would ever be processed again.
+      this.processing = false;
+    }
+  }
+
+  /**
+   * Raises the grading-priority score of an example by the current pass rate
+   * of the given evaluation function.
+   *
+   * The pass rate is PASS / (PASS + FAIL) over the results cached so far for
+   * that function. Does nothing when the function has no cached results.
+   *
+   * @param exampleId The unique ID of the example being scored.
+   * @param evalFunction The eval function whose cached results determine the increment.
+   */
+  private updateScore(
+    exampleId: ResponseUID,
+    evalFunction: EvalFunction,
+  ): void {
+    const cached = this.resultsCache.get(evalFunction);
+    if (cached === undefined) {
+      return;
+    }
+
+    // Tally passes and failures; any other result is ignored
+    let passed = 0;
+    let failed = 0;
+    cached.forEach((outcome) => {
+      if (outcome === EvalFunctionResult.PASS) {
+        passed += 1;
+      } else if (outcome === EvalFunctionResult.FAIL) {
+        failed += 1;
+      }
+    });
+
+    const passRate = passed / (passed + failed);
+    const previousScore = this.scores.get(exampleId) || 0;
+    this.scores.set(exampleId, previousScore + passRate);
+  }
+
+  /**
+   * Looks up the current grading-priority score of a single example.
+   * Can be queried at any point during evaluation.
+   *
+   * @param exampleId The unique ID of the example whose score is requested.
+   * @returns The example's current score, or undefined if no score is stored for that ID.
+   */
+  public getScore(exampleId: ResponseUID): number | undefined {
+    const score = this.scores.get(exampleId);
+    return score;
+  }
+
+  /**
+   * Returns a snapshot of the grading-priority scores of all examples.
+   * The result is a copy; changing it does not affect the executor.
+   *
+   * @returns A new map of example IDs to their current scores.
+   */
+  public getAllScores(): Map<ResponseUID, number> {
+    const snapshot = new Map(this.scores);
+    return snapshot;
+  }
+
+  /**
+   * Returns a snapshot of the grades recorded so far.
+   * The result is a copy; changing it does not affect the executor.
+   *
+   * @returns A new map of example IDs to their grades.
+   */
+  public getGrades(): Map<ResponseUID, boolean> {
+    const snapshot = new Map(this.grades);
+    return snapshot;
+  }
+
+  /**
+   * Produces a randomized estimate of the GPT calls that grading with the
+   * given per-criteria grades would trigger.
+   *
+   * For each criterion, a random draw is compared against
+   * 1 / (n + 1), where n is the number of examples already stored with the
+   * same grade for that criterion. Each successful draw counts as one GPT-4
+   * call; if the criterion is registered with eval method "expert", it also
+   * contributes 3 GPT-3.5 calls per example. Because of the random draws,
+   * repeated calls can return different numbers.
+   *
+   * @param perCriteriaGrades Grades keyed by criteria uid.
+   * @returns The estimated number of GPT-4 and GPT-3.5 calls.
+   */
+  public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): {
+    numGPT4Calls: number;
+    numGPT35Calls: number;
+  } {
+    let gpt4Calls = 0;
+    let expertCriteriaCount = 0;
+
+    for (const criteriaId in perCriteriaGrades) {
+      const currGrade = perCriteriaGrades[criteriaId];
+
+      // Count the examples already stored with this grade for the criterion
+      let sameGradeCount = 0;
+      this.examples.forEach((example) => {
+        const storedGrades = this.perCriteriaGrades[example.uid];
+        if (storedGrades && storedGrades[criteriaId] === currGrade) {
+          sameGradeCount += 1;
+        }
+      });
+
+      const threshold = 1 / (sameGradeCount + 1);
+      if (Math.random() > threshold) continue;
+
+      gpt4Calls += 1;
+      const matching = this.evalCriteria.find(
+        (candidate) => candidate.uid === criteriaId,
+      );
+      if (matching && matching.eval_method === "expert") {
+        expertCriteriaCount += 1;
+      }
+    }
+
+    return {
+      numGPT4Calls: gpt4Calls,
+      numGPT35Calls: expertCriteriaCount * 3 * this.examples.length,
+    };
+  }
+
+  /**
+   * Sets a grade for an example based on external input from the developer.
+   * This will be used for filtering out incorrect evaluation functions.
+   *
+   * If a holistic grade is given, the example is recorded as good only when
+   * that grade is "good". If no holistic grade is given (undefined or null)
+   * but per-criteria grades are, the holistic grade is inferred: good only
+   * if every per-criteria grade is true. Arguments that are omitted leave
+   * the corresponding stored data untouched.
+   *
+   * For each criterion in perCriteriaGrades, new implementations are queued
+   * with probability 1 / (n + 1), where n is the number of examples stored
+   * with the same grade for that criterion.
+   *
+   * @param exampleId The unique ID of the example being graded.
+   * @param perCriteriaGrades Optional. Grades for the example, keyed by criteria uid.
+   * @param holisticGrade Optional. The developer-provided grade for the example; "good" counts as good, any other string as bad.
+   * @param annotation Optional. A free-text annotation to store for the example.
+   */
+  public setGradeForExample(
+    exampleId: ResponseUID,
+    perCriteriaGrades?: Dict<boolean | undefined>,
+    holisticGrade?: string,
+    annotation?: string,
+  ): void {
+    // Optional arguments arrive as undefined when omitted, so test with
+    // `!= null`, which covers both undefined and null.
+    const hasHolisticGrade = holisticGrade != null;
+
+    if (hasHolisticGrade) {
+      this.grades.set(exampleId, holisticGrade === "good");
+    }
+
+    if (perCriteriaGrades != null) {
+      this.perCriteriaGrades[exampleId] = perCriteriaGrades;
+
+      // No holistic grade given: infer it from the per-criteria grades.
+      // The example is good only if every criterion is graded true.
+      if (!hasHolisticGrade) {
+        const allTrue = Object.values(perCriteriaGrades).every(
+          (value) => value === true,
+        );
+        this.grades.set(exampleId, allTrue);
+      }
+    }
+
+    if (annotation != null) {
+      this.annotations[exampleId] = annotation;
+    }
+
+    let numCriteriaWithNewImplementations = 0;
+
+    // Possibly queue new implementations for each criteria in perCriteriaGrades
+    for (const criteriaId in perCriteriaGrades) {
+      const currGrade = perCriteriaGrades[criteriaId];
+      // Number of examples stored with currGrade for this criteria
+      // (this includes the example that was just graded)
+      const numGradedAsCurrGrade = this.examples.filter(
+        (example) =>
+          this.perCriteriaGrades[example.uid] &&
+          this.perCriteriaGrades[example.uid][criteriaId] === currGrade,
+      ).length;
+
+      if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) {
+        console.log(
+          `Generating new implementations for criteria: ${criteriaId}`,
+        );
+        const evalCriteria = this.evalCriteria.find(
+          (criteria) => criteria.uid === criteriaId,
+        );
+        if (evalCriteria) {
+          this.criteriaQueue.push(evalCriteria);
+          if (!this.processing) {
+            this.processNextCriteria();
+          }
+          numCriteriaWithNewImplementations++;
+        } else {
+          console.error(`Evaluation criteria with ID ${criteriaId} not found.`);
+        }
+      }
+    }
+
+    console.log(
+      `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`,
+    );
+  }
+
+  /**
+   * Replaces the executor's evaluation criteria with the given list.
+   * Can be called after construction. The array is stored as-is (not
+   * copied) and nothing is queued for generation.
+   *
+   * @param evalCriteria The criteria the executor should use from now on.
+   */
+  public setEvalCriteria(evalCriteria: EvalCriteria[]): void {
+    const replacement = evalCriteria;
+    this.evalCriteria = replacement;
+  }
+
+  /**
+   * Replaces the set of examples held by the executor.
+   *
+   * The score of every given example is (re)set to 0, and a grade is
+   * recorded for each example whose metavars carry a `grade` value.
+   * Scores and grades stored for other IDs are left untouched.
+   *
+   * @param examples The examples the executor should use from now on.
+   */
+  public setExamples(examples: LLMResponse[]): void {
+    this.examples = examples;
+
+    // First pass: every example starts over at score 0
+    examples.forEach((example) => {
+      this.scores.set(example.uid, 0);
+    });
+
+    // Second pass: pick up grades that travel with the examples
+    examples.forEach((example) => {
+      const carriedGrade = example.metavars.grade;
+      if (carriedGrade !== undefined) {
+        this.grades.set(example.uid, carriedGrade);
+      }
+    });
+  }
+
+  /**
+   * Collects the scores of all examples that have no grade yet, ordered from
+   * highest to lowest score. Examples with equal scores are ordered randomly.
+   *
+   * @return A new map of ungraded example ids to their scores, in sorted order.
+   */
+  public getUngradedScores(): Map<ResponseUID, number> {
+    // Gather ungraded entries, attaching a random key for tie-breaking
+    const pending: { id: ResponseUID; score: number; rand: number }[] = [];
+    this.scores.forEach((score, id) => {
+      if (!this.grades.has(id)) {
+        pending.push({ id, score, rand: Math.random() });
+      }
+    });
+
+    // Highest score first; equal scores fall back to the random key
+    pending.sort((a, b) =>
+      a.score === b.score ? a.rand - b.rand : b.score - a.score,
+    );
+
+    // A Map preserves insertion order, so the sorted order is kept
+    const ordered = new Map<ResponseUID, number>();
+    pending.forEach(({ id, score }) => {
+      ordered.set(id, score);
+    });
+    return ordered;
+  }
+
+  /**
+   * Finds the example with the given uid.
+   * If several examples share the uid, an error is logged and the first
+   * match is returned.
+   *
+   * @param id The uid of the example to look up.
+   * @returns The matching example, or null if there is none.
+   */
+  private getExampleForId(id: string) {
+    const matches = this.examples.filter((candidate) => candidate.uid === id);
+
+    if (matches.length === 0) return null;
+
+    if (matches.length > 1) {
+      console.error(
+        "More than one example found with the same id. Ids must be unique. Returning the first, to not halt...",
+      );
+    }
+    return matches[0];
+  }
+
+  /**
+   * Picks the next ungraded example for the developer to grade.
+   *
+   * With the "priority" policy, successive calls alternate between the
+   * ungraded example with the highest score and the one with the lowest
+   * score, starting with the highest, so that grading samples both ends.
+   * With the "random" policy, an ungraded example is chosen at random.
+   *
+   * @param policy The selection policy, "random" or "priority" (default).
+   *
+   * @returns The next example to grade, or null if no ungraded example is left or the chosen id matches no example.
+   */
+  public getNextExampleToGrade(
+    policy: "random" | "priority" = "priority",
+  ): LLMResponse | null {
+    // Ids of ungraded examples, highest score first
+    const ungradedIds = Array.from(this.getUngradedScores().keys());
+    const remaining = ungradedIds.length;
+
+    if (remaining === 0) {
+      return null;
+    }
+
+    if (policy === "random") {
+      const randomIndex = Math.floor(Math.random() * remaining);
+      return this.getExampleForId(ungradedIds[randomIndex]);
+    }
+
+    // Priority policy: take the lowest score if the previous pick was the
+    // highest, and vice versa; then flip the flag for the next call.
+    const pickLowest = this.lastPickedHighScore;
+    this.lastPickedHighScore = !pickLowest;
+
+    const chosenId = pickLowest ? ungradedIds[remaining - 1] : ungradedIds[0];
+    return this.getExampleForId(chosenId);
+  }
+
+  /**
+   * Filters out evaluation functions that are incorrect based on the grades provided by the developer.
+   * Per-function rates with an empty denominator are reported as 0 instead of NaN (coverage stays 1.0 when nothing was graded as failing), so the threshold filter and the ranking stay well-defined.
+   *
+   * @param falseFailureRateThreshold The threshold for the false failure rate of each selected evaluation function. Functions above it are dropped, unless no function for a criteria meets it, in which case all of that criteria's functions stay eligible.
+   *
+   * @returns A report with the best function per criteria (highest failure coverage, ties broken by lowest false failure rate), the set's overall failure coverage and false failure rate, and the per-function reports.
+   */
+  public async filterEvaluationFunctions(
+    falseFailureRateThreshold: number,
+  ): Promise<EvalFunctionSetReport> {
+    const gradedExamples = this.examples.filter((example) =>
+      this.grades.has(example.uid),
+    );
+    const gradedResultMap: Map<
+      ResponseUID,
+      Map<EvalFunction, EvalFunctionResult>
+    > = new Map();
+
+    // Iterate over graded examples and evaluation functions to fill the matrix
+    for (const example of gradedExamples) {
+      const row = new Map<EvalFunction, EvalFunctionResult>();
+      for (const evalFunction of this.evalFunctions) {
+        // Check if the result is in the cache
+        if (this.resultsCache.has(evalFunction)) {
+          const result = this.resultsCache.get(evalFunction)?.get(example.uid);
+          if (result !== undefined) {
+            row.set(evalFunction, result);
+            continue;
+          }
+        }
+
+        // If not, execute the function and store the result in the cache
+        const funcToExecute =
+          evalFunction.evalCriteria.eval_method === "code"
+            ? execPyFunc
+            : executeLLMEval;
+        const result = await funcToExecute(evalFunction, example);
+
+        // Put result in cache
+        if (!this.resultsCache.has(evalFunction)) {
+          this.resultsCache.set(evalFunction, new Map());
+        }
+        this.resultsCache.get(evalFunction)?.set(example.uid, result);
+
+        row.set(evalFunction, result);
+      }
+      gradedResultMap.set(example.uid, row);
+    }
+
+    const numFailGrades = gradedExamples.filter(
+      (example) => !this.grades.get(example.uid),
+    ).length;
+    const numPassGrades = gradedExamples.filter((example) =>
+      this.grades.get(example.uid),
+    ).length;
+    const bestEvalFunctions: EvalFunction[] = [];
+    const evalFunctionReport: Map<EvalCriteria, EvalFunctionReport[]> =
+      new Map();
+
+    // Iterate through each criteria
+    // For each criteria, select the function with the highest alignment rate
+    for (const criteria of this.evalCriteria) {
+      let scoredFunctions = [];
+
+      for (const evalFunction of this.evalFunctions) {
+        // Skip functions that don't match the criteria
+        if (evalFunction.evalCriteria !== criteria) {
+          continue;
+        }
+
+        // Create a report for this function
+        const report: EvalFunctionReport = {
+          evalFunction,
+          true_pass: 0,
+          true_fail: 0,
+          false_pass: 0,
+          false_fail: 0,
+          alignment: 0,
+          skipped: 0,
+        };
+
+        // Calculate alignment for this function based on the graded examples
+        for (const example of gradedExamples) {
+          const result = gradedResultMap.get(example.uid)?.get(evalFunction);
+          const grade = this.grades.get(example.uid)
+            ? EvalFunctionResult.PASS
+            : EvalFunctionResult.FAIL;
+
+          if (result !== undefined) {
+            // Handle true positives and true negatives
+            if (result === grade) {
+              if (result === EvalFunctionResult.PASS) {
+                report.true_pass++;
+              } else if (result === EvalFunctionResult.FAIL) {
+                report.true_fail++;
+              }
+            } else {
+              if (result === EvalFunctionResult.PASS) {
+                report.false_pass++;
+              } else if (result === EvalFunctionResult.FAIL) {
+                report.false_fail++;
+              } else {
+                report.skipped++;
+              }
+            }
+          }
+        }
+
+        // Calculate coverage: the share of judged failing examples that this
+        // function also fails. 1.0 if nothing was graded as failing.
+        const failDenom = report.true_fail + report.false_pass;
+        let failureCoverage = 1.0;
+        if (numFailGrades > 0) {
+          failureCoverage = failDenom > 0 ? report.true_fail / failDenom : 0;
+        }
+
+        // Calculate false failure rate (0 when it judged no passing example)
+        const passDenom = report.true_pass + report.false_fail;
+        const falseFailureRate =
+          passDenom > 0 ? report.false_fail / passDenom : 0;
+
+        // The alignment is the F1 score of failure coverage and 1 - false failure rate
+        const f1Num = 2 * failureCoverage * (1 - falseFailureRate);
+        const f1Denom = failureCoverage + (1 - falseFailureRate);
+        const f1 = f1Denom > 0 ? f1Num / f1Denom : 0;
+        const hasGrades = numFailGrades > 0 || numPassGrades > 0;
+        report.alignment = hasGrades ? f1 : undefined;
+
+        // Save the report for this function
+        if (!evalFunctionReport.has(criteria)) {
+          evalFunctionReport.set(criteria, []);
+        }
+        evalFunctionReport.get(criteria)?.push(report);
+        console.log(report);
+
+        scoredFunctions.push({
+          evalFunction,
+          failureCoverage,
+          falseFailureRate,
+        });
+      }
+
+      // Drop functions with ffr > threshold, unless that would drop them all
+      const belowThreshold = scoredFunctions.filter(
+        (func) => func.falseFailureRate <= falseFailureRateThreshold,
+      );
+      if (belowThreshold.length > 0) {
+        scoredFunctions = belowThreshold;
+      }
+
+      // Save the best function for this criteria
+      // Maximize failure coverage and minimize false failure rate
+      scoredFunctions.sort((a, b) => {
+        if (a.failureCoverage === b.failureCoverage) {
+          return a.falseFailureRate - b.falseFailureRate;
+        }
+        return b.failureCoverage - a.failureCoverage;
+      });
+
+      if (scoredFunctions.length > 0) {
+        bestEvalFunctions.push(scoredFunctions[0].evalFunction);
+      }
+    }
+
+    const [coverage, falseFailureRate] = this.getSelectedFunctionAlignment(
+      bestEvalFunctions,
+      gradedResultMap,
+      gradedExamples,
+    );
+
+    // Create report of coverage, missed failures, selected functions, and all eval function reports
+    const report = {
+      failureCoverage: coverage,
+      falseFailureRate,
+      selectedEvalFunctions: bestEvalFunctions,
+      allEvalFunctionReports: evalFunctionReport,
+    };
+
+    return report;
+  }
+
+  /**
+   * Measures how well a set of evaluation functions agrees with the grades.
+   * The set fails an example as soon as any selected function fails it.
+   *
+   * @param selectedEvalFunctions The functions whose combined verdict is scored.
+   * @param gradedResultMap For each example, the result of each evaluation function.
+   * @param gradedExamples The examples that have a grade.
+   * @returns `[failureCoverage, falseFailureRate]`, both as percentages. Coverage
+   * is 100 when no example was graded as failing, and the false failure rate is
+   * 0 when none was graded as passing (a plain division would give NaN there).
+   */
+  private getSelectedFunctionAlignment(
+    selectedEvalFunctions: EvalFunction[],
+    gradedResultMap: Map<ResponseUID, Map<EvalFunction, EvalFunctionResult>>,
+    gradedExamples: LLMResponse[],
+  ) {
+    // Of the selected functions, calculate the coverage of failures and false failure rate
+    let truePass = 0;
+    const coveredFailures = new Set<ResponseUID>();
+    const falseFailures = new Set<ResponseUID>();
+
+    for (const example of gradedExamples) {
+      const gradedPass = this.grades.get(example.uid);
+      let systemPass = true;
+
+      for (const evalFunction of selectedEvalFunctions) {
+        const result = gradedResultMap.get(example.uid)?.get(evalFunction);
+        if (result !== EvalFunctionResult.FAIL) continue;
+
+        // One failing function is enough for the set to fail the example
+        systemPass = false;
+        if (gradedPass) {
+          falseFailures.add(example.uid);
+        } else {
+          coveredFailures.add(example.uid);
+        }
+      }
+
+      if (systemPass && gradedPass) {
+        truePass++;
+      }
+    }
+
+    // Guard both denominators: dividing 0 by 0 would report NaN
+    const numFailures = gradedExamples.filter(
+      (example) => !this.grades.get(example.uid),
+    ).length;
+    // truePass + falseFailures covers every example graded as passing
+    const numGradedPass = truePass + falseFailures.size;
+    const coverage =
+      numFailures > 0 ? (coveredFailures.size / numFailures) * 100 : 100;
+    const falseFailureRate =
+      numGradedPass > 0 ? (falseFailures.size / numGradedPass) * 100 : 0;
+
+    // Print out failure coverage
+    console.log(`Failure coverage: ${coverage}`);
+    console.log(`False failure rate: ${falseFailureRate}`);
+
+    return [coverage, falseFailureRate];
+  }
+
+  /**
+   * Re-scores an existing report against a narrower choice of criteria.
+   * The result of every evaluation function on every graded example is read
+   * from the cache, or computed and cached when it is missing.
+   *
+   * @param selectedEvalCriteria Criteria whose functions should be scored.
+   * @param oldReport The report whose selected functions are filtered.
+   * @returns A report with recomputed coverage and false failure rate; the
+   * function list and per-function reports are carried over from `oldReport`.
+   */
+  public async recomputeAlignment(
+    selectedEvalCriteria: EvalCriteria[],
+    oldReport: EvalFunctionSetReport,
+  ): Promise<EvalFunctionSetReport> {
+    // Only examples that already have a grade take part
+    const graded = this.examples.filter((ex) => this.grades.has(ex.uid));
+    const resultsByExample = new Map<
+      ResponseUID,
+      Map<EvalFunction, EvalFunctionResult>
+    >();
+
+    // Build one row of function results per graded example
+    for (const ex of graded) {
+      const outcomes = new Map<EvalFunction, EvalFunctionResult>();
+      for (const fn of this.evalFunctions) {
+        // Reuse a cached result when there is one
+        const cached = this.resultsCache.get(fn)?.get(ex.uid);
+        if (cached !== undefined) {
+          outcomes.set(fn, cached);
+          continue;
+        }
+
+        // Cache miss: run execPyFunc for "code" criteria, executeLLMEval otherwise
+        const isCode = fn.evalCriteria.eval_method === "code";
+        const runner = isCode ? execPyFunc : executeLLMEval;
+        const fresh = await runner(fn, ex);
+
+        // Remember it, creating the per-function cache on first use
+        if (!this.resultsCache.has(fn)) {
+          this.resultsCache.set(fn, new Map());
+        }
+        this.resultsCache.get(fn)?.set(ex.uid, fresh);
+
+        outcomes.set(fn, fresh);
+      }
+      resultsByExample.set(ex.uid, outcomes);
+    }
+
+    // Keep the previously selected functions whose criteria are still chosen
+    const stillSelected = oldReport.selectedEvalFunctions.filter((fn) =>
+      selectedEvalCriteria.includes(fn.evalCriteria),
+    );
+
+    const [failureCoverage, falseFailureRate] =
+      this.getSelectedFunctionAlignment(stillSelected, resultsByExample, graded);
+
+    // Same shape as the old report, with only the two rates refreshed
+    const report = {
+      failureCoverage,
+      falseFailureRate,
+      selectedEvalFunctions: oldReport.selectedEvalFunctions,
+      allEvalFunctionReports: oldReport.allEvalFunctionReports,
+    };
+
+    return report;
+  }
+
+  /**
+   * Summarizes the cached results of each evaluation function.
+   * Only results already present in the results cache are counted.
+   *
+   * @returns For each cached evaluation function, how many of its results were a pass, a fail, or anything else (counted as skipped).
+   */
+  public getOutcomes(): Map<
+    EvalFunction,
+    { passed: number; failed: number; skipped: number }
+  > {
+    const tallies = new Map<
+      EvalFunction,
+      { passed: number; failed: number; skipped: number }
+    >();
+
+    for (const [fn, perExample] of this.resultsCache.entries()) {
+      const tally = { passed: 0, failed: 0, skipped: 0 };
+      for (const verdict of perExample.values()) {
+        switch (verdict) {
+          case EvalFunctionResult.PASS:
+            tally.passed += 1;
+            break;
+          case EvalFunctionResult.FAIL:
+            tally.failed += 1;
+            break;
+          default:
+            // Any other result (e.g. SKIP) counts as skipped
+            tally.skipped += 1;
+        }
+      }
+
+      tallies.set(fn, tally);
+    }
+
+    return tallies;
+  }
+}
diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
new file mode 100644
index 000000000..b9f23ce18
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
@@ -0,0 +1,339 @@
+// import { env as process_env } from "process";
+import { EventEmitter } from "events";
+// import { AzureKeyCredential, OpenAIClient } from "@azure/openai";
+import { get_openai_api_key } from "../utils";
+/** Selects how a streamed answer is parsed: JSON criteria objects, Python functions, or JSON strings. */
+type ContentType = "criteria" | "python_fn" | "llm_eval";
+
+/**
+ * Streams a chat completion from the OpenAI REST API and emits the parsed
+ * pieces of the answer as events while the text is still arriving:
+ * - "evalCriteria": one parsed JSON object (content type "criteria")
+ * - "function": one parsed JSON string ("llm_eval") or the source of one Python function ("python_fn")
+ * - "end": the stream is over
+ */
+export class OpenAIStreamer extends EventEmitter {
+  private buffer = "";
+  private isJsonContentStarted = false;
+  private isPythonContentStarted = false;
+  private pythonBlockBuffer = "";
+  private openai_api_key;
+
+  constructor() {
+    super();
+    this.openai_api_key = get_openai_api_key();
+  }
+
+  /** Builds the chat messages: the fixed system message, then the user's prompt. */
+  private buildMessages(prompt: string): { role: string; content: string }[] {
+    return [
+      {
+        content:
+          "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.",
+        role: "system",
+      },
+      { role: "user", content: prompt },
+    ];
+  }
+
+  /** Clears all parser state so that a new generation starts from scratch. */
+  private resetBuffer(): void {
+    this.buffer = "";
+    this.isJsonContentStarted = false;
+    this.isPythonContentStarted = false;
+    this.pythonBlockBuffer = "";
+  }
+
+  /**
+   * Requests a streamed completion and feeds each content delta to the parser chosen by `type`.
+   * Emits "end" once the stream is over; returns without it if the response has no readable body.
+   * @param prompt The user message to send.
+   * @param model The OpenAI model name.
+   * @param type Selects the parser, and thereby the events that are emitted.
+   * @throws Error if the API responds with a non-OK HTTP status.
+   */
+  async generate(
+    prompt: string,
+    model: string,
+    type: ContentType,
+  ): Promise<void> {
+    this.resetBuffer();
+    const messages = this.buildMessages(prompt);
+
+    // Used restapi as here: https://stackoverflow.com/questions/76137987/openai-completion-stream-with-node-js-and-express-js
+    const streamRes = await fetch(
+      "https://api.openai.com/v1/chat/completions",
+      {
+        method: "POST",
+        headers: {
+          Authorization: `Bearer ${this.openai_api_key}`,
+          "Content-Type": "application/json",
+        },
+        body: JSON.stringify({
+          model,
+          messages,
+          stream: true,
+        }),
+      },
+    );
+    if (!streamRes.ok) {
+      // An error response is not an event stream; fail with a clear message
+      throw new Error(`OpenAI request failed with status ${streamRes.status}.`);
+    }
+
+    const reader = streamRes.body?.getReader();
+    if (!reader) {
+      console.error("Error initializing reader for OpenAI requests.");
+      return;
+    }
+
+    // Decode incrementally, so a multi-byte character split across two chunks survives
+    const decoder = new TextDecoder();
+    let pending = ""; // Received text that is not a complete event yet
+    let done = false;
+
+    while (!done) {
+      const { value, done: readerDone } = await reader.read();
+      done = readerDone;
+      // The last read carries no value, so there is nothing to decode then
+      if (value) pending += decoder.decode(value, { stream: true });
+
+      // Events end with a blank line. Unless the stream is over, the last
+      // piece is an unfinished event that has to wait for the next chunk.
+      const events = pending.split("\n\n");
+      pending = done ? "" : (events.pop() ?? "");
+
+      for (const event of events) {
+        const line = event.trim();
+        if (!line.startsWith("data:")) continue;
+        const payload = line.slice("data:".length).trim();
+        if (payload === "[DONE]") {
+          done = true; // End-of-stream marker: stop reading, but still emit "end"
+          break;
+        }
+
+        try {
+          const data = JSON.parse(payload);
+          const delta: unknown = data.choices?.[0]?.delta?.content;
+          if (typeof delta === "string") {
+            if (type === "criteria") {
+              this.processCriteriaDelta(delta);
+            } else if (type === "llm_eval") {
+              this.processStringDelta(delta);
+            } else if (type === "python_fn") {
+              this.processFunctionDelta(delta);
+            } else {
+              throw new Error("Invalid type");
+            }
+          }
+        } catch (error) {
+          console.log(`Error with JSON.parse and ${payload}.\n${error}`);
+        }
+      }
+    }
+
+    this.emit("end"); // Signal that streaming is complete
+  }
+
+  /** Buffers a "criteria" delta and, once inside the ```json block, emits every completed object. */
+  private processCriteriaDelta(delta: string): void {
+    this.buffer += delta;
+    if (!this.isJsonContentStarted) {
+      const startIndex = this.buffer.indexOf("```json\n");
+      if (startIndex !== -1) {
+        this.isJsonContentStarted = true;
+        this.buffer = this.buffer.substring(startIndex + 8); // Skip the '```json\n' part
+      }
+      // Trim the buffer to avoid whitespace at beginning and end
+      this.buffer = this.buffer.trim();
+    }
+
+    if (this.isJsonContentStarted) {
+      this.tryEmitEvalCriteria();
+    }
+  }
+
+  /** Emits an "evalCriteria" event for each complete `{...}` object in the buffer and keeps the unfinished rest. */
+  private tryEmitEvalCriteria(): void {
+    let braceCount = 0;
+    let lastIndex = 0; // Track start of the next JSON object
+
+    // Detect and handle the start of an array
+    if (this.buffer.trim().startsWith("[")) {
+      this.buffer = this.buffer.trim().substring(1); // Remove the leading '['
+    }
+
+    // Remove leading commas if they exist right before a JSON object
+    this.buffer = this.buffer.replace(/^\s*,\s*/, "");
+
+    for (let i = 0; i < this.buffer.length; i++) {
+      const char = this.buffer[i];
+      if (char === "{") {
+        braceCount++;
+      } else if (char === "}") {
+        braceCount--;
+      }
+
+      // When a complete JSON object is detected
+      if (braceCount === 0 && char === "}") {
+        const jsonStr = this.buffer.substring(lastIndex, i + 1).trim();
+        lastIndex = i + 1; // Update for potential next object
+
+        // Remove any leading comma for the next object
+        if (this.buffer[lastIndex] === ",") {
+          lastIndex++; // Skip the comma for the next object
+        }
+
+        try {
+          const jsonObj = JSON.parse(jsonStr);
+          this.emit("evalCriteria", jsonObj);
+        } catch (error) {
+          console.error("Error parsing JSON:", error);
+        }
+      }
+    }
+
+    // Keep any incomplete JSON for the next delta
+    this.buffer = this.buffer.substring(lastIndex).trim();
+  }
+
+  /** Buffers an "llm_eval" delta and, once inside the ```json block, emits every completed string. */
+  private processStringDelta(delta: string): void {
+    this.buffer += delta;
+    if (!this.isJsonContentStarted) {
+      const startIndex = this.buffer.indexOf("```json\n");
+      if (startIndex !== -1) {
+        this.isJsonContentStarted = true;
+        this.buffer = this.buffer.substring(startIndex + 8); // Skip the '```json\n' part
+      }
+    }
+
+    if (this.isJsonContentStarted) {
+      this.tryEmitStrings();
+    }
+  }
+
+  /** Emits a "function" event for each complete double-quoted JSON string in the buffer and keeps the unfinished rest. */
+  private tryEmitStrings(): void {
+    let quoteCount = 0;
+    let lastIndex = 0; // Track the start of the next string
+
+    // Detect and handle the start of an array
+    if (this.buffer.startsWith("[")) {
+      this.buffer = this.buffer.substring(1); // Remove the leading '['
+    }
+
+    // Remove leading commas and whitespace that might be right before a JSON string
+    this.buffer = this.buffer.replace(/^\s*,\s*/, "");
+
+    for (let i = 0; i < this.buffer.length; i++) {
+      const char = this.buffer[i];
+
+      // Toggle quote count on encountering quotes, ignoring escaped quotes
+      if (char === '"' && (i === 0 || this.buffer[i - 1] !== "\\")) {
+        quoteCount++;
+      }
+
+      // When a complete string is detected (every second quote)
+      if (quoteCount === 2) {
+        const jsonString = this.buffer.substring(lastIndex, i + 1); // Include the closing quote
+        lastIndex = i + 1; // Update for the potential next string
+
+        // Remove any leading comma for the next string
+        if (this.buffer[lastIndex] === ",") {
+          lastIndex++; // Skip the comma for the next string
+        }
+
+        quoteCount = 0; // Reset for the next string
+
+        // Extract the string value from JSON
+        try {
+          const strValue = JSON.parse(jsonString);
+          this.emit("function", strValue);
+        } catch (error) {
+          console.error("Error parsing JSON string:", error);
+        }
+      }
+    }
+
+    // Keep any incomplete JSON string for the next delta
+    this.buffer = this.buffer.substring(lastIndex).trim();
+  }
+
+  /** Buffers a "python_fn" delta; once a fenced code block is complete, emits the functions it contains. */
+  private processFunctionDelta(delta: string): void {
+    this.buffer += delta;
+    if (!this.isPythonContentStarted) {
+      let startIndex = this.buffer.indexOf("```python");
+      if (startIndex === -1) startIndex = this.buffer.indexOf("```");
+      if (startIndex === -1) return;
+      this.isPythonContentStarted = true;
+      this.buffer = this.buffer.substring(startIndex);
+    }
+
+    // The buffer now starts with the opening fence. Look for the closing one
+    // right away, since both fences can arrive within the same delta.
+    const endIndex = this.buffer.indexOf("```", 3);
+    if (endIndex === -1) return;
+
+    // Extract the Python code between the fences, ignoring what follows them
+    const pythonCode = this.buffer
+      .substring(0, endIndex)
+      .replace("```python", "")
+      .replaceAll("```", "")
+      .trim();
+    this.pythonBlockBuffer += pythonCode;
+    this.buffer = this.buffer.substring(endIndex + 3);
+    this.isPythonContentStarted = false;
+    // Now process the Python code block for functions
+    this.tryEmitFunctionCriteria();
+  }
+
+  /** Splits the collected Python block into functions (a `def` line plus the blank or more deeply indented lines after it) and emits each as a "function" event. */
+  private tryEmitFunctionCriteria(): void {
+    // Split the buffer into lines
+    const lines = this.pythonBlockBuffer.split("\n");
+    let collecting = false;
+    let functionBody: string[] = [];
+    let baseIndentation = 0;
+
+    // Emits the collected function without the blank lines that trail it
+    const emitCollected = (): void => {
+      this.emit("function", functionBody.join("\n").trimEnd());
+      functionBody = []; // Reset for the next function
+      collecting = false;
+    };
+
+    for (const line of lines) {
+      const isDefinition = line.trim().startsWith("def ");
+      if (collecting) {
+        // Blank lines never end a function: Python allows them inside a body
+        const isBlank = line.trim() === "";
+        const currentIndentation = line.search(/\S|$/); // Find first non-space character or end of line
+        if (isBlank || currentIndentation > baseIndentation) {
+          // Continue collecting the function body
+          functionBody.push(line);
+          continue;
+        }
+        // The line returns to the base indentation level or lower, indicating the end of the function
+        emitCollected();
+      }
+
+      // Start collecting if the line is a function definition
+      if (isDefinition) {
+        collecting = true;
+        functionBody = [line];
+        // Determine the base indentation level
+        baseIndentation = line.indexOf("def");
+      }
+    }
+
+    // Emit a function that runs up to the end of the buffer
+    if (collecting && functionBody.length > 0) {
+      emitCollected();
+    }
+
+    // Clear the buffer after processing
+    this.pythonBlockBuffer = "";
+  }
+}
diff --git a/chainforge/react-server/src/backend/evalgen/test.ts b/chainforge/react-server/src/backend/evalgen/test.ts
new file mode 100644
index 000000000..7766479cc
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/test.ts
@@ -0,0 +1,142 @@
+// import fs from "fs";
+// import csvParser from "csv-parser";
+// import readline from "readline";
+
+// import { Example, EvalCriteria, generateLLMEvaluationCriteria } from "./utils";
+// import EvaluationFunctionExecutor from "./executor";
+
+// const readCSV = async (filePath: string): Promise<Example[]> => {
+//   const examples: Example[] = [];
+//   let counter = 0; // Counter to generate unique IDs
+
+//   return new Promise((resolve, reject) => {
+//     fs.createReadStream(filePath)
+//       .pipe(csvParser(["prompt", "example", "response", "model"]))
+//       .on("data", (data) => {
+//         try {
+//           examples.push({
+//             id: `example_${++counter}`, // Generating a unique ID
+//             variables: data.example,
+//             prompt: data.prompt,
+//             response: data.response,
+//           });
+//         } catch (error) {
+//           // console.error("Error parsing variables from CSV:", error);
+//           // Don't throw here, just skip the example
+//         }
+//       })
+//       .on("end", () => resolve(examples))
+//       .on("error", reject);
+//   });
+// };
+
+// const rl = readline.createInterface({
+//   input: process.stdin,
+//   output: process.stdout,
+// });
+
+// const askQuestion = (query: string): Promise<string> =>
+//   new Promise((resolve) => rl.question(query, resolve));
+
+// const main = async () => {
+//   // Placeholder values - replace with actual data
+//   const promptTemplate = `You are an AI Assistant that’s an expert at reviewing pull requests. Review the below pull request that you receive.
+
+//   Input format
+//   - The input format follows Github diff format with addition and subtraction of code.
+//   - The + sign means that code has been added.
+//   - The - sign means that code has been removed.
+
+//   Instructions
+//   - Take into account that you don’t have access to the full code but only the code diff.
+//   - Only answer on what can be improved and provide the improvement in code.
+//   - Answer in short form.
+//   - Include code snippets if necessary.
+//   - Adhere to the languages code conventions.
+//   - Make it personal and always show gratitude to the author using "@" when tagging.`;
+
+//   let examples: Example[] = await readCSV("./codereviews.csv");
+
+//   // Get a sample of 10 examples
+//   examples = examples.slice(0, 10);
+
+//   // Print number of examples
+//   console.log(`Loaded ${examples.length} examples.`);
+
+//   // Start a timer
+//   let start = Date.now();
+//   let timeElapsed = 0;
+
+//   // Step 1: Suggest eval criteria and solicit approval
+//   const evalCriteria = await generateLLMEvaluationCriteria(promptTemplate);
+//   // Pause the timer
+//   timeElapsed += Date.now() - start;
+
+//   const approval = await askQuestion(
+//     "Do you approve the suggested criteria? (y/n) ",
+//   );
+
+//   if (approval.toLowerCase() !== "y") {
+//     console.log(
+//       "Please adjust the criteria directly in the source code for now.",
+//     );
+//     return;
+//   }
+
+//   const executor = new EvaluationFunctionExecutor(
+//     promptTemplate,
+//     examples,
+//   );
+
+//   // Set the evaluation criteria
+//   executor.setEvaluationCriteria(evalCriteria);
+
+//   // Resume the timer
+//   start = Date.now();
+
+//   // Step 2: Start background task
+//   executor.start();
+
+//   //   await executor.waitForCompletion();
+
+//   //   Step 3: Present examples to grade
+//   while (true) {
+//     // Get ungraded scores
+//     const ungradedScores = executor.getUngradedScores();
+//     console.log("Ungraded Scores: ", ungradedScores);
+
+//     const nextExampleId = executor.getNextExampleToGrade();
+//     if (!nextExampleId) {
+//       console.log("All examples graded or no examples available.");
+//       break;
+//     }
+
+//     const example = examples.find((e) => e.id === nextExampleId);
+//     if (!example) continue;
+
+//     console.log(
+//       `Example ID: ${example.id}, Prompt: ${example.prompt}, Response: ${example.response}`,
+//     );
+//     const grade = await askQuestion(
+//       "Is this response acceptable? (y/n/finish) ",
+//     );
+
+//     if (grade === "finish") {
+//       break;
+//     }
+
+//     executor.setGradeForExample(example.id, grade.toLowerCase() === "y");
+//   }
+
+//   // Print grades
+//   console.log("Grades: ", executor.getGrades());
+
+//   // Step 4: Filtering and results
+//   //   await executor.waitForCompletion();
+//   const filteredFunctions = await executor.filterEvaluationFunctions(0.2);
+//   console.log("Filtered Functions: ", filteredFunctions);
+
+//   rl.close();
+// };
+
+// main().catch(console.error);
diff --git a/chainforge/react-server/src/backend/evalgen/typing.ts b/chainforge/react-server/src/backend/evalgen/typing.ts
new file mode 100644
index 000000000..e9e6cd24d
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/typing.ts
@@ -0,0 +1,78 @@
+import { ChatHistoryInfo, Dict } from "../typing";
+
+export type EvalCriteriaUID = string; // Type of the `uid` field of an EvalCriteria
+
+export interface EvalCriteria { // One criteria that LLM responses are checked against
+  shortname: string; // Very brief title for the criteria
+  criteria: string; // Description of what to check for in a response
+  eval_method: "code" | "expert"; // Evaluate with code, or by an expert
+  uid: EvalCriteriaUID;
+  priority: number; // NOTE(review): meaning and scale are not visible here; confirm
+  source?: string; // NOTE(review): presumably where the criteria came from; confirm
+}
+
+export interface EvalGenReport { // NOTE(review): no use of this type is visible here; confirm its purpose
+  criteria: EvalCriteria[];
+  failureCoverage: number; // NOTE(review): presumably a percentage, as in EvalFunctionSetReport; confirm
+  falseFailureRate: number; // NOTE(review): presumably a percentage, as in EvalFunctionSetReport; confirm
+}
+
+// Checks that a parsed JSON object has the fields of an evaluation criteria
+export function validEvalCriteriaFormat(json_obj: Dict) {
+  const hasTextFields = "criteria" in json_obj && "shortname" in json_obj;
+  if (!hasTextFields) return false;
+  const method = json_obj.eval_method;
+  return method === "code" || method === "expert";
+}
+
+export enum EvalFunctionResult { // Verdict of one evaluation function on one response
+  PASS = "pass",
+  FAIL = "fail",
+  SKIP = "skip", // NOTE(review): presumably "no verdict"; confirm against the executors
+}
+
+export interface EvalFunction { // One candidate implementation of a criteria
+  evalCriteria: EvalCriteria; // The criteria this function evaluates
+  code: string; // NOTE(review): presumably Python source or an LLM prompt, depending on eval_method; confirm
+  name: string;
+  uid: string;
+}
+
+export interface EvalFunctionReport { // How one evaluation function's results compare with the grades
+  evalFunction: EvalFunction;
+  true_pass: number; // Function passed, example graded as pass
+  true_fail: number; // Function failed, example graded as fail
+  false_pass: number; // Function passed, example graded as fail
+  false_fail: number; // Function failed, example graded as pass
+  skipped: number; // Function returned neither pass nor fail
+  alignment?: number; // F1 score of failure coverage and (1 - false failure rate)
+}
+
+export interface EvalFunctionSetReport { // Alignment of a selected set of evaluation functions with the grades
+  failureCoverage: number; // Percentage of failing-graded examples that a selected function fails
+  falseFailureRate: number; // Percentage of passing-graded examples that a selected function fails
+  selectedEvalFunctions: EvalFunction[]; // The function picked for each criteria
+  allEvalFunctionReports: Map<EvalCriteria, EvalFunctionReport[]>; // Map from criteria to function reports
+}
+
+export class EvalExecutionError extends Error {
+  constructor(message: string) {
+    super(message); // Forwards the message to Error
+    Object.setPrototypeOf(this, EvalExecutionError.prototype); // Re-attaches this class's prototype to the instance
+    this.name = "EvalExecutionError"; // Names the error after its class
+  }
+}
+
+export const AssertionWriterSystemMsg = // System message used when asking an LLM to write assertions
+  "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.";
+export const AssertionWriterSystemMsgChatHistory: ChatHistoryInfo[] = [ // The same system message, wrapped as a one-entry chat history
+  {
+    messages: [
+      {
+        role: "system",
+        content: AssertionWriterSystemMsg,
+      },
+    ],
+    fill_history: {}, // NOTE(review): presumably "no template variables filled"; confirm against ChatHistoryInfo
+  },
+];
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
new file mode 100644
index 000000000..699d8abd6
--- /dev/null
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -0,0 +1,366 @@
+// Interfaces and utility functions
+// TODO: Use ChainForge's openai utils (I tried but got errors)
+// import { AzureOpenAIStreamer } from "./oai_utils";
+import { EventEmitter } from "events";
+import {
+  AssertionWriterSystemMsg,
+  EvalCriteria,
+  EvalFunction,
+  EvalFunctionResult,
+  validEvalCriteriaFormat,
+} from "./typing";
+import { Dict, LLMResponse } from "../typing";
+import { executejs, executepy, simpleQueryLLM } from "../backend";
+import { getVarsAndMetavars, retryAsyncFunc } from "../utils";
+import { v4 as uuid } from "uuid";
+import { OpenAIStreamer } from "./oai_utils";
+import {
+  buildContextPromptForVarsMetavars,
+  buildGenEvalCodePrompt,
+} from "../../AiPopover";
+
+/**
+ * Extracts every substring fenced by "```json" and "```" in the given text. Excludes the ticks from return.
+ * @param mdText Markdown text to search (the caller passes the raw LLM output)
+ * @returns One untrimmed string per fenced block, or undefined (after logging an error) when there is no match
+ */
+function extractJSONBlocks(mdText: string): string[] | undefined {
+  const regex = /```json(.*?)```/gs; // g: find all blocks; s: let "." span newlines; lazy .*? stops at the nearest closing fence
+  const matches = mdText.match(regex);
+  if (matches)
+    return matches.map((s) => s.replace("```json", "").replace("```", "")); // string patterns replace only the first occurrence: opening fence, then closing fence
+
+  console.error("No JSON found in output.");
+  return undefined;
+}
+
+/**
+ * Asks an LLM ("gpt-4o") for evaluation criteria for the user's prompt and parses them from the ```json blocks of its reply.
+ * FUTURE: One might consider giving more contextual information, e.g. input vars to the prompt or prompt history.
+ * @param prompt The user's prompt (must be 'concrete'/escaped braces); not used when `promptTemplate` is given
+ * @param apiKeys API keys passed through to `simpleQueryLLM` (if any)
+ * @param promptTemplate If given, sent as the entire prompt in place of the built-in instructions
+ * @returns The parsed `EvalCriteria`, each assigned a fresh `uid` and a `priority` of 0
+ */
+export async function generateLLMEvaluationCriteria(
+  prompt: string,
+  apiKeys?: Dict,
+  promptTemplate?: string, // overrides prompt template used
+  systemMsg?: string | null, // overrides default system message, if present. Use null to specify empty.
+): Promise<EvalCriteria[]> {
+  // Construct the detailed prompt for the LLM
+  const detailedPrompt =
+    promptTemplate ??
+    `Here is my LLM prompt template:
+  
+  \`${prompt}\`
+    
+    Based on the instructions in the prompt that need to be followed, I want to write assertions for my LLM pipeline to run on all pipeline responses. Give me a list of 2 or 3 distinct criteria to check for in LLM responses. Each item in the list should contain a string description of a criteria to check for, and whether it should be evaluated with code or by an expert if the criteria is difficult to evaluate. Your answer should be a JSON list of objects within \`\`\`json \`\`\` markers, where each object has the following three fields: "criteria", "shortname", and "eval_method" (code or expert). At most 3 criteria should have eval_method as expert. The "criteria" should be short, and the "shortname" should be a very brief title for the criteria. Each evaluation criteria should test a concept that should evaluate to "true" in the ideal case.`;
+
+  // Query the LLM (below, we will try this up to 3 times)
+  async function _query() {
+    const result = await simpleQueryLLM(
+      detailedPrompt, // prompt
+      "gpt-4o", // llm
+      // system message: undefined → the default below, null → none, a string → used as given
+      systemMsg !== undefined
+        ? systemMsg === null
+          ? undefined
+          : systemMsg
+        : AssertionWriterSystemMsg, // system_msg
+      apiKeys, // API keys (if any)
+    );
+
+    if (result.errors && Object.keys(result.errors).length > 0)
+      throw new Error(Object.values(result.errors as Dict)[0].toString());
+
+    // Get output (text from LLM response)
+    const output = result.responses[0].responses[0];
+    // console.log("LLM said: ", output); // for debugging
+
+    // Attempt to extract JSON blocks (strings) from the LLM output
+    const json_blocks = extractJSONBlocks(output.toString());
+    if (json_blocks === undefined || json_blocks.length === 0)
+      throw new Error(
+        "EvalGen: Could not parse LLM response into evaluation critera: No JSON detected in output.",
+      );
+
+    // Parse every JSON block and merge the per-block lists into one; JSON.parse throws on malformed JSON, failing this attempt
+    const data: EvalCriteria[] = json_blocks.map((s) => JSON.parse(s)).flat(1);
+
+    // console.log("Parsed", data);
+
+    // Double-check the formatting
+    if (data.every(validEvalCriteriaFormat)) {
+      // Initialize any required properties
+      data.forEach((d) => {
+        d.uid = uuid();
+        d.priority = 0;
+      });
+      return data;
+    }
+    // Incorrect formatting
+    else
+      throw new Error(
+        "EvalGen: At least one JSON block was not in expected EvalCriteria format.",
+      );
+  }
+
+  // Retry up to 3 times; otherwise, we will throw the last encountered error.
+  return retryAsyncFunc(_query, 3);
+}
+
+/**
+ * Asks an LLM ("gpt-3.5-turbo-16k") whether one response meets the criteria text held in `evalFunction.code`.
+ * @param example Response to judge; only `responses[0]` is sent
+ * @param positiveExample Good example, quoted in the system message only when both examples have a `responses[0]`
+ * @param negativeExample Bad example; same condition as `positiveExample`
+ * @returns PASS for a "yes" verdict, FAIL for "no", SKIP (after a console warning) when neither word appears
+ * @throws Error carrying the first reported message when the query returns errors
+ */
+export async function executeLLMEval(
+  evalFunction: EvalFunction,
+  example: LLMResponse,
+  positiveExample: LLMResponse,
+  negativeExample: LLMResponse,
+): Promise<EvalFunctionResult> {
+  const evalPrompt =
+    "Evaluate the text below according to this criteria: " +
+    evalFunction.code +
+    ' Only return "yes" or "no", nothing else.\n\n```\n' +
+    example.responses[0] +
+    "\n```";
+
+  let systemMessage = "You are an expert evaluator."; // extended below with both examples, when available
+  if (
+    positiveExample &&
+    positiveExample.responses[0] &&
+    negativeExample &&
+    negativeExample.responses[0]
+  ) {
+    systemMessage +=
+      " Please consider the following good example: " +
+      positiveExample.responses[0] +
+      " and bad example: " +
+      negativeExample.responses[0] +
+      " when making your evaluation.";
+  }
+
+  const result = await simpleQueryLLM(
+    evalPrompt, // prompt
+    "gpt-3.5-turbo-16k", // llm
+    systemMessage, // system_msg
+  );
+  // Surface query failures with their own message rather than a TypeError on the missing response below
+  if (result.errors && Object.keys(result.errors).length > 0)
+    throw new Error(Object.values(result.errors as Dict)[0].toString());
+  const output = result.responses[0].responses[0].toString();
+
+  // Match whole words only, so that e.g. "not sure" or "unknown" is not read as a "no" verdict
+  if (/\byes\b/i.test(output)) {
+    return EvalFunctionResult.PASS;
+  } else if (/\bno\b/i.test(output)) {
+    return EvalFunctionResult.FAIL;
+  } else {
+    console.warn(
+      "executeLLMEval: Warning: Could not find 'yes' or 'no' in response.",
+      evalPrompt,
+      output,
+    );
+    return EvalFunctionResult.SKIP;
+  }
+}
+
+/**
+ * Runs `evalFunction.code` as a JavaScript evaluator against the "example" LLM response object, via `executejs`.
+ * @returns PASS/FAIL for a boolean result; SKIP if execution errors or the result is not a boolean (the error is logged)
+ */
+export async function execJSFunc(
+  evalFunction: EvalFunction,
+  example: LLMResponse,
+  iframe_id: string, // passed through to `executejs` as its first argument
+) {
+  try {
+    const result = await executejs(
+      iframe_id,
+      evalFunction.code,
+      [example],
+      "response",
+      "evaluator",
+    );
+
+    // Check for errors
+    if (result.error !== undefined) throw new Error(result.error);
+
+    // Extract the evaluation result: the first item of the first response, or undefined if no responses came back
+    const eval_res = result.responses
+      ? (result.responses[0] as LLMResponse).eval_res?.items[0]
+      : undefined;
+
+    // Check that the evaluation result is a boolean value
+    // NOTE: EvalGen only supports assertion functions at this time.
+    if (typeof eval_res !== "boolean")
+      throw new Error(
+        "Non-boolean return value encountered when executing JS eval code. Value: " +
+          eval_res,
+      );
+
+    return eval_res ? EvalFunctionResult.PASS : EvalFunctionResult.FAIL;
+  } catch (err) {
+    console.error(err); // every failure above, including the two explicit throws, ends up here
+    return EvalFunctionResult.SKIP;
+  }
+}
+
+/**
+ * Executes a Python function, described by evalFunction, against the "example" LLM response object.
+ * NOTE: This runs in a sandbox using pyodide.
+ * @param evalFunction `code` is the Python source; its `def <name>` is renamed to `def evaluate` before running
+ * @param positiveExample Not used by this function
+ * @param negativeExample Not used by this function
+ * @returns PASS/FAIL for a boolean result; SKIP if execution errors or the result is not a boolean (the error is logged)
+ */
+export async function execPyFunc(
+  evalFunction: EvalFunction,
+  example: LLMResponse,
+  positiveExample: LLMResponse,
+  negativeExample: LLMResponse,
+): Promise<EvalFunctionResult> {
+  try {
+    // We need to replace the function name with "evaluate", which is what is expected by backend:
+    let code = evalFunction.code.replace(
+      `def ${evalFunction.name}`,
+      "def evaluate",
+    );
+
+    // Prepend `import re` unless a line of the code already starts with that import. A plain substring
+    // test is also satisfied by `import requests` or `# import re`; a redundant import is harmless.
+    if (!/^\s*import re\b/m.test(code)) code = "import re\n" + code;
+
+    // Execute the function via pyodide
+    const result = await executepy(
+      uuid(),
+      code,
+      [example],
+      "response",
+      "evaluator",
+      undefined,
+      "pyodide", // execute in sandbox with a pyodide WebWorker
+    );
+
+    // Check for errors
+    if (result.error !== undefined) throw new Error(result.error);
+
+    // Extract the evaluation result
+    const eval_res = result.responses
+      ? (result.responses[0] as LLMResponse).eval_res?.items[0]
+      : undefined;
+
+    // Check that the evaluation result is a boolean value
+    // NOTE: EvalGen only supports assertion functions at this time.
+    if (typeof eval_res !== "boolean")
+      throw new Error(
+        "Non-boolean return value encountered when executing Python eval code. Value: " +
+          eval_res,
+      );
+
+    return eval_res ? EvalFunctionResult.PASS : EvalFunctionResult.FAIL;
+  } catch (err) {
+    console.error(err);
+    return EvalFunctionResult.SKIP;
+  }
+}
+/** Generates candidate eval functions for one criteria with "gpt-4o"; each usable one is emitted on `emitter` as a "functionGenerated" event. */
+export async function generateFunctionsForCriteria(
+  criteria: EvalCriteria,
+  promptTemplate: string,
+  example: LLMResponse,
+  emitter: EventEmitter, // receives the "functionGenerated" events (see processAndEmitFunction)
+  badExample?: LLMResponse, // optional response that fails the criteria; quoted in the generation prompt when given
+): Promise<void> {
+  const functionGenPrompt = buildFunctionGenPrompt(
+    criteria,
+    promptTemplate,
+    example,
+    badExample,
+  );
+  console.log("Function generation prompt:", functionGenPrompt);
+
+  try {
+    const streamer = new OpenAIStreamer();
+
+    streamer.on("function", (functionDefinition: string) => { // each "function" event carries one generated definition
+      processAndEmitFunction(criteria, functionDefinition, emitter);
+    });
+
+    const modelType =
+      criteria.eval_method === "expert" ? "llm_eval" : "python_fn"; // "expert" criteria get evaluator prompts, all others Python functions
+    await streamer.generate(functionGenPrompt, "gpt-4o", modelType);
+  } catch (error) {
+    console.error("Error generating function for criteria:", error); // the original error is only logged; the Error thrown below carries just the criteria text
+    throw new Error(
+      `Failed to generate function for criteria: ${criteria.criteria}`,
+    );
+  }
+}
+/** Builds the generation prompt: yes/no evaluator questions for "expert" criteria, Python assertion functions for all others. */
+function buildFunctionGenPrompt(
+  criteria: EvalCriteria,
+  promptTemplate: string,
+  example: LLMResponse, // only read in the code branch, to describe the available vars/metavars
+  badExample?: LLMResponse,
+): string {
+  let badExampleSection = ""; // stays empty when no bad example is supplied
+  if (badExample) {
+    badExampleSection = `
+    Here is an example response that DOES NOT meet the criteria:
+    \`\`\`
+    ${badExample.responses[0]}
+    \`\`\`
+    `;
+  }
+
+  if (criteria.eval_method === "expert") { // asks for 3 question wordings, returned as a JSON list of strings
+    return `Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\n, your task is to devise a prompt for an expert to evaluate the pipeline's responses based on the following criteria: ${criteria.criteria}
+    ${badExampleSection}
+    You will devise 3 prompts for the evaluation criterion to see which has the best accuracy. Each prompt you generate should be a short question that an expert can answer with a "yes" or "no" to evaluate entire criteria (don't miss anything in the criteria). Try different variations/wordings in the prompts. Return your prompts in a JSON list of strings within \`\`\`json \`\`\` markers. Each string should be a question for the expert to answer, and each question should be contained on its own line.
+    `;
+  } else { // asks for 3 Python implementations; the shared code-generation instructions come from AiPopover
+    const prompt = `Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\n, your task is to devise multiple Python assertions to evaluate LLM responses based on the criteria "${criteria.shortname}". 
+    ${badExampleSection}
+    Create 3 implementations of the criterion.
+    ${buildGenEvalCodePrompt("python", buildContextPromptForVarsMetavars(getVarsAndMetavars([example])), criteria.criteria, true)}
+    Be creative in your implementations. Our goal is to explore diverse approaches to evaluate LLM responses effectively. Try to avoid using third-party libraries for code-based evaluation methods. Include the full implementation of each function. Each function should return only True or False.`;
+
+    return prompt;
+  }
+}
+/** Wraps one generated definition in an `EvalFunction` and emits it on `emitter` as a "functionGenerated" event. */
+function processAndEmitFunction(
+  criteria: EvalCriteria,
+  functionDefinition: string,
+  emitter: EventEmitter,
+): void {
+  const evalFunction: EvalFunction = {
+    evalCriteria: criteria,
+    code: functionDefinition,
+    name: functionDefinition, // placeholder; replaced below by the Python function name for "code" criteria
+    uid: uuid(),
+  };
+
+  if (criteria.eval_method === "code") {
+    const functionNameMatch = functionDefinition.match(
+      /def\s+([a-zA-Z_]\w*)\s*\(/, // first `def <identifier>(` in the code; group 1 is the identifier
+    );
+    if (functionNameMatch) {
+      evalFunction.name = functionNameMatch[1];
+    } else {
+      console.error(
+        "Could not extract the function name from the provided code.",
+      );
+      return; // Skip emitting if no function name could be extracted
+    }
+  }
+
+  emitter.emit("functionGenerated", evalFunction);
+}

From 4ca8bd99d8106d5e94a988363078cf635286cc5f Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:25:16 -0400
Subject: [PATCH 09/35] wip transfer

---
 chainforge/react-server/src/MultiEvalNode.tsx         | 4 ++++
 chainforge/react-server/src/ResponseRatingToolbar.tsx | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 53fdafd82..e546550ab 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -20,6 +20,7 @@ import {
   Button,
   Alert,
   Tooltip,
+  Flex,
 } from "@mantine/core";
 import { useDisclosure } from "@mantine/hooks";
 import {
@@ -31,6 +32,7 @@ import {
   IconPlus,
   IconRobot,
   IconSearch,
+  IconSparkles,
   IconTerminal,
   IconTrash,
 } from "@tabler/icons-react";
@@ -57,6 +59,8 @@ import { GatheringResponsesRingProgress } from "./LLMItemButtonGroup";
 import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
+import { EvalGenReport } from "./backend/evalgen/typing";
+import { EvalGenModalRef } from "./EvalGenModal";
 
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
diff --git a/chainforge/react-server/src/ResponseRatingToolbar.tsx b/chainforge/react-server/src/ResponseRatingToolbar.tsx
index aa50880d5..2f517647d 100644
--- a/chainforge/react-server/src/ResponseRatingToolbar.tsx
+++ b/chainforge/react-server/src/ResponseRatingToolbar.tsx
@@ -30,6 +30,10 @@ const collapse_ratings = (rating_dict: RatingDict, idxs: number[]) => {
   return undefined;
 };
 
+export const extractUIDFromRatingKey = (key: string) => { // recovers the uid from a key shaped like getRatingKeyForResponse's `r.${uid}.${label_name}`
+  return key.substring(2, key.lastIndexOf(".")); // drops the leading "r." and everything from the last "." onward
+}; // NOTE(review): a label_name containing "." would leave part of the label in the result — confirm labels never contain dots
+
 export const getLabelForResponse = (uid: string, label_name: string) => {
   return StorageCache.get(getRatingKeyForResponse(uid, label_name));
 };

From 248cb9ce9e7a5b0f42584dcc3db4fd42d7fc36aa Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:28:51 -0400
Subject: [PATCH 10/35] wip transfer

---
 chainforge/react-server/src/MultiEvalNode.tsx         | 2 +-
 chainforge/react-server/src/ResponseRatingToolbar.tsx | 3 +--
 chainforge/react-server/src/backend/typing.ts         | 2 ++
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index e546550ab..c700dc267 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -852,4 +852,4 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   );
 };
 
-export default MultiEvalNode;
\ No newline at end of file
+export default MultiEvalNode;
diff --git a/chainforge/react-server/src/ResponseRatingToolbar.tsx b/chainforge/react-server/src/ResponseRatingToolbar.tsx
index 2f517647d..8e40e6e91 100644
--- a/chainforge/react-server/src/ResponseRatingToolbar.tsx
+++ b/chainforge/react-server/src/ResponseRatingToolbar.tsx
@@ -15,8 +15,7 @@ import {
 import StorageCache from "./backend/cache";
 import useStore from "./store";
 import { deepcopy } from "./backend/utils";
-
-type RatingDict = Record<number, boolean | string | undefined>;
+import { RatingDict } from "./backend/typing";
 
 export const getRatingKeyForResponse = (uid: string, label_name: string) =>
   `r.${uid}.${label_name}`;
diff --git a/chainforge/react-server/src/backend/typing.ts b/chainforge/react-server/src/backend/typing.ts
index ac4f19468..02b5691a1 100644
--- a/chainforge/react-server/src/backend/typing.ts
+++ b/chainforge/react-server/src/backend/typing.ts
@@ -276,3 +276,5 @@ export type TabularDataColType = {
 };
 
 export type PythonInterpreter = "flask" | "pyodide";
+/** Ratings keyed by a numeric index (presumably the response index — confirm in ResponseRatingToolbar.tsx, which imports this type). */
+export type RatingDict = Record<number, boolean | string | undefined>;

From 9044bb461ff10fdf61bc9e751de536b32d17e241 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:30:59 -0400
Subject: [PATCH 11/35] wip transfer

---
 chainforge/react-server/src/MultiEvalNode.tsx | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index c700dc267..8b9a1793d 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -60,7 +60,7 @@ import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
 import { EvalGenReport } from "./backend/evalgen/typing";
-import { EvalGenModalRef } from "./EvalGenModal";
+import EvalGenModal, { EvalGenModalRef } from "./EvalGenModal";
 
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
@@ -619,7 +619,9 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         ref={inspectModal}
         jsonResponses={lastResponses}
       />
-      {/* <PickCriteriaModal ref={pickCriteriaModalRef} /> */}
+
+      <EvalGenModal ref={evalGenModalRef} />
+
       <iframe style={{ display: "none" }} id={`${id}-iframe`}></iframe>
 
       {/* {evaluatorComponents} */}

From 09123ee1216c78c61c0723c5fc4fc1ece2350232 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 18:52:12 -0400
Subject: [PATCH 12/35] wip transfer

---
 chainforge/react-server/src/EvalGenModal.tsx | 100 ++++---------------
 chainforge/react-server/src/backend/utils.ts |   4 +
 2 files changed, 26 insertions(+), 78 deletions(-)

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index ae8b47126..ae13c01c3 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -240,12 +240,10 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
   const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
   const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
 
-  return (
-    <Card withBorder mb={4} radius="md" style={{ cursor: "default" }}>
-      <Card.Section withBorder pl="8px">
+  return (<Stack spacing={0} ml={8}>
         <Flex align="center">
           <Group spacing="0px">
-            {/* The arrow chevron user can click to collapse/expand */}
+            {/* The arrow chevron user can click to collapse/expand
             <Button
               color="gray"
               p={0}
@@ -259,7 +257,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               ) : (
                 <IconChevronRight size="14pt" />
               )}
-            </Button>
+            </Button> */}
 
             {/* Thumbs up/down buttons */}
             <ThumbUpDownButtons
@@ -267,7 +265,6 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               onChangeGrade={onChangeGrade}
               getGradeCount={getGradeCount}
             />
-            <Contributor getStateValue={getStateValue} />
 
             {/* Title of the criteria */}
             <TextInput
@@ -279,7 +276,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               }}
               placeholder="Criteria name"
               variant="unstyled"
-              size="sm"
+              size="md"
               ml="xs"
               className="nodrag nowheel"
               styles={{
@@ -328,59 +325,19 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               </Text>
             </Tooltip>
 
-            {/* Favorite star toggle */}
-            <Tooltip
-              label={
-                criterion.priority <= 0
-                  ? "Make this a deal-breaker"
-                  : "It's a deal-breaker"
-              }
-              withinPortal
-              withArrow
-            >
-              <Button
-                color={criterion.priority <= 0 ? "gray" : "red"}
-                m={0}
-                p={0}
-                variant="subtle"
-                onClick={() => {
-                  criterion.priority = criterion.priority <= 0 ? 1 : 0;
-                  if (onChange) onChange(criterion);
-                }}
-              >
-                {/* <IconStarFilled size="14pt" /> */}
-                <IconFlagFilled size="14pt" />
-              </Button>
-            </Tooltip>
+            {/* <Contributor getStateValue={getStateValue} /> */}
 
             {/* Delete button (and any other criterion-specific changes in the future) */}
-            <Menu withinPortal position="right-start" shadow="sm">
-              <Menu.Target>
-                <ActionIcon variant="subtle" color="gray">
-                  <IconDots style={{ width: rem(16), height: rem(16) }} />
-                </ActionIcon>
-              </Menu.Target>
-
-              <Menu.Dropdown>
-                <Menu.Item
-                  icon={<IconTrash size="14px" />}
-                  color="red"
-                  onClick={onDelete}
-                >
-                  Delete
-                </Menu.Item>
-              </Menu.Dropdown>
-            </Menu>
+            <ActionIcon variant="subtle" color="red" onClick={onDelete}>
+              <IconTrash style={{ width: rem(16), height: rem(16) }} />
+            </ActionIcon>
           </Group>
         </Flex>
-      </Card.Section>
 
-      {/* Description of the criteria */}
-      <Card.Section p="0px">
-        <Collapse in={opened}>
           <Textarea
             value={criterion.criteria}
-            placeholder="Describe here."
+            placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+            ml={38}
             onChange={(e) => {
               criterion.criteria = e.target.value;
               if (onChange) onChange(criterion);
@@ -403,9 +360,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
             mb="xs"
             c="dimmed"
           />
-        </Collapse>
-      </Card.Section>
-    </Card>
+          </Stack>
   );
 };
 
@@ -911,7 +866,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul
 
     return (
       <Modal
-        size="90%"
+        size="95%"
         keepMounted
         opened={opened}
         onClose={close}
@@ -957,11 +912,14 @@ If you determine the feedback corresponds to a new criteria, your response shoul
               </Stack>
             </Grid.Col>
             <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
+              <Center>
+                <Title order={3} ml={8} mt="sm" mb="md">Rubric</Title>
+              </Center>
+              
               <div
                 style={{
                   display: "flex",
                   flexDirection: "column",
-                  height: "100%",
                 }}
               >
                 <div style={{ flex: 2, overflowY: "auto" }}>
@@ -1009,24 +967,12 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                   ) : (
                     <></>
                   )}
-                  {/* <Center> */}
+                  
                   <div className="criteriaButtons">
-                    {/* <button
-                    onClick={() => {
-                      handleAddCriteria({
-                        shortname: "New Criteria",
-                        criteria: "",
-                        eval_method: "code",
-                        priority: 0,
-                        uid: uuid(),
-                      });
-                    }}
-                  >
-                    +
-                  </button> */}
                     <Button
                       leftIcon={<IconPencil size={14} />}
-                      variant="filled"
+                      variant="subtle"
+                      color="gray"
                       // gradient={{ from: "blue", to: "green", deg: 90 }}
                       onClick={() => {
                         handleAddCriteria({
@@ -1038,13 +984,12 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                         });
                       }}
                     >
-                      New Criteria
+                      Add a new criteria
                     </Button>
-                    {/* </Center>
-                <Center> */}
                     <Button
                       leftIcon={<IconSparkles size={14} />}
-                      variant="filled"
+                      variant="subtle"
+                      color="gray"
                       // gradient={{ from: "blue", to: "green", deg: 90 }}
                       onClick={() => {
                         generateCriteria(responses);
@@ -1052,7 +997,6 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                     >
                       Suggest Criteria
                     </Button>
-                    {/* </Center> */}
                   </div>
                 </div>
 
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index ce55d6008..ca66c078c 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -194,6 +194,10 @@ export function set_api_keys(api_keys: Dict<string>): void {
   if (key_is_present("DeepSeek")) DEEPSEEK_API_KEY = api_keys.DeepSeek;
 }
 
+export function get_openai_api_key(): string | undefined { // read-only accessor for the module-level OpenAI key
+  return OPENAI_API_KEY; // presumably assigned in set_api_keys like the other provider keys — confirm
+}
+
 export function get_azure_openai_api_keys(): [
   string | undefined,
   string | undefined,

From 4a5e9526330848b0d2d138eb720a67abd4a94eb3 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 15 Mar 2025 23:08:31 -0400
Subject: [PATCH 13/35] Add new Stepper modal WIP

---
 chainforge/react-server/src/EvalGen2Modal.tsx | 341 ++++++++++++++++++
 chainforge/react-server/src/EvalGenModal.tsx  | 205 +++++------
 chainforge/react-server/src/MultiEvalNode.tsx |  28 +-
 .../react-server/src/text-fields-node.css     |  31 ++
 4 files changed, 498 insertions(+), 107 deletions(-)
 create mode 100644 chainforge/react-server/src/EvalGen2Modal.tsx

diff --git a/chainforge/react-server/src/EvalGen2Modal.tsx b/chainforge/react-server/src/EvalGen2Modal.tsx
new file mode 100644
index 000000000..fb1eb28c6
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen2Modal.tsx
@@ -0,0 +1,341 @@
+import React, { useState } from "react";
+import {
+  Modal,
+  Button,
+  Group,
+  Stepper,
+  Title,
+  Text,
+  Card,
+  Stack,
+} from "@mantine/core";
+import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
+
+/*
+    PROPS FOR STEPPER SCREEN COMPONENTS
+ */
+interface WelcomeStepProps {
+  onNext: () => void;
+}
+
+interface FeedbackStepProps {
+  onNext: () => void;
+  onPrevious: () => void;
+  // setFeedbackData: (feedback: FeedbackItem[]) => void;
+}
+
+interface CriteriaStepProps {
+  onNext: () => void;
+  onPrevious: () => void;
+  // feedbackData: FeedbackItem[];
+  // setCriteriaData: (criteria: EvalCriteria[]) => void;
+}
+
+interface GradingStepProps {
+  onNext: () => void;
+  onPrevious: () => void;
+  // criteriaData: EvalCriteria[];
+  // setGradingData: (grades: GradeData) => void;
+}
+
+interface ResultsStepProps {
+  onPrevious: () => void;
+  onComplete: () => void;
+  // criteriaData: Criterion[];
+  // gradingData: GradeData;
+}
+
+// Main wizard component props
+interface EvalGenWizardProps {
+  opened: boolean;
+  onClose: () => void;
+  onComplete: (result: EvalGenReport) => void;
+}
+
+/*
+    STEPPER SCREEN COMPONENTS
+ */
+const WelcomeStep: React.FC<WelcomeStepProps> = ({ onNext }) => (
+  <Stack spacing="lg">
+    <Title order={2}>Welcome to the LLM Evaluation Wizard</Title>
+    <Text>
+      This wizard will guide you through creating automated evaluators for LLM
+      responses. You&apos;ll define criteria, provide feedback, and implement
+      evaluations to measure alignment.
+    </Text>
+    <Button onClick={onNext} fullWidth mt="xl">
+      Get Started
+    </Button>
+  </Stack>
+);
+
+const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
+  // State for thumbs up/down feedback and written comments
+  const [feedback, setFeedback] = useState([]);
+
+  const handleSubmit = () => {
+    // setFeedbackData(feedback);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Provide Feedback on Some LLM Responses</Title>
+
+      {/* TODO: Implement thumbs up/down feedback UI with written comments */}
+      <Text>
+        TODO: Display LLM responses with thumbs up/down controls and comment
+        field
+      </Text>
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={handleSubmit}>Continue</Button>
+      </Group>
+    </Stack>
+  );
+};
+
+const CriteriaStep: React.FC<CriteriaStepProps> = ({ onNext, onPrevious }) => {
+  // State for criteria cards
+  const [criteria, setCriteria] = useState([]);
+  const [newCriteriaText, setNewCriteriaText] = useState("");
+
+  // TODO: Use feedbackData to generate initial criteria when component mounts
+
+  const handleAddCriteria = () => {
+    // TODO: Add new criteria based on text input
+  };
+
+  const handleModifyCriteria = (uid: string, newText: string) => {
+    // TODO: Modify existing criteria
+  };
+
+  const handleRemoveCriteria = (uid: string) => {
+    // TODO: Remove criteria
+  };
+
+  const handleGenerateCriteria = () => {
+    // TODO: Generate new criteria based on user input
+  };
+
+  const handleSubmit = () => {
+    // setCriteriaData(criteria);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Define Evaluation Criteria</Title>
+      <Text>
+        Based on your feedback, we&apos;ve generated these starter criteria:
+      </Text>
+
+      {/* TODO: Implement criteria cards UI */}
+      <Text>TODO: Display criteria cards with edit/delete functionality</Text>
+
+      {/* TODO: Implement input for new criteria */}
+      <Text>TODO: Input field for adding new criteria</Text>
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={handleSubmit}>Ready to Grade!</Button>
+      </Group>
+    </Stack>
+  );
+};
+
+const GradingStep: React.FC<GradingStepProps> = ({ onNext, onPrevious }) => {
+  // State for per-criteria grades
+  const [grades, setGrades] = useState({});
+
+  // TODO: Set up grading UI for each criteria
+
+  const handleSubmit = () => {
+    // setGradingData(grades);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Grade LLM Responses By Criteria</Title>
+      <Text>Please evaluate each response according to your criteria:</Text>
+
+      {/* TODO: Implement grading UI per criteria */}
+      <Text>TODO: Display grading interface for each criteria</Text>
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={handleSubmit}>I&apos;m tired, process results</Button>
+      </Group>
+    </Stack>
+  );
+};
+
+const ResultsStep: React.FC<ResultsStepProps> = ({
+  onPrevious,
+  onComplete,
+}) => {
+  // TODO: Calculate alignment scores based on criteria and grading data
+  const alignmentScores = {};
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Evaluation Results</Title>
+      <Text>
+        Here&apos;s how well each evaluation criteria aligns with your grades:
+      </Text>
+
+      {/* TODO: Display alignment scores */}
+      <Text>TODO: Show alignment scores for each criteria</Text>
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={onComplete} color="green">
+          Done
+        </Button>
+      </Group>
+    </Stack>
+  );
+};
+
+const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
+  opened,
+  onClose,
+  onComplete,
+}) => {
+  const [active, setActive] = useState(0);
+
+  // State for data collected across steps
+  const [feedbackData, setFeedbackData] = useState([]);
+  const [criteriaData, setCriteriaData] = useState([]);
+  const [gradingData, setGradingData] = useState({});
+
+  const handleNext = () => {
+    setActive((current) => current + 1);
+  };
+
+  const handlePrevious = () => {
+    setActive((current) => current - 1);
+  };
+
+  const handleComplete = () => {
+    // Return final data to the caller
+    onComplete({
+      criteria: criteriaData,
+      failureCoverage: 0,
+      falseFailureRate: 0,
+      // grades: gradingData,
+      // alignmentScores: {} // TODO: Include actual alignment scores
+    });
+    onClose();
+  };
+
+  return (
+    <Modal
+      opened={opened}
+      onClose={onClose}
+      title="LLM Evaluation Wizard"
+      size="90%"
+      padding="md"
+      // keepMounted
+      // closeOnClickOutside={true}
+      style={{ position: "relative", left: "-5%" }}
+      styles={{
+        inner: {
+          padding: "5%", // This creates space around the modal (10% total)
+        },
+        content: {
+          height: "100%", // Fill the available space
+          maxHeight: "90vh", // Limit to 90% of viewport height
+          display: "flex",
+          flexDirection: "column",
+        },
+        body: {
+          flex: 1, // This makes the body expand to fill available space
+          overflow: "auto", // Add scrolling if content is too tall
+        },
+      }}
+    >
+      {active === 0 && <WelcomeStep onNext={handleNext} />}
+
+      {active === 1 && (
+        <FeedbackStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          // setFeedbackData={setFeedbackData}
+        />
+      )}
+
+      {active === 2 && (
+        <CriteriaStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          // feedbackData={feedbackData}
+          // setCriteriaData={setCriteriaData}
+        />
+      )}
+
+      {active === 3 && (
+        <GradingStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          // criteriaData={criteriaData}
+          // setGradingData={setGradingData}
+        />
+      )}
+
+      {active === 4 && (
+        <ResultsStep
+          onPrevious={handlePrevious}
+          onComplete={handleComplete}
+          // criteriaData={criteriaData}
+          // gradingData={gradingData}
+        />
+      )}
+
+      {/* Sticky footer */}
+      <div
+        style={{
+          position: "fixed",
+          bottom: 0,
+          background: "white",
+          padding: "10px",
+          borderTop: "1px solid #ddd",
+          width: "95%",
+        }}
+      >
+        <Stepper active={active} mb="xl">
+          <Stepper.Step label="Welcome" description="Get started">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Feedback" description="Rate some responses">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Criteria" description="Define eval criteria">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step
+            label="Grading and Generation"
+            description="Grade by criteria, while we generate implementations"
+          >
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+          <Stepper.Step label="Results" description="View alignment">
+            {/* Step content is rendered below */}
+          </Stepper.Step>
+        </Stepper>
+      </div>
+    </Modal>
+  );
+};
+
+export default EvalGenWizard;
diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index ae13c01c3..9c450dcd1 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -240,10 +240,11 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
   const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
   const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
 
-  return (<Stack spacing={0} ml={8}>
-        <Flex align="center">
-          <Group spacing="0px">
-            {/* The arrow chevron user can click to collapse/expand
+  return (
+    <Stack spacing={0} ml={8}>
+      <Flex align="center">
+        <Group spacing="0px">
+          {/* The arrow chevron user can click to collapse/expand
             <Button
               color="gray"
               p={0}
@@ -259,108 +260,108 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               )}
             </Button> */}
 
-            {/* Thumbs up/down buttons */}
-            <ThumbUpDownButtons
-              grade={grade}
-              onChangeGrade={onChangeGrade}
-              getGradeCount={getGradeCount}
-            />
-
-            {/* Title of the criteria */}
-            <TextInput
-              value={title}
-              onChange={(e) => setTitle(e.target.value)}
-              onBlur={(e) => {
-                criterion.shortname = e.target.value;
-                if (onChange) onChange(criterion);
-              }}
-              placeholder="Criteria name"
-              variant="unstyled"
-              size="md"
-              ml="xs"
-              className="nodrag nowheel"
-              styles={{
-                input: {
-                  padding: "0px",
-                  height: "14pt",
-                  minHeight: "0pt",
-                  fontWeight: 500,
-                },
-              }}
-            />
-          </Group>
-
-          <Group spacing="4px" ml="auto">
-            {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
-            <Tooltip
-              label={
-                criterion.eval_method === "code"
-                  ? "Change to an LLM evaluator"
-                  : "Change to a code evaluator"
-              }
-              withinPortal
-              withArrow
-            >
-              <Text
-                color="#999"
-                size="sm"
-                mr="6px"
-                onClick={() => {
-                  criterion.eval_method =
-                    criterion.eval_method === "code" ? "expert" : "code";
-                  if (onChange) onChange(criterion);
-                }}
-              >
-                {criterion.eval_method === "code" ? (
-                  <Flex style={{ userSelect: "none" }}>
-                    <IconTerminal2 size="14pt" />
-                    &nbsp;Python
-                  </Flex>
-                ) : (
-                  <Flex style={{ userSelect: "none" }}>
-                    <IconRobot size="14pt" />
-                    &nbsp;LLM
-                  </Flex>
-                )}
-              </Text>
-            </Tooltip>
-
-            {/* <Contributor getStateValue={getStateValue} /> */}
-
-            {/* Delete button (and any other criterion-specific changes in the future) */}
-            <ActionIcon variant="subtle" color="red" onClick={onDelete}>
-              <IconTrash style={{ width: rem(16), height: rem(16) }} />
-            </ActionIcon>
-          </Group>
-        </Flex>
+          {/* Thumbs up/down buttons */}
+          <ThumbUpDownButtons
+            grade={grade}
+            onChangeGrade={onChangeGrade}
+            getGradeCount={getGradeCount}
+          />
 
-          <Textarea
-            value={criterion.criteria}
-            placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
-            ml={38}
-            onChange={(e) => {
-              criterion.criteria = e.target.value;
+          {/* Title of the criteria */}
+          <TextInput
+            value={title}
+            onChange={(e) => setTitle(e.target.value)}
+            onBlur={(e) => {
+              criterion.shortname = e.target.value;
               if (onChange) onChange(criterion);
             }}
-            onClickCapture={(e) => e.stopPropagation()}
+            placeholder="Criteria name"
+            variant="unstyled"
+            size="md"
+            ml="xs"
+            className="nodrag nowheel"
             styles={{
               input: {
-                border: "none",
-                borderWidth: "0px",
-                margin: "0px",
-                color: "#444",
-                background: "transparent",
-                lineHeight: 1.1,
+                padding: "0px",
+                height: "14pt",
+                minHeight: "0pt",
+                fontWeight: 500,
               },
             }}
-            autosize
-            minRows={2}
-            maxRows={5}
-            fz="sm"
-            mb="xs"
-            c="dimmed"
           />
-          </Stack>
+        </Group>
+
+        <Group spacing="4px" ml="auto">
+          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+          <Tooltip
+            label={
+              criterion.eval_method === "code"
+                ? "Change to an LLM evaluator"
+                : "Change to a code evaluator"
+            }
+            withinPortal
+            withArrow
+          >
+            <Text
+              color="#999"
+              size="sm"
+              mr="6px"
+              onClick={() => {
+                criterion.eval_method =
+                  criterion.eval_method === "code" ? "expert" : "code";
+                if (onChange) onChange(criterion);
+              }}
+            >
+              {criterion.eval_method === "code" ? (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconTerminal2 size="14pt" />
+                  &nbsp;Python
+                </Flex>
+              ) : (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconRobot size="14pt" />
+                  &nbsp;LLM
+                </Flex>
+              )}
+            </Text>
+          </Tooltip>
+
+          {/* <Contributor getStateValue={getStateValue} /> */}
+
+          {/* Delete button (and any other criterion-specific changes in the future) */}
+          <ActionIcon variant="subtle" color="red" onClick={onDelete}>
+            <IconTrash style={{ width: rem(16), height: rem(16) }} />
+          </ActionIcon>
+        </Group>
+      </Flex>
+
+      <Textarea
+        value={criterion.criteria}
+        placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+        ml={38}
+        onChange={(e) => {
+          criterion.criteria = e.target.value;
+          if (onChange) onChange(criterion);
+        }}
+        onClickCapture={(e) => e.stopPropagation()}
+        styles={{
+          input: {
+            border: "none",
+            borderWidth: "0px",
+            margin: "0px",
+            color: "#444",
+            background: "transparent",
+            lineHeight: 1.1,
+          },
+        }}
+        autosize
+        minRows={2}
+        maxRows={5}
+        fz="sm"
+        mb="xs"
+        c="dimmed"
+      />
+    </Stack>
   );
 };
 
@@ -913,9 +914,11 @@ If you determine the feedback corresponds to a new criteria, your response shoul
             </Grid.Col>
             <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
               <Center>
-                <Title order={3} ml={8} mt="sm" mb="md">Rubric</Title>
+                <Title order={3} ml={8} mt="sm" mb="md">
+                  Rubric
+                </Title>
               </Center>
-              
+
               <div
                 style={{
                   display: "flex",
@@ -967,7 +970,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                   ) : (
                     <></>
                   )}
-                  
+
                   <div className="criteriaButtons">
                     <Button
                       leftIcon={<IconPencil size={14} />}
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 8b9a1793d..7f778ba4e 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -61,6 +61,7 @@ import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
 import { EvalGenReport } from "./backend/evalgen/typing";
 import EvalGenModal, { EvalGenModalRef } from "./EvalGenModal";
+import EvalGenWizard from "./EvalGen2Modal";
 
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
@@ -312,11 +313,11 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     );
   };
 
-  const evalGenModalRef = useRef<EvalGenModalRef>(null);
-  const openEvalGen = () => {
-    const resps = handlePullInputs();
-    evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
-  };
+  // const evalGenModalRef = useRef<EvalGenModalRef>(null);
+  // const openEvalGen = () => {
+  //   const resps = handlePullInputs();
+  //   evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
+  // };
 
   const onFinalReportsReady = (reports: EvalGenReport) => {
     // Placeholder for process the final reports returned from EvalGenModel
@@ -600,6 +601,16 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     }
   }, [data]);
 
+  // EvalGen Wizard
+  const [evalGenOpened, setEvalGenOpened] = useState(false);
+  const openEvalGen = useCallback(() => {
+    setEvalGenOpened(true);
+  }, []);
+  const handleEvalGenComplete = (evaluationData: EvalGenReport) => {
+    console.log("Evaluation wizard completed with data:", evaluationData);
+    // Do something with the evaluation implementations
+  };
+
   return (
     <BaseNode
       classNames="evaluator-node"
@@ -620,7 +631,12 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         jsonResponses={lastResponses}
       />
 
-      <EvalGenModal ref={evalGenModalRef} />
+      <EvalGenWizard
+        opened={evalGenOpened}
+        onClose={() => setEvalGenOpened(false)}
+        onComplete={handleEvalGenComplete}
+      />
+      {/* <EvalGenModal ref={evalGenModalRef} /> */}
 
       <iframe style={{ display: "none" }} id={`${id}-iframe`}></iframe>
 
diff --git a/chainforge/react-server/src/text-fields-node.css b/chainforge/react-server/src/text-fields-node.css
index 719cf0a96..0069509e5 100644
--- a/chainforge/react-server/src/text-fields-node.css
+++ b/chainforge/react-server/src/text-fields-node.css
@@ -1318,3 +1318,34 @@ th .content-editable-div {
   color: #999;
   margin: 0;
 }
+
+.gradeContainer {
+  position: relative;
+  width: 20px;
+}
+
+.gradeUpCount {
+  position: absolute;
+  left: 12px;
+  top: -5px;
+  font-size: x-small;
+}
+
+.gradeDownCount {
+  position: absolute;
+  left: 13px;
+  top: 13px;
+  font-size: x-small;
+}
+
+.criteriaButtons {
+  text-align: center;
+  display: flex;
+  justify-content: space-between;
+  padding-left: 50px;
+  padding-right: 50px;
+  /* gap: 100px;
+  padding: 10px;
+  column-gap: normal;
+  -moz-column-gap: 100px; */
+}

From 7d6d13c91aa4a95fa25f453e14adced98eb9d14c Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 16 Mar 2025 09:56:40 -0400
Subject: [PATCH 14/35] wip

---
 chainforge/react-server/src/EvalGen2Modal.tsx | 92 +++++++++++++++++--
 1 file changed, 85 insertions(+), 7 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen2Modal.tsx b/chainforge/react-server/src/EvalGen2Modal.tsx
index fb1eb28c6..e5914bb42 100644
--- a/chainforge/react-server/src/EvalGen2Modal.tsx
+++ b/chainforge/react-server/src/EvalGen2Modal.tsx
@@ -8,6 +8,9 @@ import {
   Text,
   Card,
   Stack,
+  Anchor,
+  List,
+  Flex,
 } from "@mantine/core";
 import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
 
@@ -56,12 +59,73 @@ interface EvalGenWizardProps {
     STEPPER SCREEN COMPONENTS
  */
 const WelcomeStep: React.FC<WelcomeStepProps> = ({ onNext }) => (
-  <Stack spacing="lg">
-    <Title order={2}>Welcome to the LLM Evaluation Wizard</Title>
+  <Stack spacing="md" m="lg" p="lg" mb={120}>
+    <Title order={2}>Welcome to the EvalGen Wizard</Title>
     <Text>
       This wizard will guide you through creating automated evaluators for LLM
-      responses. You&apos;ll define criteria, provide feedback, and implement
-      evaluations to measure alignment.
+      responses that are aligned with your preferences. You'll look at data, 
+      define what you care about, apply those criteria to grade
+      data, and refine your criteria as you see more outputs. EvalGen then
+      generates automated evaluators that implement each criteria, chooses
+      implementations most aligned with your grades, and reports how aligned
+      they are. 
+    </Text>
+    <Text>EvalGen is backed up by our{" "}
+      <Anchor
+        href="https://dl.acm.org/doi/abs/10.1145/3654777.3676450"
+        target="_blank"
+      >
+        empirical research at UIST 2024
+      </Anchor>, and is inspired by similar inductive processes in grounded theory and heuristic evaluation. Currently, Evalgen:</Text>
+    <List>
+      <List.Item>
+        Only generates <b>assertions (pass/fail tests)</b>. Numeric and categorical
+        evaluators are not included.
+      </List.Item>
+      <List.Item>
+        Asks for grades on a <b>per-criteria</b> basis on the main grading screen. This
+        is the chief difference from our paper.
+      </List.Item>
+      <List.Item>
+        Requires access to the GenAI features of ChainForge. Set up the Provider
+        you wish to use for this in your Global Settings view. The Provider must
+        be powerful enough to generate code. (By default, it is OpenAI.)
+      </List.Item>
+      <List.Item>
+        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM responses).
+      </List.Item>
+      <List.Item>
+        EvalGen will send off many requests during usage. 🔔 <b>By using Evalgen,
+        you take full responsibility for credit usage.</b>
+      </List.Item>
+    </List>
+    <Text>Currently, EvalGen does NOT:</Text>
+    <List>
+      <List.Item>
+        Work on imported spreadsheets of data (although if you are interested in
+        this, raise a Pull Request).
+      </List.Item>
+      <List.Item>
+        Generate code that uses third-party libraries. For safety, LLM-generated
+        Python code is run sandboxed in the browser with pyodide. (If your eval
+        criteria implementation must use a third-party library, we suggest you
+        use ChainForge’s genAI features on the specific eval node, outside this
+        wizard.)
+      </List.Item>
+    </List>
+    <Text>We have captured the following about your context:</Text>
+    <ul>
+      <li>…</li>
+      <li>[x] Use this info when helping me think of evaluation criteria</li>
+    </ul>
+    <Text>
+      After EvalGen finishes, the chosen evaluators appear in the MultiEval
+      node. You can export evaluator details by right-clicking the node and
+      selecting Copy Eval Specs.
+    </Text>
+    <Text>
+      EvalGen is in Beta. To improve it, provide feedback on our Github Issues
+      or Discussion pages, or raise a Pull Request with the changes.
     </Text>
     <Button onClick={onNext} fullWidth mt="xl">
       Get Started
@@ -80,7 +144,7 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
 
   return (
     <Stack spacing="lg">
-      <Title order={3}>Provide Feedback on Some LLM Responses</Title>
+      <Title order={3}>Provide Feedback on Some Model Outputs</Title>
 
       {/* TODO: Implement thumbs up/down feedback UI with written comments */}
       <Text>
@@ -243,7 +307,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     <Modal
       opened={opened}
       onClose={onClose}
-      title="LLM Evaluation Wizard"
+      title="EvalGen Wizard"
       size="90%"
       padding="md"
       // keepMounted
@@ -302,7 +366,21 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         />
       )}
 
-      {/* Sticky footer */}
+      {/* Sticky footer - button and steppers */}
+      <div
+        style={{
+          position: "fixed",
+          bottom: 106,
+          padding: "10px",
+          width: "95%",
+        }}
+      >
+        <Flex justify="space-between">
+          <Button variant="default">&lt; Back</Button>
+          <Button>Next &gt;</Button>
+        </Flex>
+        
+      </div>
       <div
         style={{
           position: "fixed",

From 0724efb0c7f41fa22147b49038bd4687a3f42951 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 16 Mar 2025 12:49:13 -0400
Subject: [PATCH 15/35] wip

---
 chainforge/react-server/src/EvalGen2Modal.tsx |  634 ++++++-
 .../react-server/src/OldEvalGenModal.js       | 1494 +++++++++++++++++
 chainforge/react-server/src/PromptNode.tsx    |    5 +-
 .../react-server/src/backend/evalgen/utils.ts |    8 +-
 chainforge/react-server/src/backend/utils.ts  |   35 +
 5 files changed, 2124 insertions(+), 52 deletions(-)
 create mode 100644 chainforge/react-server/src/OldEvalGenModal.js

diff --git a/chainforge/react-server/src/EvalGen2Modal.tsx b/chainforge/react-server/src/EvalGen2Modal.tsx
index e5914bb42..cf6e2f625 100644
--- a/chainforge/react-server/src/EvalGen2Modal.tsx
+++ b/chainforge/react-server/src/EvalGen2Modal.tsx
@@ -1,4 +1,4 @@
-import React, { useState } from "react";
+import React, { useMemo, useState } from "react";
 import {
   Modal,
   Button,
@@ -11,8 +11,42 @@ import {
   Anchor,
   List,
   Flex,
+  TextInput,
+  ScrollArea,
+  SimpleGrid,
+  Tooltip,
+  Skeleton,
+  Code,
+  Divider,
+  Checkbox,
+  Textarea,
+  Popover,
+  RingProgress,
+  Switch,
+  Accordion,
+  useMantineTheme,
 } from "@mantine/core";
-import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
+import {
+  EvalCriteria,
+  EvalFunctionReport,
+  EvalGenReport,
+} from "./backend/evalgen/typing";
+import {
+  IconCode,
+  IconRepeat,
+  IconRobot,
+  IconSparkles,
+  IconTrash,
+} from "@tabler/icons-react";
+import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
+import useStore from "./store";
+import { v4 as uuid } from "uuid";
+import Plot from "react-plotly.js";
+import { useDisclosure } from "@mantine/hooks";
+import { accuracyToColor, cmatrixTextAnnotations } from "./backend/utils";
+import { LLMResponse } from "./backend/typing";
+import { escapeBraces } from "./backend/template";
+import { StringLookup } from "./backend/cache";
 
 /*
     PROPS FOR STEPPER SCREEN COMPONENTS
@@ -30,6 +64,9 @@ interface FeedbackStepProps {
 interface CriteriaStepProps {
   onNext: () => void;
   onPrevious: () => void;
+  criteria: EvalCriteria[];
+  setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
+  genCriteriaFromContext: () => Promise<EvalCriteria[] | undefined>;
   // feedbackData: FeedbackItem[];
   // setCriteriaData: (criteria: EvalCriteria[]) => void;
 }
@@ -53,6 +90,7 @@ interface EvalGenWizardProps {
   opened: boolean;
   onClose: () => void;
   onComplete: (result: EvalGenReport) => void;
+  responses: LLMResponse[] | undefined;
 }
 
 /*
@@ -63,28 +101,31 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ onNext }) => (
     <Title order={2}>Welcome to the EvalGen Wizard</Title>
     <Text>
       This wizard will guide you through creating automated evaluators for LLM
-      responses that are aligned with your preferences. You'll look at data, 
-      define what you care about, apply those criteria to grade
-      data, and refine your criteria as you see more outputs. EvalGen then
-      generates automated evaluators that implement each criteria, chooses
-      implementations most aligned with your grades, and reports how aligned
-      they are. 
+      responses that are aligned with your preferences. You&apos;ll look at
+      data, define what you care about, apply those criteria to grade data, and
+      refine your criteria as you see more outputs. EvalGen then generates
+      automated evaluators that implement each criteria, chooses implementations
+      most aligned with your grades, and reports how aligned they are.
     </Text>
-    <Text>EvalGen is backed up by our{" "}
+    <Text>
+      EvalGen is backed up by our{" "}
       <Anchor
         href="https://dl.acm.org/doi/abs/10.1145/3654777.3676450"
         target="_blank"
       >
         empirical research at UIST 2024
-      </Anchor>, and is inspired by similar inductive processes in grounded theory and heuristic evaluation. Currently, Evalgen:</Text>
+      </Anchor>
+      , and is inspired by similar inductive processes in grounded theory and
+      heuristic evaluation. Currently, EvalGen:
+    </Text>
     <List>
       <List.Item>
-        Only generates <b>assertions (pass/fail tests)</b>. Numeric and categorical
-        evaluators are not included.
+        Only generates <b>assertions (pass/fail tests)</b>. Numeric and
+        categorical evaluators are not included.
       </List.Item>
       <List.Item>
-        Asks for grades on a <b>per-criteria</b> basis on the main grading screen. This
-        is the chief difference from our paper.
+        Asks for grades on a <b>per-criteria</b> basis on the main grading
+        screen. This is the chief difference from our paper.
       </List.Item>
       <List.Item>
         Requires access to the GenAI features of ChainForge. Set up the Provider
@@ -92,11 +133,12 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ onNext }) => (
         be powerful enough to generate code. (By default, it is OpenAI.)
       </List.Item>
       <List.Item>
-        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM responses).
+        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM
+        responses).
       </List.Item>
       <List.Item>
-        EvalGen will send off many requests during usage. 🔔 <b>By using Evalgen,
-        you take full responsibility for credit usage.</b>
+        EvalGen will send off many requests during usage. 🔔{" "}
+        <b>By using EvalGen, you take full responsibility for credit usage.</b>
       </List.Item>
     </List>
     <Text>Currently, EvalGen does NOT:</Text>
@@ -162,27 +204,401 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
   );
 };
 
-const CriteriaStep: React.FC<CriteriaStepProps> = ({ onNext, onPrevious }) => {
-  // State for criteria cards
-  const [criteria, setCriteria] = useState([]);
-  const [newCriteriaText, setNewCriteriaText] = useState("");
-
-  // TODO: Use feedbackData to generate initial criteria when component mounts
+interface CriteriaCardProps {
+  title: string;
+  description: string;
+  evalMethod: string;
+  onTitleChange?: (newTitle: string) => void;
+  onDescriptionChange?: (newDesc: string) => void;
+  onEvalMethodChange?: (newEvalMethod: string) => void;
+  onRemove?: () => void;
+  reportMode?: boolean;
+  evalFuncReport?: EvalFunctionReport;
+  onCheck?: (newChecked: boolean) => void;
+  otherFuncs?: EvalFunctionReport[];
+}
 
-  const handleAddCriteria = () => {
-    // TODO: Add new criteria based on text input
+const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
+  title,
+  description,
+  evalMethod,
+  onTitleChange,
+  onDescriptionChange,
+  onEvalMethodChange,
+  onRemove,
+  reportMode,
+  evalFuncReport,
+  onCheck,
+  otherFuncs,
+}) {
+  const [checked, setChecked] = useState(true);
+  const [codeChecked, setCodeChecked] = useState(evalMethod === "code");
+  const theme = useMantineTheme();
+
+  // Report card specific
+  const [openedCMatrix, { close: closeCMatrix, open: openCMatrix }] =
+    useDisclosure(false);
+  const [viewedCode, { close: closeViewedCode, open: openViewedCode }] =
+    useDisclosure(false);
+  const cMatrixPlot = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    const x = ["Pred.<br>fail", "Pred.<br>pass"];
+    const y = ["Human<br>pass", "Human<br>fail"];
+    const z = [
+      [evalFuncReport.false_fail, evalFuncReport.true_pass],
+      [evalFuncReport.true_fail, evalFuncReport.false_pass],
+    ];
+    return (
+      <Plot
+        data={[
+          {
+            z,
+            x,
+            y,
+            xgap: 2,
+            ygap: 2,
+            type: "heatmap",
+            colorscale: "Blues",
+            showscale: false,
+            showlegend: false,
+          },
+        ]}
+        layout={{
+          width: 160,
+          height: 160,
+          margin: { t: 10, b: 40, l: 50, r: 0 },
+          annotations: cmatrixTextAnnotations(x, y, z),
+        }}
+      />
+    );
+  }, [evalFuncReport]);
+  const reportAccuracyRing = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    return {
+      percent: Math.floor((evalFuncReport.alignment ?? 0) * 100),
+      color: accuracyToColor(evalFuncReport.alignment ?? 0),
+    };
+  }, [evalFuncReport]);
+
+  const setCheckedAndRealign = (newChecked: boolean) => {
+    setChecked(newChecked);
+
+    // oncheck is a callback to the parent to update the selected eval functions
+    // oncheck is an awaitable function
+    if (onCheck && evalFuncReport) onCheck(newChecked);
   };
 
-  const handleModifyCriteria = (uid: string, newText: string) => {
-    // TODO: Modify existing criteria
-  };
+  const unselectedImplementations = useMemo(
+    () =>
+      otherFuncs !== undefined && otherFuncs.length > 0
+        ? otherFuncs.map((item, idx) => (
+            <div key={idx}>
+              <Code style={{ whiteSpace: "pre-wrap" }}>
+                {item.evalFunction.code}
+              </Code>
+              <Divider />
+            </div>
+          ))
+        : null,
+    [otherFuncs],
+  );
 
-  const handleRemoveCriteria = (uid: string) => {
-    // TODO: Remove criteria
-  };
+  return (
+    <Card
+      shadow="sm"
+      padding="sm"
+      pl="md"
+      pb="xl"
+      radius="md"
+      withBorder
+      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
+    >
+      <div
+        // onClick={() => setChecked(!checked)}
+        onKeyUp={(e) => e.preventDefault()}
+        className="checkcard"
+      >
+        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
+          <Checkbox
+            checked={checked}
+            onChange={() => setCheckedAndRealign(!checked)}
+            tabIndex={-1}
+            size="xs"
+            mr="sm"
+            mt="xs"
+            styles={{ input: { cursor: "pointer" } }}
+            aria-hidden
+          />
+        </Tooltip>
+
+        <div style={{ width: "100%" }}>
+          <TextInput
+            value={title}
+            onChange={(e) =>
+              onTitleChange ? onTitleChange(e.currentTarget.value) : null
+            }
+            mb={7}
+            lh={1}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                padding: "0px",
+                background: "transparent",
+                fontWeight: 500,
+                fontSize: "12pt",
+                margin: "0px",
+                height: "auto",
+                minHeight: "auto",
+              },
+            }}
+          />
+
+          <Textarea
+            value={description}
+            onChange={(e) =>
+              onDescriptionChange
+                ? onDescriptionChange(e.currentTarget.value)
+                : null
+            }
+            onClickCapture={(e) => e.stopPropagation()}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                paddingTop: "0px !important",
+                paddingLeft: "0px",
+                margin: "0px",
+                color: "#444",
+                background: "transparent",
+                lineHeight: 1.1,
+              },
+            }}
+            autosize
+            minRows={2}
+            maxRows={5}
+            fz="sm"
+            mb="xs"
+            c="dimmed"
+          />
+
+          {reportMode && (
+            <Popover
+              opened={viewedCode}
+              // offset={{ crossAxis: -20 }}
+              withinPortal
+              position="bottom"
+              shadow="lg"
+              withArrow
+              width={400}
+            >
+              <Popover.Target>
+                <Text
+                  size="sm"
+                  color="gray"
+                  onMouseEnter={openViewedCode}
+                  onMouseLeave={closeViewedCode}
+                >
+                  {codeChecked ? "Python" : "LLM"}
+                </Text>
+              </Popover.Target>
+              <Popover.Dropdown>
+                <Code style={{ whiteSpace: "pre-wrap" }}>
+                  {evalFuncReport?.evalFunction.code}
+                </Code>
+              </Popover.Dropdown>
+            </Popover>
+          )}
+        </div>
+
+        {!reportMode ? (
+          <Button
+            size="xs"
+            variant="subtle"
+            compact
+            color="gray"
+            onClick={onRemove}
+            pos="absolute"
+            right="8px"
+            top="8px"
+            style={{ padding: "0px" }}
+          >
+            <IconTrash size={"95%"} />
+          </Button>
+        ) : (
+          <></>
+        )}
+
+        {reportMode && reportAccuracyRing ? (
+          <Stack spacing={0}>
+            <Popover
+              position="right"
+              opened={openedCMatrix}
+              offset={{ crossAxis: -20 }}
+              withinPortal
+              shadow="lg"
+              withArrow
+            >
+              <Popover.Target>
+                <RingProgress
+                  size={100}
+                  sections={[
+                    {
+                      value: reportAccuracyRing.percent,
+                      color: reportAccuracyRing.color,
+                    },
+                  ]}
+                  label={
+                    <Text
+                      color={reportAccuracyRing.color}
+                      weight={700}
+                      align="center"
+                      size="lg"
+                    >
+                      {`${reportAccuracyRing.percent}%`}
+                    </Text>
+                  }
+                  onMouseEnter={openCMatrix}
+                  onMouseLeave={closeCMatrix}
+                />
+              </Popover.Target>
+              <Popover.Dropdown>{cMatrixPlot}</Popover.Dropdown>
+            </Popover>
+            <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
+              Alignment with your grades
+            </Text>
+          </Stack>
+        ) : (
+          <></>
+        )}
+
+        {!reportMode ? (
+          <Switch
+            size="lg"
+            color="gray"
+            onLabel="Code"
+            offLabel="LLM"
+            pos="absolute"
+            right="8px"
+            bottom="10px"
+            checked={codeChecked}
+            onChange={(e) => {
+              setCodeChecked(e.currentTarget.checked);
+              if (onEvalMethodChange)
+                onEvalMethodChange(e.currentTarget.checked ? "code" : "expert");
+            }}
+            thumbIcon={
+              codeChecked ? (
+                <IconCode
+                  size="0.8rem"
+                  color={theme.colors.teal[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              ) : (
+                <IconRobot
+                  size="0.8rem"
+                  color={theme.colors.blue[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              )
+            }
+          />
+        ) : (
+          <></>
+        )}
+      </div>
+
+      <div>
+        {reportMode && (
+          <Accordion>
+            <Accordion.Item
+              key={"Show Bad Implementations"}
+              value={"Show Bad Implementations"}
+            >
+              <Accordion.Control>
+                <Text size="sm"> Show Bad Implementations </Text>
+              </Accordion.Control>
+              <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
+            </Accordion.Item>
+          </Accordion>
+        )}
+      </div>
+    </Card>
+  );
+};
 
-  const handleGenerateCriteria = () => {
-    // TODO: Generate new criteria based on user input
+const CriteriaStep: React.FC<CriteriaStepProps> = ({
+  onNext,
+  onPrevious,
+  criteria, 
+  setCriteria,
+  genCriteriaFromContext,
+}) => {
+  // State for criteria cards
+  const [addCriteriaValue, setAddCriteriaValue] = useState("");
+  const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+
+  // Global state
+  const apiKeys = useStore((state) => state.apiKeys);
+
+  // An estimate of how many requests the implementation executor will require (upper bound).
+  const estimatedLLMRequestsToImplement = useMemo(() => {
+    return 0; // TODO
+    // const num_llm_evals = criteria.reduce(
+    //   (acc, crit) => acc + (crit.eval_method === "expert" ? 1 : 0),
+    //   0,
+    // );
+    // // The executor sends off one query per criteria to generate 3-5 candidates each.
+    // // Each candidate LLM eval prompt will be run over all candidates.
+    // return criteria.length + num_llm_evals * 5 * samples.length;
+  }, [criteria]);
+
+  const addCriteria = () => {
+    // Nothing to expand: don't spend an LLM request on an empty description
+    if (addCriteriaValue.trim().length === 0) return;
+    // Add a loading Skeleton, then make async LLM call to expand criteria
+    setIsLoadingCriteria((num) => num + 1);
+    generateLLMEvaluationCriteria(
+      "",
+      apiKeys,
+      `I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. 
+
+CRITERIA: 
+\`\`\`
+${addCriteriaValue}
+\`\`\`
+
+Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
+      null, // system_msg
+    )
+      .then((evalCrits) => {
+        // Take only the first; if the LLM returned no usable criteria,
+        // skip it so that we never append an empty card.
+        const first = evalCrits?.[0];
+        if (!first) return;
+        setCriteria((crit) => crit.concat([{ ...first, uid: uuid() }]));
+      })
+      .catch((err) => {
+        // Log and fall through to the cleanup below
+        console.error(err);
+      })
+      .finally(() => {
+        // Remove the loading Skeleton on success and failure alike. Clamp at
+        // zero, since "Suggest more" may have reset the counter meanwhile.
+        setIsLoadingCriteria((num) => Math.max(0, num - 1));
+      });
+  };
+  const updateCriteria = (
+    newValue: string,
+    critIdx: number,
+    propName: "shortname" | "criteria" | "eval_method",
+  ) => {
+    // Copy the edited item rather than mutating the previous state in place.
+    setCriteria((crit) =>
+      crit.map((c, i) =>
+        i === critIdx && propName in c ? { ...c, [propName]: newValue } : c,
+      ),
+    );
   };
 
   const handleSubmit = () => {
@@ -193,21 +609,112 @@ const CriteriaStep: React.FC<CriteriaStepProps> = ({ onNext, onPrevious }) => {
   return (
     <Stack spacing="lg">
       <Title order={3}>Define Evaluation Criteria</Title>
-      <Text>
-        Based on your feedback, we&apos;ve generated these starter criteria:
-      </Text>
 
-      {/* TODO: Implement criteria cards UI */}
-      <Text>TODO: Display criteria cards with edit/delete functionality</Text>
+      <div>
+        <Text size="sm" pl="sm" mb="lg">
+          Select criteria that you would like to evaluate responses on. Based on
+          your chosen criteria, LLM will generate implementations of assertions.
+          Afterwards, an optional human scoring pass can better align these
+          implementations with your expectations.
+        </Text>
+
+        <Text size="sm" pl="sm" mb="lg" style={{ fontStyle: "italic" }}>
+          Note: Due to rate limits and/or cost, think carefully before selecting more than 5
+          criteria to be evaluated by LLMs.
+        </Text>
+
+        <Flex align="center" gap="lg">
+          <TextInput
+            label="Type a new criteria to add, then press Enter:"
+            value={addCriteriaValue}
+            onChange={(evt) => setAddCriteriaValue(evt.currentTarget.value)}
+            placeholder="the response is valid JSON"
+            mb="lg"
+            pl="sm"
+            pr="sm"
+            w="100%"
+            onKeyDown={(evt) => {
+              if (evt.key === "Enter") {
+                evt.preventDefault();
+                addCriteria();
+                setAddCriteriaValue("");
+              }
+            }}
+          />
+          <Button
+            variant="filled"
+            onClick={() => {
+              if (isLoadingCriteria > 0) return;
+              setIsLoadingCriteria(3);
+              // Functional update: `criteria` may be stale once this resolves
+              genCriteriaFromContext()
+                .then((c) => setCriteria((p) => (c ? p.concat(c) : p)))
+                .catch(console.error)
+                .finally(() => setIsLoadingCriteria(0));
+            }}
+          >
+            <IconRepeat />
+            <IconSparkles />
+            &nbsp;Suggest more
+          </Button>
+        </Flex>
 
-      {/* TODO: Implement input for new criteria */}
-      <Text>TODO: Input field for adding new criteria</Text>
+        <ScrollArea mih={300} h={500} mah={500}>
+          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+            {criteria.map((c, idx) => (
+              <CriteriaCard
+                title={c.shortname}
+                description={c.criteria}
+                evalMethod={c.eval_method}
+                key={`cc-${c.uid ?? idx.toString() + c.shortname}`}
+                onTitleChange={(title) =>
+                  updateCriteria(title, idx, "shortname")
+                }
+                onDescriptionChange={(desc) =>
+                  updateCriteria(desc, idx, "criteria")
+                }
+                onEvalMethodChange={(method) =>
+                  updateCriteria(method, idx, "eval_method")
+                }
+                onRemove={() =>
+                  setCriteria(criteria.filter((v, j) => j !== idx))
+                }
+              />
+            ))}
+            {isLoadingCriteria > 0 ? (
+              Array.from({ length: isLoadingCriteria }, (x, i) => (
+                <Skeleton key={`skele-card-${i}`}>
+                  <CriteriaCard
+                    title={"Loading"}
+                    description={"Loading"}
+                    evalMethod={"expert"}
+                  />
+                </Skeleton>
+              ))
+            ) : (
+              <></>
+            )}
+          </SimpleGrid>
+        </ScrollArea>
+      </div>
 
       <Group position="apart" mt="xl">
         <Button variant="default" onClick={onPrevious}>
           Back
         </Button>
-        <Button onClick={handleSubmit}>Ready to Grade!</Button>
+        <Tooltip
+          label={`Will send off up to ${estimatedLLMRequestsToImplement} requests`}
+          withArrow
+        >
+          <Button
+            variant="gradient"
+            gradient={{ from: "teal", to: "lime", deg: 105 }}
+            disabled={!criteria || criteria.length === 0}
+            onClick={handleSubmit}
+          >
+            Ready to Grade!
+          </Button>
+        </Tooltip>
       </Group>
     </Stack>
   );
@@ -275,13 +782,15 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   opened,
   onClose,
   onComplete,
+  responses,
 }) => {
   const [active, setActive] = useState(0);
 
-  // State for data collected across steps
-  const [feedbackData, setFeedbackData] = useState([]);
-  const [criteriaData, setCriteriaData] = useState([]);
-  const [gradingData, setGradingData] = useState({});
+  // Criteria across the steps
+  const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+
+  // Global state
+  const apiKeys = useStore((state) => state.apiKeys);
 
   const handleNext = () => {
     setActive((current) => current + 1);
@@ -294,7 +803,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   const handleComplete = () => {
     // Return final data to the caller
     onComplete({
-      criteria: criteriaData,
+      criteria: criteria,
       failureCoverage: 0,
       falseFailureRate: 0,
       // grades: gradingData,
@@ -303,6 +812,35 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     onClose();
   };
 
+  const getLikelyPromptTemplateAsContext = (resps: LLMResponse[]) => {
+    // Attempt to infer the prompt template used to generate the responses:
+    const prompts = new Set<string>();
+    for (const resp_obj of resps) {
+      const pt = resp_obj?.metavars?.__pt;
+      // Skip responses whose template is missing or cannot be resolved
+      const template = pt !== undefined ? StringLookup.get(pt) : undefined;
+      if (typeof template === "string") prompts.add(template);
+    }
+
+    if (prompts.size === 0) return null;
+
+    // Use the first template encountered as context
+    return escapeBraces(prompts.values().next().value ?? "");
+  };
+
+  async function genCriteriaFromContext(responses: LLMResponse[]) {
+    // Infer a prompt template from the responses, to give the LLM context
+    const context = getLikelyPromptTemplateAsContext(responses);
+    if (context !== null) {
+      // Ask an LLM to propose evaluation criteria for that template
+      return await generateLLMEvaluationCriteria(context, apiKeys);
+    }
+
+    // Without a template there is nothing to base the criteria on
+    console.error("No context found. Cannot proceed.");
+    return undefined;
+  }
+
   return (
     <Modal
       opened={opened}
@@ -343,6 +881,9 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         <CriteriaStep
           onNext={handleNext}
           onPrevious={handlePrevious}
+          criteria={criteria}
+          setCriteria={setCriteria}
+          genCriteriaFromContext={() => genCriteriaFromContext(responses ?? [])}
           // feedbackData={feedbackData}
           // setCriteriaData={setCriteriaData}
         />
@@ -379,7 +920,6 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
           <Button variant="default">&lt; Back</Button>
           <Button>Next &gt;</Button>
         </Flex>
-        
       </div>
       <div
         style={{
diff --git a/chainforge/react-server/src/OldEvalGenModal.js b/chainforge/react-server/src/OldEvalGenModal.js
new file mode 100644
index 000000000..a2e9b77ac
--- /dev/null
+++ b/chainforge/react-server/src/OldEvalGenModal.js
@@ -0,0 +1,1494 @@
+import React, {
+  forwardRef,
+  useImperativeHandle,
+  useState,
+  useMemo,
+  useEffect,
+  useCallback,
+} from "react";
+import { v4 as uuid } from "uuid";
+import Plot from "react-plotly.js";
+import {
+  SimpleGrid,
+  Card,
+  Modal,
+  Text,
+  Button,
+  UnstyledButton,
+  Textarea,
+  TextInput,
+  Flex,
+  Progress,
+  ScrollArea,
+  useMantineTheme,
+  Loader,
+  Switch,
+  Stack,
+  Box,
+  Space,
+  Center,
+  Tooltip,
+  Skeleton,
+  RingProgress,
+  Checkbox,
+  Popover,
+  Group,
+  Collapse,
+  Code,
+  Accordion,
+  Divider,
+} from "@mantine/core";
+import { useDisclosure } from "@mantine/hooks";
+import {
+  IconChevronLeft,
+  IconChevronRight,
+  IconCode,
+  IconPencil,
+  IconRepeat,
+  IconRobot,
+  IconSparkles,
+  IconThumbDown,
+  IconThumbUp,
+  IconTrash,
+} from "@tabler/icons-react";
+import ConfettiExplosion from "react-confetti-explosion";
+import {
+  cleanMetavarsFilterFunc,
+  deepcopy,
+  sampleRandomElements,
+  transformDict,
+} from "./backend/utils";
+import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
+import { escapeBraces } from "./backend/template";
+import EvaluationFunctionExecutor from "./backend/evalgen/executor";
+import {
+  extractUIDFromRatingKey,
+  getRatingKeyForResponse,
+} from "./ResponseRatingToolbar";
+import useStore from "./store";
+import { DEFAULT_LLM_EVAL_MODEL } from "./LLMEvalNode";
+import StorageCache from "./backend/cache";
+
+const MANTINE_GREEN = "#40c057";
+const SELECT_EVAL_FUNC_THRESHOLD = 0.4;
+
+// Heading text: large, medium-weight, with a left inset and a bottom
+// margin. Renders whatever children it is given.
+const HeaderText = ({ children }) => (
+  <Text size="xl" fw={500} pl="sm" mb="lg">
+    {children}
+  </Text>
+);
+
+/**
+ * Converts an EvalGen report into evaluator specs for the MultiEval node.
+ * Each selected eval function becomes either a "python" or an "llm" spec.
+ */
+const evalgenReportToImplementations = (report) =>
+  report.selectedEvalFunctions
+    // Drop eval functions whose criteria were explicitly deselected
+    .filter(({ evalCriteria }) => evalCriteria.selected !== false)
+    .map(({ evalCriteria, code }) => {
+      const name = evalCriteria.shortname;
+      if (evalCriteria.eval_method === "code") {
+        return {
+          name,
+          type: "python", // for now, only generates Python
+          state: { code },
+        };
+      }
+      // Any other eval method is run as an LLM evaluator
+      return {
+        name,
+        type: "llm",
+        state: {
+          prompt: code,
+          grader: deepcopy(DEFAULT_LLM_EVAL_MODEL),
+          format: "bin", // for now, only boolean assertions
+        },
+      };
+    });
+
+const accuracyToColor = (acc) => {
+  // Thresholds are exclusive: exactly 0.9 is "yellow", exactly 0.5 is "red"
+  if (acc > 0.9) return "green";
+  if (acc > 0.7) return "yellow";
+  return acc > 0.5 ? "orange" : "red";
+};
+
+/**
+ * Builds one Plotly text annotation per confusion-matrix cell.
+ * x and y are the axis labels; z[i][j] is the value at row y[i], column x[j].
+ */
+const cmatrixTextAnnotations = (x, y, z) => {
+  // Cells below the largest value get white text; cells holding the
+  // largest value get black text. Annotations are emitted row by row.
+  const largest = Math.max(...z.flat());
+  return y.flatMap((yLabel, i) =>
+    x.map((xLabel, j) => ({
+      xref: "x1",
+      yref: "y1",
+      x: xLabel,
+      y: yLabel,
+      text: z[i][j],
+      font: {
+        color: z[i][j] < largest ? "white" : "black",
+      },
+      showarrow: false,
+    })),
+  );
+};
+
+/** Card for one evaluation criterion: editable title/description, or,
+ *  in report mode, the chosen implementation and its alignment score. */
+const CriteriaCard = function CriteriaCard({
+  title,
+  description,
+  evalMethod,
+  onTitleChange,
+  onDescriptionChange,
+  onEvalMethodChange,
+  onRemove,
+  reportMode,
+  evalFuncReport,
+  onCheck,
+  otherFuncs,
+}) {
+  const [checked, setChecked] = useState(true);
+  const [codeChecked, setCodeChecked] = useState(evalMethod === "code");
+  const theme = useMantineTheme();
+
+  // Report card specific
+  const [openedCMatrix, { close: closeCMatrix, open: openCMatrix }] =
+    useDisclosure(false);
+  const [viewedCode, { close: closeViewedCode, open: openViewedCode }] =
+    useDisclosure(false);
+  const cMatrixPlot = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    const x = ["Pred.<br>fail", "Pred.<br>pass"];
+    const y = ["Human<br>pass", "Human<br>fail"];
+    const z = [
+      [evalFuncReport.false_fail, evalFuncReport.true_pass],
+      [evalFuncReport.true_fail, evalFuncReport.false_pass],
+    ];
+    return (
+      <Plot
+        data={[
+          {
+            z,
+            x,
+            y,
+            xgap: 2,
+            ygap: 2,
+            type: "heatmap",
+            hoverongaps: false,
+            colorscale: "Blues",
+            showscale: false,
+            showlegend: false,
+          },
+        ]}
+        layout={{
+          width: 160,
+          height: 160,
+          margin: { t: 10, b: 40, l: 50, r: 0 },
+          annotations: cmatrixTextAnnotations(x, y, z),
+        }}
+      />
+    );
+  }, [evalFuncReport]);
+  const reportAccuracyRing = useMemo(() => {
+    if (!evalFuncReport) return undefined;
+    return {
+      percent: Math.floor((evalFuncReport.alignment ?? 0) * 100),
+      color: accuracyToColor(evalFuncReport.alignment ?? 0),
+    };
+  }, [evalFuncReport]);
+
+  // (Disabled) Tick the checkbox iff the report's accuracy meets the threshold:
+  // useEffect(() => {
+  //   if (!evalFuncReport) return;
+  //   setChecked(evalFuncReport.accuracy >= SELECT_EVAL_FUNC_THRESHOLD);
+  // }, [evalFuncReport]);
+
+  const setCheckedAndRealign = (newChecked) => {
+    setChecked(newChecked);
+
+    // onCheck is a callback to the parent to update the selected eval
+    // functions; it is only invoked for cards backed by an evalFuncReport.
+    if (onCheck && evalFuncReport) onCheck(newChecked);
+  };
+
+  // Index keys are stable; a fresh uuid() per render would remount every row.
+  const unselectedImplementations =
+    otherFuncs !== undefined && otherFuncs.length > 0
+      ? otherFuncs.map((item, idx) => (
+          <div key={idx}>
+            <Code style={{ whiteSpace: "pre-wrap" }}>
+              {item.evalFunction.code}
+            </Code>
+            <Divider />
+          </div>
+        ))
+      : null;
+
+  return (
+    <Card
+      shadow="sm"
+      padding="sm"
+      pl="md"
+      pb="xl"
+      radius="md"
+      withBorder
+      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
+    >
+      <div
+        // onClick={() => setChecked(!checked)}
+        onKeyUp={(e) => e.preventDefault()}
+        className="checkcard"
+      >
+        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
+          <Checkbox
+            checked={checked}
+            onChange={() => setCheckedAndRealign(!checked)}
+            tabIndex={-1}
+            size="xs"
+            mr="sm"
+            mt="xs"
+            styles={{ input: { cursor: "pointer" } }}
+            aria-hidden
+          />
+        </Tooltip>
+
+        <div style={{ width: "100%" }}>
+          <TextInput
+            value={title}
+            onChange={(e) => onTitleChange?.(e.currentTarget.value)}
+            mb={7}
+            lh={1}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                padding: "0px",
+                background: "transparent",
+                fontWeight: 500,
+                fontSize: "12pt",
+                margin: "0px",
+                height: "auto",
+                minHeight: "auto",
+              },
+            }}
+          />
+
+          <Textarea
+            value={description}
+            onChange={(e) => onDescriptionChange?.(e.currentTarget.value)}
+            onClickCapture={(e) => e.stopPropagation()}
+            styles={{
+              input: {
+                border: "none",
+                borderWidth: "0px",
+                paddingTop: "0px !important",
+                paddingLeft: "0px",
+                margin: "0px",
+                color: "#444",
+                background: "transparent",
+                lineHeight: 1.1,
+              },
+            }}
+            autosize
+            minRows={2}
+            maxRows={5}
+            fz="sm"
+            mb="xs"
+            c="dimmed"
+          />
+
+          {reportMode && (
+            <Popover
+              opened={viewedCode}
+              // offset={{ crossAxis: -20 }}
+              withinPortal
+              position="bottom"
+              shadow="lg"
+              withArrow
+              width={400}
+            >
+              <Popover.Target>
+                <Text
+                  size="sm"
+                  color="gray"
+                  onMouseEnter={openViewedCode}
+                  onMouseLeave={closeViewedCode}
+                >
+                  {codeChecked ? "Python" : "LLM"}
+                </Text>
+              </Popover.Target>
+              <Popover.Dropdown>
+                <Code style={{ whiteSpace: "pre-wrap" }}>
+                  {evalFuncReport?.evalFunction.code}
+                </Code>
+              </Popover.Dropdown>
+            </Popover>
+          )}
+        </div>
+
+        {!reportMode ? (
+          <Button
+            size="xs"
+            variant="subtle"
+            compact
+            color="gray"
+            onClick={onRemove}
+            pos="absolute"
+            right="8px"
+            top="8px"
+            style={{ padding: "0px" }}
+          >
+            <IconTrash size={"95%"} />
+          </Button>
+        ) : (
+          <></>
+        )}
+
+        {reportMode && reportAccuracyRing ? (
+          <Stack spacing={0}>
+            <Popover
+              position="right"
+              opened={openedCMatrix}
+              offset={{ crossAxis: -20 }}
+              withinPortal
+              shadow="lg"
+              withArrow
+            >
+              <Popover.Target>
+                <RingProgress
+                  size={100}
+                  sections={[
+                    {
+                      value: reportAccuracyRing.percent,
+                      color: reportAccuracyRing.color,
+                    },
+                  ]}
+                  label={
+                    <Text
+                      color={reportAccuracyRing.color}
+                      weight={700}
+                      align="center"
+                      size="lg"
+                    >
+                      {`${reportAccuracyRing.percent}%`}
+                    </Text>
+                  }
+                  onMouseEnter={openCMatrix}
+                  onMouseLeave={closeCMatrix}
+                />
+              </Popover.Target>
+              <Popover.Dropdown>{cMatrixPlot}</Popover.Dropdown>
+            </Popover>
+            <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
+              Alignment with your grades
+            </Text>
+          </Stack>
+        ) : (
+          <></>
+        )}
+
+        {!reportMode ? (
+          <Switch
+            size="lg"
+            color="gray"
+            onLabel="Code"
+            offLabel="LLM"
+            pos="absolute"
+            right="8px"
+            bottom="10px"
+            checked={codeChecked}
+            onChange={(e) => {
+              setCodeChecked(e.currentTarget.checked);
+              if (onEvalMethodChange)
+                onEvalMethodChange(e.currentTarget.checked ? "code" : "expert");
+            }}
+            thumbIcon={
+              codeChecked ? (
+                <IconCode
+                  size="0.8rem"
+                  color={theme.colors.teal[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              ) : (
+                <IconRobot
+                  size="0.8rem"
+                  color={theme.colors.blue[theme.fn.primaryShade()]}
+                  stroke={3}
+                />
+              )
+            }
+          />
+        ) : (
+          <></>
+        )}
+      </div>
+
+      <div>
+        {reportMode && (
+          <Accordion>
+            <Accordion.Item
+              key={"Show Bad Implementations"}
+              value={"Show Bad Implementations"}
+            >
+              <Accordion.Control>
+                <Text size="sm"> Show Bad Implementations </Text>
+              </Accordion.Control>
+              <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
+            </Accordion.Item>
+          </Accordion>
+        )}
+      </div>
+    </Card>
+  );
+};
+
+/**
+ * Clickable, tinted card that shows an icon next to a title and reveals a
+ * longer description in a tooltip.
+ *
+ * The card background is `bg` with a two-character suffix appended: "44" while
+ * the pointer is over the card and "77" otherwise.
+ * NOTE(review): this presumes `bg` is a 6-digit hex color (e.g. "#a834eb") so
+ * that the suffix acts as an alpha channel; confirm against callers.
+ */
+const ChooseCard = function ChooseCard({
+  title,
+  description,
+  icon,
+  bg,
+  onClick,
+}) {
+  // Tracks whether the pointer is currently over the card.
+  const [isHovered, setIsHovered] = useState(false);
+
+  const alphaSuffix = isHovered ? "44" : "77";
+  const cardStyle = { backgroundColor: bg + alphaSuffix };
+
+  const handlePointerEnter = () => setIsHovered(true);
+  const handlePointerLeave = () => setIsHovered(false);
+
+  // Icon + title row that serves as the tooltip target.
+  const cardLabel = (
+    <Flex justify="center" gap="md">
+      <Box>{icon}</Box>
+      <Text fw={500} lh={1.2} fz="md">
+        {title}
+      </Text>
+    </Flex>
+  );
+
+  return (
+    <Card
+      withBorder
+      shadow="sm"
+      padding="lg"
+      radius="md"
+      style={cardStyle}
+      onClick={onClick}
+      onMouseEnter={handlePointerEnter}
+      onMouseLeave={handlePointerLeave}
+    >
+      <UnstyledButton className="checkcard">
+        <Tooltip
+          label={description}
+          position="bottom"
+          maw="200px"
+          multiline
+          withArrow
+          withinPortal
+        >
+          {cardLabel}
+        </Tooltip>
+      </UnstyledButton>
+    </Card>
+  );
+};
+
+/**
+ * Pop-up wizard that asks the user to pick criteria for evaluation, grade
+ * sample responses, and review the resulting report.
+ *
+ * The component takes no props of its own (`props` is unused). The parent
+ * opens it imperatively through the forwarded ref:
+ *
+ *   ref.current.trigger(inputs, onFinish)
+ *
+ * where `inputs` is the array of responses to work with and `onFinish`
+ * (optional) receives `evalgenReportToImplementations(report)` when the report
+ * screen's `onClickFinish` fires. The modal is closed right before that call.
+ */
+export const PickCriteriaModal = forwardRef(
+  function PickCriteriaModal(props, ref) {
+    const [opened, { open, close }] = useDisclosure(false);
+    const [responses, setResponses] = useState([]);
+    const apiKeys = useStore((state) => state.apiKeys);
+    const globalState = useStore((store) => store.state);
+
+    // Callback to caller when criteria implementations return
+    const [onFinish, setOnFinish] = useState(null);
+
+    // Which stage of picking + generating criteria we are in. Screens are:
+    // welcome, pick, wait, grade_first, grade, report
+    const [screen, setScreen] = useState("welcome");
+    const modalTitle = useMemo(() => {
+      if (screen === "pick") return "Pick Criteria";
+      else if (screen === "welcome") return "Welcome";
+      else if (screen === "wait") return "Collecting implementations...";
+      else if (screen === "report") return "EvalGen Report";
+      else return "Grading Responses";
+    }, [screen]);
+
+    const [criteria, setCriteria] = useState([]);
+    const [addCriteriaValue, setAddCriteriaValue] = useState("");
+    // Number of placeholder (Skeleton) cards to show while criteria are loading.
+    // Always adjusted relatively (+n / -n) so that overlapping requests cannot
+    // clobber each other's count or drive it negative.
+    const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+
+    // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
+    const [executor, setExecutor] = useState(null);
+    const [execProgress, setExecProgress] = useState(0);
+
+    // Stores report generated when executor is done
+    const [report, setReport] = useState(null);
+
+    // The samples to pass the executor / grading responses features. This will be bounded
+    // by maxNumSamplesForExecutor, instead of the whole dataset.
+    const samples = useMemo(() => {
+      // The max number of samples (responses) to pass the executor. This controls how many requests will
+      // need to be sent off and how many evaluation function executions are performed.
+      // TODO: Give the user some control over this.
+      const maxNumSamplesForExecutor = 16;
+
+      // Sample from the full set of responses, if needed:
+      if (responses.length > maxNumSamplesForExecutor)
+        return sampleRandomElements(responses, maxNumSamplesForExecutor);
+      else return responses.slice();
+    }, [responses]);
+
+    // Expands the text currently typed in the "add criteria" input into a full
+    // criteria object via an LLM call, and appends it to the list.
+    const addCriteria = () => {
+      // Nothing to expand if the user has not typed anything.
+      if (addCriteriaValue.trim().length === 0) return;
+
+      // Add a loading Skeleton
+      setIsLoadingCriteria((num) => num + 1);
+      // Make async LLM call to expand criteria
+      generateLLMEvaluationCriteria(
+        "",
+        apiKeys,
+        `I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. 
+
+CRITERIA: 
+\`\`\`
+${addCriteriaValue}
+\`\`\`
+
+Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
+        "gpt-3.5-turbo", // llm
+        null, // system_msg
+      )
+        .then((evalCrits) => {
+          // Guard against an empty result so we never add a blank card.
+          if (!Array.isArray(evalCrits) || evalCrits.length === 0) {
+            console.error("No criteria were returned for the given input.");
+            return;
+          }
+          // Take only the first
+          setCriteria((crit) =>
+            crit.concat([
+              {
+                ...evalCrits[0],
+                uid: uuid(),
+              },
+            ]),
+          );
+        })
+        .catch((err) => {
+          console.error(err);
+        })
+        // Remove the loading Skeleton whether or not the call succeeded
+        .finally(() => setIsLoadingCriteria((num) => num - 1));
+    };
+
+    // Sets property `propName` of the criteria at index `critIdx` to `newValue`.
+    // Builds new objects rather than mutating the ones held in state.
+    const updateCriteria = (newValue, critIdx, propName) => {
+      setCriteria((crit) =>
+        crit.map((c, i) =>
+          i === critIdx ? { ...c, [propName]: newValue } : c,
+        ),
+      );
+    };
+
+    // An estimate of many requests the implementation executor will require (upper bound).
+    const estimatedLLMRequestsToImplement = useMemo(() => {
+      const num_llm_evals = criteria.reduce(
+        (acc, crit) => acc + (crit.eval_method === "expert" ? 1 : 0),
+        0,
+      );
+      // The executor sends off one query per criteria to generate 3-5 candidates each.
+      // Each candidate LLM eval prompt will be run over all candidates.
+      return criteria.length + num_llm_evals * 5 * samples.length;
+    }, [criteria, samples]);
+
+    // Returns a prompt template (with braces escaped) found in the responses'
+    // `metavars.__pt`, or null if no response carries one.
+    const getLikelyPromptTemplateAsContext = useCallback(() => {
+      // Attempt to infer the prompt template used to generate the responses:
+      const prompts = new Set();
+      for (const resp_obj of responses) {
+        if (resp_obj?.metavars?.__pt !== undefined) {
+          prompts.add(resp_obj.metavars.__pt);
+        }
+      }
+
+      if (prompts.size === 0) return null;
+
+      // Use the first prompt template encountered to serve as context....
+      return escapeBraces(prompts.values().next().value);
+    }, [responses]);
+
+    // Given the context from "inputs", tries to generate an array of natural language criteria.
+    // Always resolves to an array (empty if no context is available).
+    const genCriteriaFromContext = useCallback(async () => {
+      // Get the context from the input responses
+      const inputPromptTemplate = getLikelyPromptTemplateAsContext();
+
+      if (inputPromptTemplate === null) {
+        console.error("No context found. Cannot proceed.");
+        return [];
+      }
+
+      // Attempt to generate criteria using an LLM
+      const crits = await generateLLMEvaluationCriteria(
+        inputPromptTemplate,
+        apiKeys,
+      );
+      return crits ?? [];
+    }, [getLikelyPromptTemplateAsContext, apiKeys]);
+
+    // Asks the LLM for criteria inferred from the context and appends them to
+    // the current list. Callers that want a fresh list clear it beforehand.
+    const suggestCriteriaFromContext = useCallback(() => {
+      // How many placeholder cards to show while the request is in flight.
+      const numSkeletons = 3;
+      setIsLoadingCriteria((num) => num + numSkeletons);
+      genCriteriaFromContext()
+        .then((crits) => {
+          const withUids = crits.map((c) => ({ ...c, uid: uuid() }));
+          // Functional update so criteria added while loading are not lost.
+          setCriteria((prev) => prev.concat(withUids));
+        })
+        .catch((err) => {
+          console.error(err);
+        })
+        .finally(() => setIsLoadingCriteria((num) => num - numSkeletons));
+    }, [genCriteriaFromContext]);
+
+    // Update the executor whenever samples or eval criteria changes,
+    // as long as the executor is not already running.
+    // NOTE: `executor` and `globalState` are read here but are not listed as
+    // dependencies; the effect re-runs only when samples or criteria change.
+    useEffect(() => {
+      let ex = executor;
+      if (!ex) {
+        // Instantiate executor.
+        // Get the grades from the global state, and transform the dict such that it's in {uid: grade} format.
+        const existingGrades = transformDict(
+          globalState,
+          (key) => key.startsWith("r.") && key.endsWith(".grade"),
+          extractUIDFromRatingKey,
+          (_, val) => {
+            // The grades are in { idx: grade } format. Take only the first,
+            // as we only take the first response in this iteration of EvalGen:
+            if (typeof val !== "object") return undefined;
+            const gs = Object.values(val);
+            if (gs.length === 0) return undefined;
+            return gs[0];
+          },
+        );
+
+        // Create a new EvalGen executor, passing in the samples and existing grades
+        ex = new EvaluationFunctionExecutor(
+          getLikelyPromptTemplateAsContext(),
+          samples,
+          undefined,
+          existingGrades,
+        );
+        setExecutor(ex);
+      } else if (ex.isRunning()) {
+        console.error(
+          "Executor already running. Avoiding updating it with new samples or criteria.",
+        );
+        return;
+      }
+      ex.setExamples(samples);
+      ex.setEvalCriteria(criteria);
+    }, [samples, criteria]);
+
+    // Starts generating implementations for the chosen criteria
+    const beginGenCriteriaImplementations = useCallback(async () => {
+      // Check that an executor exists (this should never be triggered)
+      if (!executor) {
+        console.error("Executor does not exist.");
+        return;
+      } else if (executor.isRunning()) {
+        console.error("Executor is already running.");
+        return;
+      }
+
+      // Start the executor in the background
+      setExecProgress(0);
+      executor.start((progress) => {
+        setExecProgress(progress?.success ?? 0);
+      });
+    }, [executor]);
+
+    // This gives the parent access to triggering the modal alert
+    const trigger = (inputs, _onFinish) => {
+      setResponses(inputs);
+      setScreen("welcome");
+      setAddCriteriaValue("");
+      setExecutor(null);
+      // Wrapped in an extra arrow because useState setters treat a function
+      // argument as an updater; we want to store the function itself.
+      setOnFinish(() => (report) => {
+        close();
+        if (_onFinish) _onFinish(evalgenReportToImplementations(report));
+      });
+      open();
+    };
+    useImperativeHandle(ref, () => ({
+      trigger,
+    }));
+
+    // Called by the "grade first" screen once the user is done with initial grading.
+    const handleInitialGradingDone = useCallback(() => {
+      setScreen("pick");
+
+      // Generate criteria
+      setCriteria([]);
+      suggestCriteriaFromContext();
+    }, [suggestCriteriaFromContext]);
+
+    // Stores the report and switches to the report screen.
+    const transitionToReport = useCallback((report) => {
+      setReport(report);
+      setScreen("report");
+    }, []);
+
+    const recomputeAlignment = async () => {
+      // Without an executor there is nothing to recompute; keep the current report.
+      if (!executor) {
+        console.error("Executor does not exist.");
+        return;
+      }
+
+      // Get selected criteria
+      const selectedCriteria = criteria.filter(
+        (c) => c.selected || c.selected === undefined,
+      );
+
+      // Pass this into executor to recompute alignment
+      const newReport = await executor.recomputeAlignment(
+        selectedCriteria,
+        report,
+      );
+
+      // Update the report
+      setReport(newReport);
+    };
+
+    // Shared between the "grade" and "grade_first" screens.
+    const gradeResponsesScreen = useMemo(
+      () => (
+        <GradeResponsesScreen
+          resps={samples}
+          executor={executor}
+          onClickDone={handleInitialGradingDone}
+          askForAnnotations={screen === "grade_first"}
+          onFinish={transitionToReport}
+          execProgress={execProgress}
+        />
+      ),
+      [
+        samples,
+        executor,
+        screen,
+        execProgress,
+        handleInitialGradingDone,
+        transitionToReport,
+      ],
+    );
+
+    return (
+      <Modal
+        size="80%"
+        opened={opened}
+        onClose={close}
+        title={
+          <div>
+            <span style={{ fontSize: "14pt" }}>{modalTitle}</span>
+          </div>
+        }
+        closeOnClickOutside={true}
+        style={{ position: "relative", left: "-5%" }}
+      >
+        {screen === "welcome" ? (
+          <div>
+            <Center>
+              <Text size="sm" pl="sm" mt="lg" mb="sm" maw="560px">
+                Welcome to EvalGen. The EvalGen wizard will generate evaluation
+                criteria and implementations for grading responses that align
+                with your expectations.
+              </Text>
+            </Center>
+            <Center>
+              <Text size="sm" pl="sm" mb="lg" maw="560px">
+                To get started, we need to specify some criteria in natural
+                language that will be used to evaluate model responses. How
+                would you like to generate criteria?
+              </Text>
+            </Center>
+            <Center>
+              <Flex justify="center" gap="lg" mt="sm" mb="lg" maw="560px">
+                <ChooseCard
+                  onClick={() => {
+                    if (isLoadingCriteria > 0) return;
+                    setScreen("pick");
+                    setCriteria([]);
+                    suggestCriteriaFromContext();
+                  }}
+                  title="Infer criteria from my context"
+                  description="An AI will look at your input prompt and context and try to infer criteria. You will still be able to review, revise, and add criteria."
+                  icon={<IconSparkles />}
+                  bg="#a834eb"
+                />
+                <ChooseCard
+                  onClick={() => {
+                    setScreen("pick");
+                    // setCriteria([]);
+                  }}
+                  title="Let me specify criteria manually"
+                  description="Enter criteria manually. An AI will generate longer descriptions for your criteria, which you can review and revise."
+                  icon={<IconPencil />}
+                  bg="#34eb74"
+                />
+                <ChooseCard
+                  onClick={() => {
+                    setScreen("grade_first");
+                    // setCriteria([]);
+                  }}
+                  title="Grade some responses first"
+                  description="Grade some responses first, to help yourself identify criteria. The AI will incorporate your grades in its criteria suggestions."
+                  icon={<IconThumbUp />}
+                  bg="#eba834"
+                />
+                {/* TODO <ChooseCard title="Chat with an AI to infer criteria" description="Chat with an AI assistant that will ask questions about your task and situation. The AI will infer some criteria and provide them as starting points." icon={<IconMessage2Bolt />} bg="#34c9eb" /> */}
+              </Flex>
+            </Center>
+          </div>
+        ) : (
+          <></>
+        )}
+
+        {screen === "pick" ? (
+          <div>
+            <Text size="sm" pl="sm" mb="lg">
+              Select criteria that you would like to evaluate responses on.
+              Based on your chosen criteria, LLM will generate implementations
+              of assertions. Afterwards, an optional human scoring pass can
+              better align these implementations with your expectations.
+            </Text>
+
+            <Text size="sm" pl="sm" mb="lg" style={{ fontStyle: "italic" }}>
+              Note: Due to rate limits, please don&apos;t select more than 3
+              criteria to be evaluated by LLMs.
+            </Text>
+
+            <Flex align="center" gap="lg">
+              <TextInput
+                label="Type a new criteria to add, then press Enter:"
+                value={addCriteriaValue}
+                onChange={(evt) => setAddCriteriaValue(evt.currentTarget.value)}
+                placeholder="the response is valid JSON"
+                mb="lg"
+                pl="sm"
+                pr="sm"
+                w="100%"
+                onKeyDown={(evt) => {
+                  if (evt.key === "Enter") {
+                    evt.preventDefault();
+                    addCriteria();
+                    setAddCriteriaValue("");
+                  }
+                }}
+              />
+              <Button
+                variant="filled"
+                onClick={() => {
+                  // Ignore clicks while a previous request is still loading.
+                  if (isLoadingCriteria > 0) return;
+                  suggestCriteriaFromContext();
+                }}
+              >
+                <IconRepeat />
+                <IconSparkles />
+                &nbsp;Suggest more
+              </Button>
+            </Flex>
+
+            <ScrollArea mih={300} h={500} mah={500}>
+              <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+                {criteria.map((c, idx) => (
+                  <CriteriaCard
+                    title={c.shortname}
+                    description={c.criteria}
+                    evalMethod={c.eval_method}
+                    key={`cc-${c.uid ?? idx.toString() + c.shortname}`}
+                    onTitleChange={(title) =>
+                      updateCriteria(title, idx, "shortname")
+                    }
+                    onDescriptionChange={(desc) =>
+                      updateCriteria(desc, idx, "criteria")
+                    }
+                    onEvalMethodChange={(method) =>
+                      updateCriteria(method, idx, "eval_method")
+                    }
+                    onRemove={() =>
+                      setCriteria((crit) => crit.filter((v, j) => j !== idx))
+                    }
+                  />
+                ))}
+                {isLoadingCriteria > 0 ? (
+                  Array.from({ length: isLoadingCriteria }, (x, i) => (
+                    <Skeleton key={`skele-card-${i}`}>
+                      <CriteriaCard
+                        title={"Loading"}
+                        description={"Loading"}
+                        evalMethod={"expert"}
+                      />
+                    </Skeleton>
+                  ))
+                ) : (
+                  <></>
+                )}
+              </SimpleGrid>
+            </ScrollArea>
+
+            <Flex justify="center" gap={12} mt="xs">
+              <Tooltip
+                label={`Will send off up to ${estimatedLLMRequestsToImplement} requests`}
+                withArrow
+              >
+                <Button
+                  onClick={() => {
+                    // Start generating implementations + transition to next screen
+                    // setScreen("wait");
+                    // For study just go right to grading
+                    setScreen("grade");
+                    beginGenCriteriaImplementations();
+
+                    // generateLLMEvaluationCriteria(
+                    //   escapeBraces(`Delete 10 words or phrases from the following paragraph that don't contribute much to its meaning, but keep readability:
+                    // "{paragraph}"
+
+                    // Please do not add any new words or change words, only delete words.`),
+                    // ).then(setCriteria);
+                  }}
+                  variant="gradient"
+                  gradient={{ from: "teal", to: "lime", deg: 105 }}
+                  disabled={!criteria || criteria.length === 0}
+                >
+                  <IconSparkles />
+                  &nbsp;I&apos;m done. Implement it!
+                </Button>
+              </Tooltip>
+            </Flex>
+          </div>
+        ) : (
+          <></>
+        )}
+
+        {screen === "wait" ? (
+          <div>
+            <Stack justify="center" align="center" h={500}>
+              <Text mb={0}>Collecting...</Text>
+              <Loader size="lg" />
+              <Text color="gray" size="sm">
+                This may take a while.
+              </Text>
+
+              <Space h="lg" />
+              <Button
+                onClick={() => setScreen("grade")}
+                size="lg"
+                variant="gradient"
+                gradient={{ from: "teal", to: "lime", deg: 105 }}
+              >
+                <IconSparkles />
+                &nbsp;Grade Responses While You Wait
+              </Button>
+              <Text ml="lg" lh={1.2} w={380} color="gray">
+                Grading helps us choose implementations that better align with
+                your expectations. 📈
+              </Text>
+            </Stack>
+          </div>
+        ) : (
+          <></>
+        )}
+
+        {screen === "grade" ? gradeResponsesScreen : <></>}
+        {screen === "grade_first" ? (
+          <div>
+            <Center>
+              <Text size="md" pl="sm" mt="lg" mb="sm" maw="80%">
+                Grade at least 5 responses. You can use the arrows to skip
+                responses. Try to get a good sample of good (thumbs up) and bad
+                (thumbs down) examples.
+                {/* Welcome to EvalGen. We&apos;ve learned that grading responses
+                helps you decide your criteria. So, before AI can help you
+                generate evaluators,{" "}
+                <span style={{ fontWeight: 800 }}>
+                  we ask you to grade at least 5 responses
+                </span>
+                . The EvalGen wizard will then generate evaluation criteria and
+                implementations for grading responses that align with your
+                expectations. */}
+              </Text>
+            </Center>
+            <hr />
+            {gradeResponsesScreen}
+          </div>
+        ) : (
+          <></>
+        )}
+
+        {screen === "report" ? (
+          <ReportCardScreen
+            report={report}
+            recomputeAlignment={recomputeAlignment}
+            onClickFinish={(report) => {
+              // onFinish is only set once trigger() has been called.
+              if (onFinish) onFinish(report);
+            }}
+          />
+        ) : (
+          <></>
+        )}
+      </Modal>
+    );
+  },
+);
+
+// Screen where the user grades responses.
+export const GradeResponsesScreen = forwardRef(function GradeResponsesScreen(
+  { resps, executor, onClickDone, askForAnnotations, onFinish, execProgress },
+  ref,
+) {
+  // Confetti effects
+  const [isGreenExploding, setIsGreenExploding] = React.useState(false);
+  const [isRedExploding, setIsRedExploding] = React.useState(false);
+
+  const [responses, setResponses] = useState([]);
+  const [shownResponse, setShownResponse] = useState(undefined);
+  const [pastShownResponses, setPastShownResponses] = useState([]);
+  const [shownResponseIdx, setShownResponseIdx] = useState(0);
+  const [grades, setGrades] = useState({});
+
+  const showProgressType = useMemo(
+    () => (executor ? "grade" : "num_graded"),
+    [executor],
+  );
+  const [minNumGrade, setMinNumGrade] = useState(5);
+  const numGraded = useMemo(() => Object.keys(grades).length, [grades]);
+
+  const [promptReasoning, setPromptReasoning] = useState(null);
+  const [annotation, setAnnotation] = useState(undefined);
+
+  // For updating the global human ratings state
+  const setState = useStore((store) => store.setState);
+  const updateGlobalRating = useCallback(
+    (uid, label, payload) => {
+      const key = getRatingKeyForResponse(uid, label);
+      const safe_payload = deepcopy(payload);
+      setState(key, safe_payload);
+      StorageCache.store(key, safe_payload);
+    },
+    [setState],
+  );
+
+  const bottomBar = useMemo(() => {
+    const bar = {};
+    if (showProgressType === "num_graded") {
+      bar.progressPerc = Math.min((numGraded / minNumGrade) * 100, 100);
+      bar.progressLabel = `${numGraded} / ${minNumGrade} graded`;
+      bar.buttonLabel = bar.progressPerc < 100 ? "Keep grading!" : "Next Step";
+      bar.buttonDisabled = bar.progressPerc < 100;
+      bar.buttonStyle = "filled";
+    } else {
+      bar.progressPerc = Math.min(execProgress, 100);
+      bar.progressLabel = "Generating and selecting implementations...";
+      bar.buttonLabel = bar.progressPerc < 99.5 ? "I'm tired 😴" : "Done";
+      bar.buttonDisabled = false;
+      bar.buttonStyle = bar.progressPerc < 99.5 ? "outline" : "filled";
+    }
+    return bar;
+  }, [showProgressType, numGraded, minNumGrade, execProgress]);
+
+  const responseText = useMemo(() =>
+    shownResponse && shownResponse.responses?.length > 0
+      ? shownResponse.responses[0]
+      : "",
+  );
+  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
+  const varsDivs = useMemo(() => {
+    const combined_vars_metavars = shownResponse
+      ? {
+          ...shownResponse.vars,
+          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
+        }
+      : {};
+    return Object.entries(combined_vars_metavars).map(([varname, val]) => (
+      <div key={varname} className="grade-resp-var-container">
+        <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
+        <span className="response-var-value linebreaks">{val}</span>
+      </div>
+    ));
+  }, [shownResponse]);
+
+  // Goto next response in the queue (skipping grading the current one)
+  const nextResponse = () => {
+    if (responses.length === 0) return;
+
+    // Update annotation for current response (if any)
+    // TODO: Fix this for generate case when num resp per prompt > 1
+    if (
+      shownResponse &&
+      annotation &&
+      typeof annotation === "string" &&
+      annotation.trim().length > 0
+    ) {
+      // console.log("setting annotation for resp", shownResponse.uid, annotation);
+      updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
+      setAnnotation(null);
+    }
+    setPromptReasoning(null);
+
+    if (shownResponseIdx < pastShownResponses.length - 1) {
+      // If we are not at the end of the history of shown responses, then show the next response:
+      setShownResponse(pastShownResponses[shownResponseIdx + 1]);
+      setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
+    } else {
+      // We are at the end of the history; pick the next response off the stack:
+      // TODO: Make this unique (maybe by removing picked responses from the list!)
+      let num_tries = 3;
+      let next_resp = executor?.getNextExampleToGrade();
+      while (
+        num_tries > 0 &&
+        (!next_resp || pastShownResponses.some((r) => r.uid === next_resp.uid))
+      ) {
+        // We're presenting a response that's already been shown. Try again.
+        // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
+        if (next_resp && num_tries === 3)
+          next_resp =
+            executor?.getNextExampleToGrade() ??
+            sampleRandomElements(responses, 1)[0];
+        // Otherwise we just choose a response at random:
+        else next_resp = sampleRandomElements(responses, 1)[0];
+        num_tries -= 1;
+      }
+      // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
+      // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
+      setShownResponse(next_resp);
+      setPastShownResponses(pastShownResponses.concat(next_resp));
+      setShownResponseIdx(pastShownResponses.length);
+    }
+  };
+
+  // Go back to previously shown response
+  const prevResponse = () => {
+    if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
+    setShownResponse(pastShownResponses[shownResponseIdx - 1]);
+    setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
+  };
+
+  // Update responses to draw from, when passed by external source
+  const updateResponsePool = (inputs) => {
+    if (!inputs) return;
+
+    setResponses(inputs);
+
+    // Choose the first response to display to the user
+    if (inputs?.length > 0) {
+      const random_resp = sampleRandomElements(inputs, 1)[0];
+      setShownResponse(random_resp);
+      setPastShownResponses([random_resp]);
+      setShownResponseIdx(0);
+      setGrades({});
+    }
+  };
+
+  const handleDone = useCallback(async () => {
+    if (showProgressType === "num_graded") {
+      if (onClickDone) onClickDone();
+    } else {
+      // Await completion of all gen + execution of eval funcs
+      await executor?.waitForCompletion();
+
+      // Filtering eval funcs by grades and present results
+      const filteredFunctions = await executor?.filterEvaluationFunctions(0.25);
+      console.log("Filtered Functions: ", filteredFunctions);
+
+      // Return selected implementations to caller
+      if (onFinish) onFinish(filteredFunctions);
+    }
+  }, [executor, showProgressType]);
+
+  const updateGrade = (idx, uid, grade) => {
+    grades[idx] = grade;
+    setGrades({ ...grades });
+    executor?.setGradeForExample(uid, grade);
+    updateGlobalRating(uid, "grade", { 0: grade });
+  };
+
+  const handleClickGradeButton = (isGoodResponse) => {
+    updateGrade(shownResponseIdx, shownResponse.uid, isGoodResponse);
+    const explodeFunc = isGoodResponse
+      ? setIsGreenExploding
+      : setIsRedExploding;
+    explodeFunc(true);
+    if (isGoodResponse) {
+      // Don't ask for clarification if it's a good grade
+      setTimeout(() => explodeFunc(false), 1200);
+      setTimeout(nextResponse, 800);
+    } else {
+      // If they gave a bad grade, ask them why
+      setTimeout(() => explodeFunc(false), 1200);
+      setTimeout(() => {
+        if (askForAnnotations) setPromptReasoning(true);
+        else nextResponse();
+      }, 800);
+    }
+  };
+
+  // Update responses whenever upstream changes
+  useEffect(() => {
+    updateResponsePool(resps);
+  }, [resps]);
+
+  return (
+    <Stack justify="space-between" mih={500}>
+      <Box>
+        <Flex justify="center">
+          {shownResponseIdx in grades ? (
+            grades[shownResponseIdx] ? (
+              <HeaderText>
+                You chose&nbsp;
+                <IconThumbUp color="green" style={{ marginBottom: "-3px" }} />!
+              </HeaderText>
+            ) : (
+              <HeaderText>
+                You chose&nbsp;
+                <IconThumbDown color="red" style={{ marginBottom: "-6px" }} />!
+              </HeaderText>
+            )
+          ) : (
+            <HeaderText>
+              Is this response&nbsp;
+              <IconThumbUp style={{ marginBottom: "-3px" }} />
+              &nbsp;or&nbsp;
+              <IconThumbDown style={{ marginBottom: "-6px" }} />
+              &nbsp;?
+            </HeaderText>
+          )}
+        </Flex>
+
+        <Flex justify="center" align="center" mb="sm">
+          <Button variant="white" color="dark" onClick={prevResponse}>
+            <IconChevronLeft />
+          </Button>
+          <div
+            className="response-box"
+            style={{
+              backgroundColor: "#eee",
+              width: "80%",
+              maxHeight: "340px",
+              overflowY: "scroll",
+              borderColor: "black",
+              borderStyle: "solid",
+            }}
+          >
+            <div className="response-item-llm-name-wrapper">
+              <div
+                className="small-response"
+                style={{ fontSize: "11pt", padding: "12pt" }}
+              >
+                {responseText}
+              </div>
+            </div>
+          </div>
+          <Button variant="white" color="dark" onClick={nextResponse}>
+            <IconChevronRight />
+          </Button>
+        </Flex>
+
+        <Flex justify="center" mb="xl" gap="lg">
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "31%",
+              borderRadius: "12px",
+              borderWidth: "1px",
+              borderStyle: "solid",
+            }}
+          >
+            Vars
+            <hr />
+            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
+              {varsDivs}
+            </div>
+          </div>
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "41%",
+              borderRadius: "2px",
+            }}
+          >
+            Prompt
+            <hr />
+            <div
+              className="monofont linebreaks"
+              style={{
+                maxHeight: "160px",
+                overflowY: "scroll",
+                fontSize: "10pt",
+                lineHeight: "1.2",
+              }}
+            >
+              {prompt}
+            </div>
+          </div>
+        </Flex>
+
+        {promptReasoning === null ? (
+          <Flex justify="center" gap="50px" mb="xl">
+            <Button
+              color="red"
+              variant="filled"
+              onClick={() => {
+                handleClickGradeButton(false);
+              }}
+            >
+              <IconThumbDown />
+              &nbsp;Bad!
+              <>
+                {isRedExploding && (
+                  <ConfettiExplosion
+                    zIndex={1000}
+                    colors={["#f00"]}
+                    force={0.1}
+                    height={300}
+                    width={200}
+                    particleCount={5}
+                    duration={2200}
+                    onComplete={() => setIsRedExploding(false)}
+                    style={{ position: "absolute", left: "50%", top: "20%" }}
+                  />
+                )}
+              </>
+            </Button>
+            <Button
+              color="green"
+              variant="filled"
+              onClick={() => {
+                handleClickGradeButton(true);
+              }}
+            >
+              <IconThumbUp />
+              &nbsp;Good!
+              <>
+                {isGreenExploding && (
+                  <ConfettiExplosion
+                    zIndex={1000}
+                    colors={[MANTINE_GREEN]}
+                    force={0.9}
+                    height={300}
+                    width={300}
+                    particleCount={10}
+                    duration={2200}
+                    onComplete={() => setIsGreenExploding(false)}
+                    style={{ position: "absolute", left: "50%", top: "20%" }}
+                  />
+                )}
+              </>
+            </Button>
+          </Flex>
+        ) : (
+          <Center>
+            <Stack spacing="xs">
+              <Text>What&apos;s the reason for your score?</Text>
+              <Flex align="center" gap="lg">
+                <Textarea
+                  value={annotation}
+                  onChange={(e) => setAnnotation(e.currentTarget.value)}
+                  autoFocus
+                  onKeyDown={(e) => {
+                    if (e.key === "Enter") {
+                      e.preventDefault();
+                      nextResponse();
+                    }
+                  }}
+                />
+                <Button onClick={nextResponse} w={100}>
+                  {!annotation ? "Skip" : "Continue"}
+                </Button>
+              </Flex>
+            </Stack>
+          </Center>
+        )}
+      </Box>
+
+      <Flex justify="left" align="center" gap="md">
+        {/* <Progress size={18} w='100%' sections={[{ value: 30, color: 'blue', label: '3/10 graded', tooltip: 'Samples graded' }]} /> */}
+        {/* <Loader size='sm' /> */}
+        <Stack w="100%" spacing={4}>
+          <Text color="#aaa" size="sm">
+            {bottomBar.progressLabel}
+          </Text>
+          <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
+        </Stack>
+
+        <Button
+          onClick={handleDone}
+          variant={bottomBar.buttonStyle}
+          disabled={bottomBar.buttonDisabled}
+        >
+          {bottomBar.buttonLabel}
+        </Button>
+      </Flex>
+    </Stack>
+  );
+});
+
+// Report screen shown once EvalGen finishes: overall alignment numbers plus one
+// card per selected eval function. Renders nothing when `report` is falsy.
+const ReportCardScreen = ({ report, recomputeAlignment, onClickFinish }) => {
+  // The criteria cards, now with report information
+  const cards = useMemo(() => {
+    const res = [];
+    // Build one card per selected eval function; the memo runs before the
+    // `report &&` guard below, so a missing report must yield no cards.
+    for (const selectedFunc of report?.selectedEvalFunctions ?? []) {
+      const crit = selectedFunc.evalCriteria;
+      // Reports for this criteria; empty if the map has no entry for it
+      const critEvalFuncReports = report.allEvalFunctionReports.get(crit) ?? [];
+      const evalFuncReport = critEvalFuncReports.find(
+        (rep) => rep.evalFunction === selectedFunc,
+      );
+
+      // Get the functions that were not selected for this criteria
+      const otherFuncs = critEvalFuncReports.filter(
+        (rep) => rep.evalFunction !== selectedFunc,
+      );
+
+      res.push(
+        <CriteriaCard
+          title={crit.shortname}
+          description={crit.criteria}
+          evalMethod={crit.eval_method}
+          key={`cc-${crit.uid ?? res.length.toString() + crit.shortname}`}
+          reportMode={true}
+          evalFuncReport={evalFuncReport} // undefined if none was chosen
+          otherFuncs={otherFuncs}
+          onCheck={(checked) => {
+            crit.selected = checked;
+            recomputeAlignment();
+          }}
+        />,
+      );
+    }
+
+    return res;
+  }, [report]);
+
+  return (
+    report && (
+      <div>
+        <Text align="center" size="lg" pl="sm" mb="lg">
+          Chosen Functions and Alignment
+        </Text>
+
+        {/* Show coverage and false failure rate numbers */}
+        <Flex justify="center" gap="md" mb="lg">
+          <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
+            <Card
+              shadow="sm"
+              padding="md"
+              radius="md"
+              style={{ backgroundColor: "#f0f0f0" }}
+            >
+              <Text weight={500} size="md">
+                Coverage of Bad Responses
+              </Text>
+              <Text color="blue" weight={700} size="md">
+                {report.failureCoverage.toFixed(2)}%
+              </Text>
+            </Card>
+            <Card
+              shadow="sm"
+              padding="md"
+              radius="md"
+              style={{ backgroundColor: "#f0f0f0" }}
+            >
+              <Text weight={500} size="md">
+                False Failure Rate
+              </Text>
+              <Text color="red" weight={700} size="md">
+                {report.falseFailureRate.toFixed(2)}%
+              </Text>
+            </Card>
+          </Group>
+        </Flex>
+
+        <ScrollArea mih={300} h={500} mah={500}>
+          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+            {cards}
+          </SimpleGrid>
+        </ScrollArea>
+
+        <Flex justify="center" gap={12} mt="xs">
+          <Button onClick={() => onClickFinish(report)}>
+            Finish with selected evaluators
+          </Button>
+        </Flex>
+      </div>
+    )
+  );
+};
diff --git a/chainforge/react-server/src/PromptNode.tsx b/chainforge/react-server/src/PromptNode.tsx
index ce811a0c0..91cd2653c 100644
--- a/chainforge/react-server/src/PromptNode.tsx
+++ b/chainforge/react-server/src/PromptNode.tsx
@@ -1134,7 +1134,10 @@ Soft failing by replacing undefined with empty strings.`,
                   o.metavars = resp_obj.metavars ?? {};
 
                   // Add a metavar for the prompt *template* in this PromptNode
-                  // o.metavars.__pt = prompt_template;
+                  o.metavars.__pt =
+                    typeof prompt_template === "string"
+                      ? prompt_template
+                      : prompt_template[0];
 
                   // Carry over any chat history
                   if (resp_obj.chat_history)
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 699d8abd6..9586c1ebe 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -11,7 +11,7 @@ import {
 } from "./typing";
 import { Dict, LLMResponse } from "../typing";
 import { executejs, executepy, simpleQueryLLM } from "../backend";
-import { getVarsAndMetavars, retryAsyncFunc } from "../utils";
+import { getVarsAndMetavars, llmResponseDataToString, retryAsyncFunc } from "../utils";
 import { v4 as uuid } from "uuid";
 import { OpenAIStreamer } from "./oai_utils";
 import {
@@ -75,11 +75,11 @@ export async function generateLLMEvaluationCriteria(
       throw new Error(Object.values(result.errors as Dict)[0].toString());
 
     // Get output (text from LLM response)
-    const output = result.responses[0].responses[0];
+    const output = llmResponseDataToString(result.responses[0].responses[0]);
     // console.log("LLM said: ", output); // for debuggging
 
     // Attempt to extract JSON blocks (strings) from input
-    const json_blocks = extractJSONBlocks(output.toString());
+    const json_blocks = extractJSONBlocks(output);
     if (json_blocks === undefined || json_blocks.length === 0)
       throw new Error(
         "EvalGen: Could not parse LLM response into evaluation critera: No JSON detected in output.",
@@ -151,7 +151,7 @@ export async function executeLLMEval(
     systemMessage, // system_msg
   );
   // Get the output
-  const output = result.responses[0].responses[0].toString();
+  const output = llmResponseDataToString(result.responses[0].responses[0]);
 
   // Parse the response to determine the boolean value to return
   if (output.toLowerCase().includes("yes")) {
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index ca66c078c..823bda765 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -51,6 +51,7 @@ import {
 } from "@mirai73/bedrock-fm";
 import StorageCache, { StringLookup } from "./cache";
 import Compressor from "compressorjs";
+import { Annotations } from "plotly.js";
 // import { Models } from "@mirai73/bedrock-fm/lib/bedrock";
 
 const ANTHROPIC_HUMAN_PROMPT = "\n\nHuman:";
@@ -2453,3 +2454,37 @@ export const ensureUniqueName = (_name: string, _prev_names: string[]) => {
   }
   return new_name;
 };
+
+/** Bucket an accuracy score into a color name; each threshold is exclusive. */
+export const accuracyToColor = (acc: number) => {
+  if (acc > 0.9) return "green";
+  if (acc > 0.7) return "yellow";
+  return acc > 0.5 ? "orange" : "red";
+};
+
+// Builds one Plotly text annotation per cell of matrix z (rows y, columns x).
+export const cmatrixTextAnnotations = (
+  x: string[],
+  y: string[],
+  z: number[][],
+) => {
+  const annotations = [];
+  // Text color flips at half the largest cell value; comparing against the
+  // maximum itself would give every cell below the max the same color.
+  const midVal = Math.max(...z.flat()) / 2;
+  for (let i = 0; i < y.length; i++) {
+    for (let j = 0; j < x.length; j++) {
+      annotations.push({
+        xref: "x1",
+        yref: "y1",
+        x: x[j],
+        y: y[i],
+        text: z[i][j].toString(),
+        font: { color: z[i][j] < midVal ? "white" : "black" },
+        showarrow: false,
+      });
+    }
+  }
+  return annotations as Partial<Annotations>[];
+};

From 04fe78ea6d1c2074c63c43d88f9c944c5f79de73 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 16 Mar 2025 19:53:12 -0400
Subject: [PATCH 16/35] wip

---
 .../src/EvalGen/EvalGenWizard.tsx             | 206 ++++++++
 .../react-server/src/EvalGen/FeedbackStep.tsx |  34 ++
 .../src/EvalGen/GradeResponsesStep.tsx        |  47 ++
 .../react-server/src/EvalGen/GradingView.tsx  | 219 +++++++++
 .../PickCriteriaStep.tsx}                     | 444 +-----------------
 .../src/EvalGen/ReportCardStep.tsx            |  41 ++
 .../react-server/src/EvalGen/WelcomeStep.tsx  |  84 ++++
 chainforge/react-server/src/MultiEvalNode.tsx |   4 +-
 .../react-server/src/backend/evalgen/utils.ts |   6 +-
 9 files changed, 663 insertions(+), 422 deletions(-)
 create mode 100644 chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
 create mode 100644 chainforge/react-server/src/EvalGen/FeedbackStep.tsx
 create mode 100644 chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
 create mode 100644 chainforge/react-server/src/EvalGen/GradingView.tsx
 rename chainforge/react-server/src/{EvalGen2Modal.tsx => EvalGen/PickCriteriaStep.tsx} (57%)
 create mode 100644 chainforge/react-server/src/EvalGen/ReportCardStep.tsx
 create mode 100644 chainforge/react-server/src/EvalGen/WelcomeStep.tsx

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
new file mode 100644
index 000000000..a51dd20d1
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -0,0 +1,206 @@
+import React, { useCallback, useState } from "react";
+import { EvalCriteria, EvalGenReport } from "../backend/evalgen/typing";
+import { LLMResponse } from "../backend/typing";
+import useStore from "../store";
+import { escapeBraces } from "../backend/template";
+import { StringLookup } from "../backend/cache";
+import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
+import { Button, Flex, Modal, Stepper } from "@mantine/core";
+import WelcomeStep from "./WelcomeStep";
+import FeedbackStep from "./FeedbackStep";
+import PickCriteriaStep from "./PickCriteriaStep";
+import ReportCardStep from "./ReportCardStep";
+import GradingResponsesStep from "./GradeResponsesStep";
+
+// Props for the EvalGenWizard modal component
+interface EvalGenWizardProps {
+  opened: boolean; // forwarded to the Modal's `opened` prop
+  onClose: () => void; // forwarded to the Modal; also called after onComplete
+  onComplete: (result: EvalGenReport) => void; // receives the final report
+  responses: LLMResponse[]; // the LLM responses the steps operate over
+}
+
+// Modal wizard that walks through the five EvalGen steps (Welcome to Results).
+const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
+  opened,
+  onClose,
+  onComplete,
+  responses, // The LLM responses to operate over
+}) => {
+  const [active, setActive] = useState(0); // index of the step being shown
+
+  // Criteria the user defines across the stages
+  const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+  // Setter is handed to every step; onNextCallback itself is not read here yet.
+  const [onNextCallback, setOnNextCallback] = useState(() => () => {});
+
+  // Global state
+  const apiKeys = useStore((state) => state.apiKeys);
+
+  const handleNext = useCallback(() => {
+    setActive((current) => Math.min(4, current + 1)); // 4 is the last step
+  }, []);
+
+  const handlePrevious = useCallback(() => {
+    setActive((current) => Math.max(0, current - 1));
+  }, []);
+
+  const handleComplete = () => {
+    // Return final data to the caller
+    onComplete({
+      criteria: criteria,
+      failureCoverage: 0,
+      falseFailureRate: 0,
+      // grades: gradingData,
+      // alignmentScores: {} // TODO: Include actual alignment scores
+    });
+    onClose();
+  };
+
+  const getLikelyPromptTemplateAsContext = (resps: LLMResponse[]) => {
+    // Attempt to infer the prompt template used to generate the responses:
+    const prompts = new Set<string>();
+    for (const resp_obj of resps) {
+      const pt = resp_obj?.metavars?.__pt;
+      if (pt !== undefined) {
+        prompts.add(StringLookup.get(pt) as string);
+      }
+    }
+
+    if (prompts.size === 0) return null;
+
+    // Use the first template collected (a Set iterates in insertion order).
+    return escapeBraces(prompts.values().next().value ?? "");
+  };
+
+  async function genCriteriaFromContext(responses: LLMResponse[]) {
+    // Get the context from the input responses
+    const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
+    if (inputPromptTemplate === null) {
+      console.error("No context found. Cannot proceed.");
+      return;
+    }
+    // Attempt to generate criteria using an LLM
+    return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
+  }
+
+  return (
+    <Modal
+      opened={opened}
+      onClose={onClose}
+      title="EvalGen Wizard"
+      size="90%"
+      padding="md"
+      // keepMounted
+      // closeOnClickOutside={true}
+      style={{ position: "relative", left: "-5%" }}
+      styles={{
+        inner: {
+          padding: "5%", // This creates space around the modal (10% total)
+        },
+        content: {
+          height: "100%", // Fill the available space
+          maxHeight: "90vh", // Limit to 90% of viewport height
+          display: "flex",
+          flexDirection: "column",
+        },
+        body: {
+          flex: 1, // This makes the body expand to fill available space
+          overflow: "auto", // Add scrolling if content is too tall
+        },
+      }}
+    >
+      {active === 0 && <WelcomeStep setOnNextCallback={setOnNextCallback} />}
+
+      {active === 1 && (
+        <FeedbackStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          responses={responses}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 2 && (
+        <PickCriteriaStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          criteria={criteria}
+          setCriteria={setCriteria}
+          genCriteriaFromContext={() => genCriteriaFromContext(responses ?? [])}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 3 && (
+        <GradingResponsesStep
+          onNext={handleNext}
+          onPrevious={handlePrevious}
+          responses={responses}
+          criteria={criteria}
+          setCriteria={setCriteria}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {active === 4 && (
+        <ReportCardStep
+          onPrevious={handlePrevious}
+          onComplete={handleComplete}
+          criteria={criteria}
+          setOnNextCallback={setOnNextCallback}
+        />
+      )}
+
+      {/* Sticky footer - button and steppers */}
+      <div
+        style={{
+          position: "fixed",
+          bottom: 106,
+          padding: "10px",
+          width: "95%",
+        }}
+      >
+        <Flex justify="space-between">
+          <Button variant="default" onClick={handlePrevious}>
+            &lt; Back
+          </Button>
+          <Button onClick={handleNext}>Next &gt;</Button>
+        </Flex>
+      </div>
+      <div
+        style={{
+          position: "fixed",
+          bottom: 0,
+          background: "white",
+          padding: "10px",
+          borderTop: "1px solid #ddd",
+          width: "95%",
+        }}
+      >
+        <Stepper active={active} mb="xl">
+          <Stepper.Step label="Welcome" description="Get started">
+            {/* Step content is rendered above, in the modal body */}
+          </Stepper.Step>
+          <Stepper.Step label="Feedback" description="Rate some responses">
+            {/* Step content is rendered above, in the modal body */}
+          </Stepper.Step>
+          <Stepper.Step label="Criteria" description="Define eval criteria">
+            {/* Step content is rendered above, in the modal body */}
+          </Stepper.Step>
+          <Stepper.Step
+            label="Grading and Generation"
+            description="Grade by criteria, while we generate implementations"
+          >
+            {/* Step content is rendered above, in the modal body */}
+          </Stepper.Step>
+          <Stepper.Step label="Results" description="View alignment">
+            {/* Step content is rendered above, in the modal body */}
+          </Stepper.Step>
+        </Stepper>
+      </div>
+    </Modal>
+  );
+};
+
+export default EvalGenWizard;
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
new file mode 100644
index 000000000..af6598390
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -0,0 +1,34 @@
+import React, { useState } from "react";
+import { LLMResponse } from "../backend/typing";
+import { Button, Group, Stack, Text, Title } from "@mantine/core";
+
+interface FeedbackStepProps {
+  onNext: () => void; // called by FeedbackStep's handleSubmit
+  onPrevious: () => void; // destructured by FeedbackStep but not called yet
+  responses: LLMResponse[]; // declared but not read by FeedbackStep yet
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>; // not read yet
+}
+
+// Wizard step for rating sample responses; currently a placeholder (see TODO).
+const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
+  // State for thumbs up/down feedback and written comments (not used yet)
+  const [feedback, setFeedback] = useState([]);
+
+  const handleSubmit = () => { // not attached to any control in the JSX yet
+    // setFeedbackData(feedback);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Provide Feedback on Some Model Outputs</Title>
+      {/* TODO: Implement thumbs up/down feedback UI with written comments */}
+      <Text>
+        TODO: Display LLM responses with thumbs up/down controls and comment
+        field
+      </Text>
+    </Stack>
+  );
+};
+
+export default FeedbackStep;
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
new file mode 100644
index 000000000..b93c8f19c
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -0,0 +1,47 @@
+import React, { useState } from "react";
+import { EvalCriteria } from "../backend/evalgen/typing";
+import { LLMResponse } from "../backend/typing";
+import { Button, Group, Stack, Text, Title } from "@mantine/core";
+import GradingView from "./GradingView";
+
+interface GradingResponsesStepProps {
+  onNext: () => void; // called by the step's handleSubmit
+  onPrevious: () => void; // click handler of the step's "Back" button
+  responses: LLMResponse[]; // declared but not read by the step yet
+  criteria: EvalCriteria[]; // declared but not read by the step yet
+  setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>; // not read yet
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>; // not read yet
+}
+
+// Wizard step for grading responses per criterion; the grading UI is still a TODO.
+const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
+  onNext,
+  onPrevious,
+}) => {
+  // State for per-criteria grades (not read or updated yet)
+  const [grades, setGrades] = useState({});
+
+  // TODO: Set up grading UI for each criteria
+
+  const handleSubmit = () => {
+    // setGradingData(grades);
+    onNext();
+  };
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Grade LLM Responses By Criteria</Title>
+      <Text>Please evaluate each response according to your criteria:</Text>
+      {/* <GradingView /> */}
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={handleSubmit}>I&apos;m tired, process results</Button>
+      </Group>
+    </Stack>
+  );
+};
+
+export default GradingResponsesStep;
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
new file mode 100644
index 000000000..35d266b5f
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -0,0 +1,219 @@
+import React, { ReactNode, useMemo } from "react";
+import { LLMResponse } from "../backend/typing";
+import { cleanMetavarsFilterFunc, transformDict } from "../backend/utils";
+import { Box, Button, Center, Flex, Stack, Text, Tooltip } from "@mantine/core";
+import {
+  IconChevronLeft,
+  IconChevronRight,
+  IconSparkles,
+} from "@tabler/icons-react";
+
+// Heading line for the grading view: extra-large text at font weight 500.
+// Whatever is passed as children becomes the heading content.
+const HeaderText = ({ children }: { children: ReactNode }) => (
+  <Text size="xl" fw={500} pl="sm" mb="lg">
+    {children}
+  </Text>
+);
+
+export interface GradingViewProps {
+  shownResponse: LLMResponse | undefined; // source of the text, prompt and vars shown
+  shownResponseIdx: number; // zero-based; displayed as index + 1
+  responseCount: number; // total shown in the header ("#i of N")
+  numGPT4Calls: number; // displayed in the call tally
+  numGPT35Calls: number; // displayed in the call tally
+  logs: { date: Date; message: string }[]; // rendered in order in the activity panel
+  gotoPrevResponse: () => void; // left chevron click handler
+  gotoNextResponse: () => void; // right chevron click handler
+  estimateGPTCalls: () => string; // its return value labels the right chevron's tooltip
+  gotoNextScreen: (screenName: string) => void; // called with "report" by the done button
+}
+
+// Shows one response with its vars and prompt, plus an LLM activity log panel.
+const GradingView: React.FC<GradingViewProps> = ({
+  shownResponse,
+  shownResponseIdx,
+  responseCount,
+  numGPT4Calls,
+  numGPT35Calls,
+  logs,
+  gotoPrevResponse,
+  gotoNextResponse,
+  estimateGPTCalls,
+  gotoNextScreen,
+}) => {
+  // Calculate inner values only when shownResponse changes
+  // NOTE(review): toString() assumes the response item stringifies to display text
+  const responseText = useMemo(
+    () =>
+      shownResponse && shownResponse.responses?.length > 0
+        ? shownResponse.responses[0].toString()
+        : "",
+    [shownResponse],
+  );
+
+  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
+  const varsDivs = useMemo(() => {
+    const combined_vars_metavars = shownResponse
+      ? {
+          ...shownResponse.vars,
+          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
+        }
+      : {};
+
+    return Object.entries(combined_vars_metavars).map(([varname, val]) => (
+      <div key={varname} className="grade-resp-var-container">
+        <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
+        <span className="response-var-value linebreaks">{val}</span>
+      </div>
+    ));
+  }, [shownResponse]);
+
+  return (
+    <Stack justify="space-between" mih={500}>
+      <Box>
+        {/* Top header */}
+        <Flex justify="center">
+          <HeaderText>
+            {/* What do you think of this response? */}
+            What do you think of response #{shownResponseIdx + 1} of{" "}
+            {responseCount}?
+          </HeaderText>
+        </Flex>
+        {/* Middle response box with chevron buttons < and > for going back and forward a response */}
+        <Flex justify="center" align="center" mb="sm">
+          {/* Go back to previous response */}
+          <Button variant="white" color="dark" onClick={gotoPrevResponse}>
+            <IconChevronLeft />
+          </Button>
+
+          {/* The response one is currently grading */}
+          <div
+            className="response-box"
+            style={{
+              backgroundColor: "#eee",
+              width: "80%",
+              maxHeight: "340px",
+              overflowY: "scroll",
+              borderColor: "black",
+              borderStyle: "solid",
+            }}
+          >
+            <div className="response-item-llm-name-wrapper">
+              <div
+                className="small-response"
+                style={{ fontSize: "11pt", padding: "12pt" }}
+              >
+                {responseText}
+              </div>
+            </div>
+          </div>
+
+          {/* Go forward to the next response */}
+          <Tooltip label={estimateGPTCalls()} withArrow>
+            <Button variant="white" color="dark" onClick={gotoNextResponse}>
+              <IconChevronRight />
+            </Button>
+          </Tooltip>
+        </Flex>
+        {/* Views for the vars (inputs) that generated this response, and the concrete prompt */}
+        <Flex justify="center" mb="xl" gap="lg">
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "31%",
+              borderRadius: "12px",
+              borderWidth: "1px",
+              borderStyle: "solid",
+            }}
+          >
+            Vars
+            <hr />
+            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
+              {varsDivs}
+            </div>
+          </div>
+          <div
+            style={{
+              backgroundColor: "#fff",
+              padding: "12px",
+              width: "41%",
+              borderRadius: "2px",
+            }}
+          >
+            Prompt
+            <hr />
+            <div
+              className="monofont linebreaks"
+              style={{
+                maxHeight: "160px",
+                overflowY: "scroll",
+                fontSize: "10pt",
+                lineHeight: "1.2",
+              }}
+            >
+              {prompt}
+            </div>
+          </div>
+        </Flex>
+        <Flex direction="column">
+          <Flex justify="space-between" align="center">
+            <Text size="lg" weight={500} mb="sm">
+              LLM Activity
+            </Text>
+            {/* GPT Call Tally */}
+            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
+              GPT-3.5-Turbo-16k calls.
+            </Text>
+          </Flex>
+          <div
+            style={{
+              backgroundColor: "#f0f0f0",
+              color: "#333",
+              fontFamily: "monospace",
+              padding: "12px",
+              width: "calc(100% - 30px)",
+              height: "200px",
+              overflowY: "auto",
+              borderRadius: "8px",
+              border: "1px solid #ddd",
+              marginRight: "20px", // Space on the right
+            }}
+            ref={(el) => { // scroll the log to its bottom when the element is attached
+              if (el) {
+                el.scrollTop = el.scrollHeight;
+              }
+            }}
+          >
+            {logs.map((log, index) => (
+              <div key={index}>
+                <span style={{ color: "#4A90E2" }}>
+                  {log.date.toLocaleString()} -{" "}
+                </span>
+                <span>{log.message}</span>
+              </div>
+            ))}
+          </div>
+        </Flex>
+      </Box>
+      <div>
+        <Center>
+          <Button
+            leftIcon={<IconSparkles size={14} />}
+            variant="gradient"
+            gradient={{ from: "blue", to: "green", deg: 45 }}
+            onClick={() => {
+              gotoNextScreen("report");
+            }}
+          >
+            I&apos;m done. Access EvalGen Report!
+          </Button>
+        </Center>
+      </div>
+    </Stack>
+  );
+};
+
+export default GradingView;
diff --git a/chainforge/react-server/src/EvalGen2Modal.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
similarity index 57%
rename from chainforge/react-server/src/EvalGen2Modal.tsx
rename to chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index cf6e2f625..0b1867d42 100644
--- a/chainforge/react-server/src/EvalGen2Modal.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -1,36 +1,29 @@
 import React, { useMemo, useState } from "react";
+import { EvalCriteria, EvalFunctionReport } from "../backend/evalgen/typing";
 import {
-  Modal,
+  Accordion,
   Button,
-  Group,
-  Stepper,
-  Title,
-  Text,
   Card,
-  Stack,
-  Anchor,
-  List,
-  Flex,
-  TextInput,
-  ScrollArea,
-  SimpleGrid,
-  Tooltip,
-  Skeleton,
+  Checkbox,
   Code,
   Divider,
-  Checkbox,
-  Textarea,
+  Flex,
+  Group,
   Popover,
   RingProgress,
+  ScrollArea,
+  SimpleGrid,
+  Skeleton,
+  Stack,
   Switch,
-  Accordion,
+  Text,
+  Textarea,
+  TextInput,
+  Title,
+  Tooltip,
   useMantineTheme,
 } from "@mantine/core";
-import {
-  EvalCriteria,
-  EvalFunctionReport,
-  EvalGenReport,
-} from "./backend/evalgen/typing";
+import { useDisclosure } from "@mantine/hooks";
 import {
   IconCode,
   IconRepeat,
@@ -38,172 +31,21 @@ import {
   IconSparkles,
   IconTrash,
 } from "@tabler/icons-react";
-import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
-import useStore from "./store";
+import useStore from "../store";
+import { accuracyToColor, cmatrixTextAnnotations } from "../backend/utils";
+import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
 import { v4 as uuid } from "uuid";
 import Plot from "react-plotly.js";
-import { useDisclosure } from "@mantine/hooks";
-import { accuracyToColor, cmatrixTextAnnotations } from "./backend/utils";
-import { LLMResponse } from "./backend/typing";
-import { escapeBraces } from "./backend/template";
-import { StringLookup } from "./backend/cache";
-
-/*
-    PROPS FOR STEPPER SCREEN COMPONENTS
- */
-interface WelcomeStepProps {
-  onNext: () => void;
-}
-
-interface FeedbackStepProps {
-  onNext: () => void;
-  onPrevious: () => void;
-  // setFeedbackData: (feedback: FeedbackItem[]) => void;
-}
 
-interface CriteriaStepProps {
+interface PickCriteriaStepProps {
   onNext: () => void;
   onPrevious: () => void;
   criteria: EvalCriteria[];
   setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
   genCriteriaFromContext: () => Promise<EvalCriteria[] | undefined>;
-  // feedbackData: FeedbackItem[];
-  // setCriteriaData: (criteria: EvalCriteria[]) => void;
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
 }
 
-interface GradingStepProps {
-  onNext: () => void;
-  onPrevious: () => void;
-  // criteriaData: EvalCriteria[];
-  // setGradingData: (grades: GradeData) => void;
-}
-
-interface ResultsStepProps {
-  onPrevious: () => void;
-  onComplete: () => void;
-  // criteriaData: Criterion[];
-  // gradingData: GradeData;
-}
-
-// Main wizard component props
-interface EvalGenWizardProps {
-  opened: boolean;
-  onClose: () => void;
-  onComplete: (result: EvalGenReport) => void;
-  responses: LLMResponse[] | undefined;
-}
-
-/*
-    STEPPER SCREEN COMPONENTS
- */
-const WelcomeStep: React.FC<WelcomeStepProps> = ({ onNext }) => (
-  <Stack spacing="md" m="lg" p="lg" mb={120}>
-    <Title order={2}>Welcome to the EvalGen Wizard</Title>
-    <Text>
-      This wizard will guide you through creating automated evaluators for LLM
-      responses that are aligned with your preferences. You`&apos;ll look at
-      data, define what you care about, apply those criteria to grade data, and
-      refine your criteria as you see more outputs. EvalGen then generates
-      automated evaluators that implement each criteria, chooses implementations
-      most aligned with your grades, and reports how aligned they are.
-    </Text>
-    <Text>
-      EvalGen is backed up by our{" "}
-      <Anchor
-        href="https://dl.acm.org/doi/abs/10.1145/3654777.3676450"
-        target="_blank"
-      >
-        empirical research at UIST 2024
-      </Anchor>
-      , and is inspired by similar inductive processes in grounded theory and
-      heuristic evaluation. Currently, Evalgen:
-    </Text>
-    <List>
-      <List.Item>
-        Only generates <b>assertions (pass/fail tests)</b>. Numeric and
-        categorical evaluators are not included.
-      </List.Item>
-      <List.Item>
-        Asks for grades on a <b>per-criteria</b> basis on the main grading
-        screen. This is the chief difference from our paper.
-      </List.Item>
-      <List.Item>
-        Requires access to the GenAI features of ChainForge. Set up the Provider
-        you wish to use for this in your Global Settings view. The Provider must
-        be powerful enough to generate code. (By default, it is OpenAI.)
-      </List.Item>
-      <List.Item>
-        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM
-        responses).
-      </List.Item>
-      <List.Item>
-        EvalGen will send off many requests during usage. 🔔{" "}
-        <b>By using Evalgen, you take full responsibility for credit usage.</b>
-      </List.Item>
-    </List>
-    <Text>Currently, EvalGen does NOT:</Text>
-    <List>
-      <List.Item>
-        Work on imported spreadsheets of data (although if you are interested in
-        this, raise a Pull Request).
-      </List.Item>
-      <List.Item>
-        Generate code that uses third-party libraries. For safety, LLM-generated
-        Python code is run sandboxed in the browser with pyodide. (If your eval
-        criteria implementation must use a third-party library, we suggest you
-        use ChainForge’s genAI features on the specific eval node, outside this
-        wizard.)
-      </List.Item>
-    </List>
-    <Text>We have captured the following about your context:</Text>
-    <ul>
-      <li>…</li>
-      <li>[x] Use this info when helping me think of evaluation criteria</li>
-    </ul>
-    <Text>
-      After EvalGen finishes, the chosen evaluators appear in the MultiEval
-      node. You can export evaluator details by right-clicking the node and
-      selecting Copy Eval Specs.
-    </Text>
-    <Text>
-      EvalGen is in Beta. To improve it, provide feedback on our Github Issues
-      or Discussion pages, or raise a Pull Request with the changes.
-    </Text>
-    <Button onClick={onNext} fullWidth mt="xl">
-      Get Started
-    </Button>
-  </Stack>
-);
-
-const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
-  // State for thumbs up/down feedback and written comments
-  const [feedback, setFeedback] = useState([]);
-
-  const handleSubmit = () => {
-    // setFeedbackData(feedback);
-    onNext();
-  };
-
-  return (
-    <Stack spacing="lg">
-      <Title order={3}>Provide Feedback on Some Model Outputs</Title>
-
-      {/* TODO: Implement thumbs up/down feedback UI with written comments */}
-      <Text>
-        TODO: Display LLM responses with thumbs up/down controls and comment
-        field
-      </Text>
-
-      <Group position="apart" mt="xl">
-        <Button variant="default" onClick={onPrevious}>
-          Back
-        </Button>
-        <Button onClick={handleSubmit}>Continue</Button>
-      </Group>
-    </Stack>
-  );
-};
-
 interface CriteriaCardProps {
   title: string;
   description: string;
@@ -527,10 +369,10 @@ const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
   );
 };
 
-const CriteriaStep: React.FC<CriteriaStepProps> = ({
+const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
   onNext,
   onPrevious,
-  criteria, 
+  criteria,
   setCriteria,
   genCriteriaFromContext,
 }) => {
@@ -619,8 +461,8 @@ Your response should contain a short title for the criteria ("shortname"), a des
         </Text>
 
         <Text size="sm" pl="sm" mb="lg" style={{ fontStyle: "italic" }}>
-          Note: Due to rate limits and/or cost, think carefully before selecting more than 5
-          criteria to be evaluated by LLMs.
+          Note: Due to rate limits and/or cost, think carefully before selecting
+          more than 5 criteria to be evaluated by LLMs.
         </Text>
 
         <Flex align="center" gap="lg">
@@ -720,240 +562,4 @@ Your response should contain a short title for the criteria ("shortname"), a des
   );
 };
 
-const GradingStep: React.FC<GradingStepProps> = ({ onNext, onPrevious }) => {
-  // State for per-criteria grades
-  const [grades, setGrades] = useState({});
-
-  // TODO: Set up grading UI for each criteria
-
-  const handleSubmit = () => {
-    // setGradingData(grades);
-    onNext();
-  };
-
-  return (
-    <Stack spacing="lg">
-      <Title order={3}>Grade LLM Responses By Criteria</Title>
-      <Text>Please evaluate each response according to your criteria:</Text>
-
-      {/* TODO: Implement grading UI per criteria */}
-      <Text>TODO: Display grading interface for each criteria</Text>
-
-      <Group position="apart" mt="xl">
-        <Button variant="default" onClick={onPrevious}>
-          Back
-        </Button>
-        <Button onClick={handleSubmit}>I&apos;m tired, process results</Button>
-      </Group>
-    </Stack>
-  );
-};
-
-const ResultsStep: React.FC<ResultsStepProps> = ({
-  onPrevious,
-  onComplete,
-}) => {
-  // TODO: Calculate alignment scores based on criteria and grading data
-  const alignmentScores = {};
-
-  return (
-    <Stack spacing="lg">
-      <Title order={3}>Evaluation Results</Title>
-      <Text>
-        Here&apos;s how well each evaluation criteria aligns with your grades:
-      </Text>
-
-      {/* TODO: Display alignment scores */}
-      <Text>TODO: Show alignment scores for each criteria</Text>
-
-      <Group position="apart" mt="xl">
-        <Button variant="default" onClick={onPrevious}>
-          Back
-        </Button>
-        <Button onClick={onComplete} color="green">
-          Done
-        </Button>
-      </Group>
-    </Stack>
-  );
-};
-
-const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
-  opened,
-  onClose,
-  onComplete,
-  responses,
-}) => {
-  const [active, setActive] = useState(0);
-
-  // Criteria across the steps
-  const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
-
-  // Global state
-  const apiKeys = useStore((state) => state.apiKeys);
-
-  const handleNext = () => {
-    setActive((current) => current + 1);
-  };
-
-  const handlePrevious = () => {
-    setActive((current) => current - 1);
-  };
-
-  const handleComplete = () => {
-    // Return final data to the caller
-    onComplete({
-      criteria: criteria,
-      failureCoverage: 0,
-      falseFailureRate: 0,
-      // grades: gradingData,
-      // alignmentScores: {} // TODO: Include actual alignment scores
-    });
-    onClose();
-  };
-
-  const getLikelyPromptTemplateAsContext = (resps: LLMResponse[]) => {
-    // Attempt to infer the prompt template used to generate the responses:
-    const prompts = new Set<string>();
-    for (const resp_obj of resps) {
-      const pt = resp_obj?.metavars?.__pt;
-      if (pt !== undefined) {
-        prompts.add(StringLookup.get(pt) as string);
-      }
-    }
-
-    if (prompts.size === 0) return null;
-
-    // Pick a prompt template at random to serve as context....
-    return escapeBraces(prompts.values().next().value ?? "");
-  };
-
-  async function genCriteriaFromContext(responses: LLMResponse[]) {
-    // Get the context from the input responses
-    const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
-
-    if (inputPromptTemplate === null) {
-      console.error("No context found. Cannot proceed.");
-      return;
-    }
-
-    // Attempt to generate criteria using an LLM
-    return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
-  }
-
-  return (
-    <Modal
-      opened={opened}
-      onClose={onClose}
-      title="EvalGen Wizard"
-      size="90%"
-      padding="md"
-      // keepMounted
-      // closeOnClickOutside={true}
-      style={{ position: "relative", left: "-5%" }}
-      styles={{
-        inner: {
-          padding: "5%", // This creates space around the modal (10% total)
-        },
-        content: {
-          height: "100%", // Fill the available space
-          maxHeight: "90vh", // Limit to 90% of viewport height
-          display: "flex",
-          flexDirection: "column",
-        },
-        body: {
-          flex: 1, // This makes the body expand to fill available space
-          overflow: "auto", // Add scrolling if content is too tall
-        },
-      }}
-    >
-      {active === 0 && <WelcomeStep onNext={handleNext} />}
-
-      {active === 1 && (
-        <FeedbackStep
-          onNext={handleNext}
-          onPrevious={handlePrevious}
-          // setFeedbackData={setFeedbackData}
-        />
-      )}
-
-      {active === 2 && (
-        <CriteriaStep
-          onNext={handleNext}
-          onPrevious={handlePrevious}
-          criteria={criteria}
-          setCriteria={setCriteria}
-          genCriteriaFromContext={() => genCriteriaFromContext(responses ?? [])}
-          // feedbackData={feedbackData}
-          // setCriteriaData={setCriteriaData}
-        />
-      )}
-
-      {active === 3 && (
-        <GradingStep
-          onNext={handleNext}
-          onPrevious={handlePrevious}
-          // criteriaData={criteriaData}
-          // setGradingData={setGradingData}
-        />
-      )}
-
-      {active === 4 && (
-        <ResultsStep
-          onPrevious={handlePrevious}
-          onComplete={handleComplete}
-          // criteriaData={criteriaData}
-          // gradingData={gradingData}
-        />
-      )}
-
-      {/* Sticky footer - button and steppers */}
-      <div
-        style={{
-          position: "fixed",
-          bottom: 106,
-          padding: "10px",
-          width: "95%",
-        }}
-      >
-        <Flex justify="space-between">
-          <Button variant="default">&lt; Back</Button>
-          <Button>Next &gt;</Button>
-        </Flex>
-      </div>
-      <div
-        style={{
-          position: "fixed",
-          bottom: 0,
-          background: "white",
-          padding: "10px",
-          borderTop: "1px solid #ddd",
-          width: "95%",
-        }}
-      >
-        <Stepper active={active} mb="xl">
-          <Stepper.Step label="Welcome" description="Get started">
-            {/* Step content is rendered below */}
-          </Stepper.Step>
-          <Stepper.Step label="Feedback" description="Rate some responses">
-            {/* Step content is rendered below */}
-          </Stepper.Step>
-          <Stepper.Step label="Criteria" description="Define eval criteria">
-            {/* Step content is rendered below */}
-          </Stepper.Step>
-          <Stepper.Step
-            label="Grading and Generation"
-            description="Grade by criteria, while we generate implementations"
-          >
-            {/* Step content is rendered below */}
-          </Stepper.Step>
-          <Stepper.Step label="Results" description="View alignment">
-            {/* Step content is rendered below */}
-          </Stepper.Step>
-        </Stepper>
-      </div>
-    </Modal>
-  );
-};
-
-export default EvalGenWizard;
+export default PickCriteriaStep;
diff --git a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
new file mode 100644
index 000000000..43dbf9355
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
@@ -0,0 +1,41 @@
+import React from "react";
+import { Button, Group, Stack, Text, Title } from "@mantine/core";
+import { EvalCriteria } from "../backend/evalgen/typing";
+
+interface ReportCardStepProps {
+  onPrevious: () => void;
+  onComplete: () => void;
+  criteria: EvalCriteria[];
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+}
+
+const ReportCardStep: React.FC<ReportCardStepProps> = ({
+  onPrevious,
+  onComplete,
+}) => {
+  // TODO: Calculate alignment scores based on criteria and grading data
+  const alignmentScores = {};
+
+  return (
+    <Stack spacing="lg">
+      <Title order={3}>Evaluation Results</Title>
+      <Text>
+        Here&apos;s how well each evaluation criteria aligns with your grades:
+      </Text>
+
+      {/* TODO: Display alignment scores */}
+      <Text>TODO: Show alignment scores for each criteria</Text>
+
+      <Group position="apart" mt="xl">
+        <Button variant="default" onClick={onPrevious}>
+          Back
+        </Button>
+        <Button onClick={onComplete} color="green">
+          Done
+        </Button>
+      </Group>
+    </Stack>
+  );
+};
+
+export default ReportCardStep;
diff --git a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
new file mode 100644
index 000000000..9ea7ca9b2
--- /dev/null
+++ b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
@@ -0,0 +1,84 @@
+import React from "react";
+import { Anchor, Button, List, Stack, Text, Title } from "@mantine/core";
+
+interface WelcomeStepProps {
+  setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+}
+
+const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
+  <Stack spacing="md" m="lg" p="lg" mb={120}>
+    <Title order={2}>Welcome to the EvalGen Wizard</Title>
+    <Text>
+      This wizard will guide you through creating automated evaluators for LLM
+      responses that are aligned with your preferences. You&apos;ll look at
+      data, define what you care about, apply those criteria to grade data, and
+      refine your criteria as you see more outputs. EvalGen then generates
+      automated evaluators that implement each criteria, chooses implementations
+      most aligned with your grades, and reports how aligned they are.
+    </Text>
+    <Text>
+      EvalGen is backed up by our{" "}
+      <Anchor
+        href="https://dl.acm.org/doi/abs/10.1145/3654777.3676450"
+        target="_blank"
+      >
+        empirical research at UIST 2024
+      </Anchor>
+      , and is inspired by similar inductive processes in grounded theory and
+      heuristic evaluation. Currently, Evalgen:
+    </Text>
+    <List>
+      <List.Item>
+        Only generates <b>assertions (pass/fail tests)</b>. Numeric and
+        categorical evaluators are not included.
+      </List.Item>
+      <List.Item>
+        Asks for grades on a <b>per-criteria</b> basis on the main grading
+        screen. This is the chief difference from our paper.
+      </List.Item>
+      <List.Item>
+        Requires access to the GenAI features of ChainForge. Set up the Provider
+        you wish to use for this in your Global Settings view. The Provider must
+        be powerful enough to generate code. (By default, it is OpenAI.)
+      </List.Item>
+      <List.Item>
+        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM
+        responses).
+      </List.Item>
+      <List.Item>
+        EvalGen will send off many requests during usage. 🔔{" "}
+        <b>By using Evalgen, you take full responsibility for credit usage.</b>
+      </List.Item>
+    </List>
+    <Text>Currently, EvalGen does NOT:</Text>
+    <List>
+      <List.Item>
+        Work on imported spreadsheets of data (although if you are interested in
+        this, raise a Pull Request).
+      </List.Item>
+      <List.Item>
+        Generate code that uses third-party libraries. For safety, LLM-generated
+        Python code is run sandboxed in the browser with pyodide. (If your eval
+        criteria implementation must use a third-party library, we suggest you
+        use ChainForge’s genAI features on the specific eval node, outside this
+        wizard.)
+      </List.Item>
+    </List>
+    <Text>We have captured the following about your context:</Text>
+    <ul>
+      <li>…</li>
+      <li>[x] Use this info when helping me think of evaluation criteria</li>
+    </ul>
+    <Text>
+      After EvalGen finishes, the chosen evaluators appear in the MultiEval
+      node. You can export evaluator details by right-clicking the node and
+      selecting Copy Eval Specs.
+    </Text>
+    <Text>
+      EvalGen is in Beta. To improve it, provide feedback on our Github Issues
+      or Discussion pages, or raise a Pull Request with the changes.
+    </Text>
+  </Stack>
+);
+
+export default WelcomeStep;
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 7f778ba4e..9520e7525 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -60,8 +60,7 @@ import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
 import { EvalGenReport } from "./backend/evalgen/typing";
-import EvalGenModal, { EvalGenModalRef } from "./EvalGenModal";
-import EvalGenWizard from "./EvalGen2Modal";
+import EvalGenWizard from "./EvalGen/EvalGenWizard";
 
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
@@ -635,6 +634,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         opened={evalGenOpened}
         onClose={() => setEvalGenOpened(false)}
         onComplete={handleEvalGenComplete}
+        responses={[]}
       />
       {/* <EvalGenModal ref={evalGenModalRef} /> */}
 
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 9586c1ebe..e3e585ee4 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -11,7 +11,11 @@ import {
 } from "./typing";
 import { Dict, LLMResponse } from "../typing";
 import { executejs, executepy, simpleQueryLLM } from "../backend";
-import { getVarsAndMetavars, llmResponseDataToString, retryAsyncFunc } from "../utils";
+import {
+  getVarsAndMetavars,
+  llmResponseDataToString,
+  retryAsyncFunc,
+} from "../utils";
 import { v4 as uuid } from "uuid";
 import { OpenAIStreamer } from "./oai_utils";
 import {

From 411de58b272c7ac7f0e86c6e1e41bc69a374c63f Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 16 Mar 2025 21:04:43 -0400
Subject: [PATCH 17/35] wip expand EvalGen into its own folder

---
 chainforge/react-server/.eslintrc.js          |   1 +
 .../src/EvalGen/EvalGenWizard.tsx             |  11 +-
 .../react-server/src/EvalGen/FeedbackStep.tsx |  52 +-
 .../src/EvalGen/GradeResponsesStep.tsx        | 516 +++++++++++++++++-
 .../react-server/src/EvalGen/GradingView.tsx  |  84 +--
 .../react-server/src/EvalGen/WelcomeStep.tsx  |   4 +-
 chainforge/react-server/src/EvalGenModal.tsx  |  17 +-
 chainforge/react-server/src/MultiEvalNode.tsx |   6 +-
 8 files changed, 576 insertions(+), 115 deletions(-)

diff --git a/chainforge/react-server/.eslintrc.js b/chainforge/react-server/.eslintrc.js
index 648e0a4d5..cf13223b8 100644
--- a/chainforge/react-server/.eslintrc.js
+++ b/chainforge/react-server/.eslintrc.js
@@ -17,6 +17,7 @@ module.exports = {
     camelcase: ["off"],
     "react/prop-types": ["off"],
     "@typescript-eslint/no-explicit-any": ["off"],
+    "@typescript-eslint/no-empty-function": ["off"],
   },
 
   settings: {
diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index a51dd20d1..4b2852ea7 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -162,10 +162,17 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         }}
       >
         <Flex justify="space-between">
-          <Button variant="default" onClick={handlePrevious}>
+          <Button
+            variant="default"
+            onClick={handlePrevious}
+            disabled={active === 0}
+          >
             &lt; Back
           </Button>
-          <Button onClick={handleNext}>Next &gt;</Button>
+
+          <Button onClick={handleNext} disabled={active === 4}>
+            Next &gt;
+          </Button>
         </Flex>
       </div>
       <div
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
index af6598390..d6610cf12 100644
--- a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -1,6 +1,7 @@
-import React, { useState } from "react";
+import React, { useCallback, useEffect, useState } from "react";
 import { LLMResponse } from "../backend/typing";
 import { Button, Group, Stack, Text, Title } from "@mantine/core";
+import GradingView from "./GradingView";
 
 interface FeedbackStepProps {
   onNext: () => void;
@@ -9,19 +10,54 @@ interface FeedbackStepProps {
   setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
 }
 
-const FeedbackStep: React.FC<FeedbackStepProps> = ({ onNext, onPrevious }) => {
-  // State for thumbs up/down feedback and written comments
-  const [feedback, setFeedback] = useState([]);
+const FeedbackStep: React.FC<FeedbackStepProps> = ({
+  onNext,
+  onPrevious,
+  responses,
+  setOnNextCallback,
+}) => {
+  const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+    undefined,
+  );
+  const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+    [],
+  );
+  const [shownResponseIdx, setShownResponseIdx] = useState(0);
+
+  useEffect(() => {
+    if (!responses || responses.length === 0) return;
+    setShownResponse(responses[0]);
+    setShownResponseIdx(0);
+  }, [responses]);
 
-  const handleSubmit = () => {
-    // setFeedbackData(feedback);
-    onNext();
-  };
+  const nextResponse = useCallback(() => {
+    if (responses.length === 0) return;
+    if (shownResponseIdx < responses.length - 1) {
+      setShownResponseIdx(shownResponseIdx + 1);
+      setShownResponse(responses[shownResponseIdx + 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  const prevResponse = useCallback(() => {
+    if (shownResponseIdx > 0) {
+      setShownResponseIdx(shownResponseIdx - 1);
+      setShownResponse(responses[shownResponseIdx - 1]);
+    }
+  }, [shownResponseIdx, responses]);
 
   return (
     <Stack spacing="lg">
       <Title order={3}>Provide Feedback on Some Model Outputs</Title>
 
+      <GradingView
+        shownResponse={shownResponse}
+        shownResponseIdx={shownResponseIdx}
+        // shownResponseIdx={shownResponseUniqueIdx}
+        responseCount={responses.length}
+        gotoNextResponse={nextResponse}
+        gotoPrevResponse={prevResponse}
+      />
+
       {/* TODO: Implement thumbs up/down feedback UI with written comments */}
       <Text>
         TODO: Display LLM responses with thumbs up/down controls and comment
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
index b93c8f19c..83ba89671 100644
--- a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -1,8 +1,219 @@
-import React, { useState } from "react";
+import React, { useCallback, useEffect, useState } from "react";
 import { EvalCriteria } from "../backend/evalgen/typing";
 import { LLMResponse } from "../backend/typing";
-import { Button, Group, Stack, Text, Title } from "@mantine/core";
+import {
+  ActionIcon,
+  Button,
+  Center,
+  Divider,
+  Flex,
+  Grid,
+  Group,
+  Radio,
+  rem,
+  Skeleton,
+  Stack,
+  Text,
+  Textarea,
+  TextInput,
+  Title,
+  Tooltip,
+} from "@mantine/core";
 import GradingView from "./GradingView";
+import { useDisclosure } from "@mantine/hooks";
+import { v4 as uuid } from "uuid";
+import {
+  IconPencil,
+  IconRobot,
+  IconTerminal2,
+  IconThumbDown,
+  IconThumbUp,
+  IconTrash,
+} from "@tabler/icons-react";
+import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
+import useStore from "../store";
+
+const ThumbUpDownButtons = ({
+  grade,
+  onChangeGrade,
+  getGradeCount,
+}: {
+  grade: boolean | undefined;
+  onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (grade: boolean | undefined) => number;
+}) => {
+  return (
+    <>
+      {/* Thumbs up/down buttons */}
+      <Button
+        color={grade === true ? "green" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => {
+          // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
+          if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
+        }}
+      >
+        <div className="gradeContainer">
+          <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
+          <div className="gradeUpCount">{getGradeCount(true)}</div>
+        </div>
+      </Button>
+      <Button
+        color={grade === false ? "red" : "gray"}
+        m={0}
+        p={0}
+        variant="subtle"
+        onClick={() => {
+          // Toggle grade: if on (false), turn 'off' (undefined, for neutral).
+          if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
+        }}
+      >
+        <div className="gradeContainer">
+          <IconThumbDown
+            size="14pt"
+            fill={grade === false ? "pink" : "white"}
+          />
+          <div className="gradeDownCount">{getGradeCount(false)}</div>
+        </div>
+      </Button>
+    </>
+  );
+};
+
+interface CriteriaCardProps {
+  criterion: EvalCriteria;
+  onChange: (changedCriteria: EvalCriteria) => void;
+  onDelete: () => void;
+  initiallyOpen?: boolean;
+  grade: boolean | undefined;
+  onChangeGrade: (newGrade: boolean | undefined) => void;
+  getGradeCount: (grade: boolean | undefined) => number;
+  getStateValue: (stateId: number) => number;
+}
+
+const CriteriaCard: React.FC<CriteriaCardProps> = ({
+  criterion,
+  onChange,
+  onDelete,
+  initiallyOpen,
+  grade,
+  getGradeCount,
+  onChangeGrade,
+  getStateValue,
+}) => {
+  const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
+  const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
+
+  return (
+    <Stack spacing={0} ml={8}>
+      <Flex align="center">
+        <Group spacing="0px">
+          {/* Thumbs up/down buttons */}
+          <ThumbUpDownButtons
+            grade={grade}
+            onChangeGrade={onChangeGrade}
+            getGradeCount={getGradeCount}
+          />
+
+          {/* Title of the criteria */}
+          <TextInput
+            value={title}
+            onChange={(e) => setTitle(e.target.value)}
+            onBlur={(e) => {
+              criterion.shortname = e.target.value;
+              if (onChange) onChange(criterion);
+            }}
+            placeholder="Criteria name"
+            variant="unstyled"
+            size="md"
+            ml="xs"
+            className="nodrag nowheel"
+            styles={{
+              input: {
+                padding: "0px",
+                height: "14pt",
+                minHeight: "0pt",
+                fontWeight: 500,
+              },
+            }}
+          />
+        </Group>
+
+        <Group spacing="4px" ml="auto">
+          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+          <Tooltip
+            label={
+              criterion.eval_method === "code"
+                ? "Change to an LLM evaluator"
+                : "Change to a code evaluator"
+            }
+            withinPortal
+            withArrow
+          >
+            <Text
+              color="#999"
+              size="sm"
+              mr="6px"
+              onClick={() => {
+                criterion.eval_method =
+                  criterion.eval_method === "code" ? "expert" : "code";
+                if (onChange) onChange(criterion);
+              }}
+            >
+              {criterion.eval_method === "code" ? (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconTerminal2 size="14pt" />
+                  &nbsp;Python
+                </Flex>
+              ) : (
+                <Flex style={{ userSelect: "none" }}>
+                  <IconRobot size="14pt" />
+                  &nbsp;LLM
+                </Flex>
+              )}
+            </Text>
+          </Tooltip>
+
+          {/* <Contributor getStateValue={getStateValue} /> */}
+
+          {/* Delete button (and any other criterion-specific changes in the future) */}
+          <ActionIcon variant="subtle" color="red" onClick={onDelete}>
+            <IconTrash style={{ width: rem(16), height: rem(16) }} />
+          </ActionIcon>
+        </Group>
+      </Flex>
+
+      <Textarea
+        value={criterion.criteria}
+        placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+        ml={38}
+        onChange={(e) => {
+          criterion.criteria = e.target.value;
+          if (onChange) onChange(criterion);
+        }}
+        onClickCapture={(e) => e.stopPropagation()}
+        styles={{
+          input: {
+            border: "none",
+            borderWidth: "0px",
+            margin: "0px",
+            color: "#444",
+            background: "transparent",
+            lineHeight: 1.1,
+          },
+        }}
+        autosize
+        minRows={2}
+        maxRows={5}
+        fz="sm"
+        mb="xs"
+        c="dimmed"
+      />
+    </Stack>
+  );
+};
 
 interface GradingResponsesStepProps {
   onNext: () => void;
@@ -16,31 +227,294 @@ interface GradingResponsesStepProps {
 const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
   onNext,
   onPrevious,
+  responses,
+  criteria,
+  setCriteria,
+  setOnNextCallback,
 }) => {
-  // State for per-criteria grades
-  const [grades, setGrades] = useState({});
+  const apiKeys = useStore((state) => state.apiKeys);
+  const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+    undefined,
+  );
+  const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+    [],
+  );
+  const [shownResponseIdx, setShownResponseIdx] = useState(0);
+
+  useEffect(() => {
+    if (!responses || responses.length === 0) return;
+    setShownResponse(responses[0]);
+    setShownResponseIdx(0);
+  }, [responses]);
+
+  const nextResponse = useCallback(() => {
+    if (responses.length === 0) return;
+    if (shownResponseIdx < responses.length - 1) {
+      setShownResponseIdx(shownResponseIdx + 1);
+      setShownResponse(responses[shownResponseIdx + 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  const prevResponse = useCallback(() => {
+    if (shownResponseIdx > 0) {
+      setShownResponseIdx(shownResponseIdx - 1);
+      setShownResponse(responses[shownResponseIdx - 1]);
+    }
+  }, [shownResponseIdx, responses]);
+
+  // Add a criterion
+  const handleAddCriteria = (newCrit: EvalCriteria) => {
+    setCriteria((cs) => {
+      if (!newCrit.uid) newCrit.uid = uuid();
+      return [...cs, newCrit];
+    });
+  };
+
+  // Modify an existing criterion
+  const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
+    setCriteria((cs) => {
+      const idx = cs.findIndex((c) => c.uid === uid);
+      if (idx === -1) {
+        console.error("Could not find criteria with uid", uid);
+        return cs;
+      }
+      // Return a fresh array; never write into the previous state `cs`.
+      return cs.map((c, i) => (i === idx ? newCrit : c));
+    });
+  };
 
-  // TODO: Set up grading UI for each criteria
+  // Delete a criterion
+  const handleDeleteCriteria = (uid: string) => {
+    setCriteria((cs) => {
+      return cs.filter((c) => c.uid !== uid);
+    });
+  };
+
+  // Synthesize a new criteria according to the feedback given for the shown response
+  const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+  const synthNewCriteriaWithLLM = (
+    response: string,
+    feedback: string,
+    grade: "good" | "bad" | "unknown",
+  ) => {
+    // Add a loading Skeleton
+    setIsLoadingCriteria((num) => num + 1);
+    // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
+    const prettyCriteria = criteria
+      .map((crit) => {
+        return `${crit.shortname}: ${crit.criteria}`;
+      })
+      .join("\n");
+
+    generateLLMEvaluationCriteria(
+      "",
+      apiKeys,
+      `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below. 
+  
+  TEXT OUTPUT: 
+  \`\`\`
+  ${response}
+  \`\`\`
+  
+  EXISTING CRITERIA:
+  \`\`\`
+  ${prettyCriteria}
+  \`\`\`
+  
+  GRADE (whether text was good or bad):
+  \`\`\`
+  ${grade}
+  \`\`\`
+  
+  FEEDBACK: 
+  \`\`\`
+  ${feedback}
+  \`\`\`
+  
+  If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
+      "gpt-4o", // llm
+    )
+      .then((evalCrits) => {
+        // Take only the first if evalCrits has a nonempty list
+        if (evalCrits[0]) {
+          setCriteria((crit) =>
+            crit.concat([
+              {
+                ...evalCrits[0],
+                uid: uuid(),
+              },
+            ]),
+          );
+        }
+        // Remove a loading Skeleton
+        setIsLoadingCriteria((num) => num - 1);
 
-  const handleSubmit = () => {
-    // setGradingData(grades);
-    onNext();
+        setNumGPT4Calls((num) => num + 1);
+      })
+      .catch((err) => {
+        console.error(err);
+        setIsLoadingCriteria((num) => num - 1);
+      });
   };
 
   return (
-    <Stack spacing="lg">
-      <Title order={3}>Grade LLM Responses By Criteria</Title>
-      <Text>Please evaluate each response according to your criteria:</Text>
-
-      {/* <GradingView /> */}
-
-      <Group position="apart" mt="xl">
-        <Button variant="default" onClick={onPrevious}>
-          Back
-        </Button>
-        <Button onClick={handleSubmit}>I&apos;m tired, process results</Button>
-      </Group>
-    </Stack>
+    <Grid h={window?.innerHeight * 0.8}>
+      <Grid.Col span={8}>
+        <Stack justify="space-between">
+          {/* View showing the response the user is currently grading */}
+          <GradingView
+            shownResponse={shownResponse}
+            shownResponseIdx={shownResponseIdx}
+            responseCount={responses.length}
+            gotoNextResponse={nextResponse}
+            gotoPrevResponse={prevResponse}
+          />
+
+          {/* Progress bar */}
+          {/* <Flex justify="left" align="center" gap="md">
+                    <Stack w="100%" spacing={4}>
+                      <Text color="#aaa" size="sm">
+                        {bottomBar.progressLabel}
+                      </Text>
+                      <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
+                    </Stack>
+                  </Flex> */}
+        </Stack>
+      </Grid.Col>
+      <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
+        <Center>
+          <Title order={3} ml={8} mt="sm" mb="md">
+            Rubric
+          </Title>
+        </Center>
+
+        <div
+          style={{
+            display: "flex",
+            flexDirection: "column",
+          }}
+        >
+          <div style={{ flex: 2, overflowY: "auto" }}>
+            {criteriaForDisplay.map((e) => (
+              <CriteriaCard
+                criterion={e}
+                key={e.uid}
+                onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
+                onDelete={() => handleDeleteCriteria(e.uid)}
+                grade={
+                  shownResponse ? grades[shownResponse.uid][e.uid] : undefined
+                }
+                getGradeCount={(grade) => {
+                  return shownResponse
+                    ? getGradeCount(
+                        // shownResponse.uid,
+                        e.uid,
+                        grade,
+                      )
+                    : 0;
+                }}
+                onChangeGrade={(newGrade) => {
+                  if (shownResponse)
+                    setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
+                }}
+                initiallyOpen={true}
+                getStateValue={(stateId) => getStateValue(stateId)}
+              />
+            ))}
+            {isLoadingCriteria > 0 ? (
+              Array.from(
+                { length: isLoadingCriteria },
+                (v: unknown, idx: number) => (
+                  <Skeleton key={idx} h={80} mb={4} />
+                ),
+              )
+            ) : (
+              <></>
+            )}
+
+            <div className="criteriaButtons">
+              <Button
+                leftIcon={<IconPencil size={14} />}
+                variant="subtle"
+                color="gray"
+                // gradient={{ from: "blue", to: "green", deg: 90 }}
+                onClick={() => {
+                  handleAddCriteria({
+                    shortname: "New Criteria",
+                    criteria: "",
+                    eval_method: "code",
+                    priority: 0,
+                    uid: uuid(),
+                  });
+                }}
+              >
+                Add a new criteria
+              </Button>
+              {/* <Button
+                leftIcon={<IconSparkles size={14} />}
+                variant="subtle"
+                color="gray"
+                // gradient={{ from: "blue", to: "green", deg: 90 }}
+                onClick={() => {
+                  generateCriteria(responses);
+                }}
+              >
+                Suggest Criteria
+              </Button> */}
+            </div>
+          </div>
+
+          <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+            <Divider mt="lg" />
+            <Title mb="0px" order={4}>
+              Suggest New Criteria Based on the Feedback
+            </Title>
+            <Textarea
+              value={annotation}
+              onChange={(e) => setAnnotation(e.target.value)}
+              description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
+              mb="sm"
+            />
+            <Radio.Group
+              name="favoriteFramework"
+              label="Rate the response holistically:"
+              value={holisticGrade}
+              onChange={(v) => setHolisticGrade(v as "good" | "bad")}
+              withAsterisk
+              mb="md"
+            >
+              <Group mt="xs">
+                <Radio value="good" label="Good" />
+                <Radio value="bad" label="Bad" />
+                <span>
+                  &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+                </span>
+                <Button
+                  color="green"
+                  variant="filled"
+                  disabled={
+                    !holisticGrade ||
+                    annotation === undefined ||
+                    annotation.length === 0
+                  }
+                  onClick={() => {
+                    synthNewCriteriaWithLLM(
+                      shownResponse?.responses[0].toString() ?? "",
+                      annotation ?? "",
+                      holisticGrade ?? "unknown",
+                    );
+
+                    nextResponse();
+                  }}
+                >
+                  + Submit Feedback
+                </Button>
+              </Group>
+            </Radio.Group>
+          </Stack>
+        </div>
+      </Grid.Col>
+    </Grid>
   );
 };
 
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index 35d266b5f..766352979 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -1,12 +1,17 @@
 import React, { ReactNode, useMemo } from "react";
 import { LLMResponse } from "../backend/typing";
-import { cleanMetavarsFilterFunc, transformDict } from "../backend/utils";
+import {
+  cleanMetavarsFilterFunc,
+  llmResponseDataToString,
+  transformDict,
+} from "../backend/utils";
 import { Box, Button, Center, Flex, Stack, Text, Tooltip } from "@mantine/core";
 import {
   IconChevronLeft,
   IconChevronRight,
   IconSparkles,
 } from "@tabler/icons-react";
+import { StringLookup } from "../backend/cache";
 
 const HeaderText = ({ children }: { children: ReactNode }) => {
   return (
@@ -20,32 +25,22 @@ export interface GradingViewProps {
   shownResponse: LLMResponse | undefined;
   shownResponseIdx: number;
   responseCount: number;
-  numGPT4Calls: number;
-  numGPT35Calls: number;
-  logs: { date: Date; message: string }[];
   gotoPrevResponse: () => void;
   gotoNextResponse: () => void;
-  estimateGPTCalls: () => string;
-  gotoNextScreen: (screenName: string) => void;
 }
 
 const GradingView: React.FC<GradingViewProps> = ({
   shownResponse,
   shownResponseIdx,
   responseCount,
-  numGPT4Calls,
-  numGPT35Calls,
-  logs,
   gotoPrevResponse,
   gotoNextResponse,
-  estimateGPTCalls,
-  gotoNextScreen,
 }) => {
   // Calculate inner values only when shownResponse changes
   const responseText = useMemo(
     () =>
       shownResponse && shownResponse.responses?.length > 0
-        ? shownResponse.responses[0].toString()
+        ? llmResponseDataToString(shownResponse.responses[0])
         : "",
     [shownResponse],
   );
@@ -54,12 +49,14 @@ const GradingView: React.FC<GradingViewProps> = ({
   const varsDivs = useMemo(() => {
     const combined_vars_metavars = shownResponse
       ? {
-          ...shownResponse.vars,
-          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
+          ...StringLookup.concretizeDict(shownResponse.vars),
+          ...transformDict(
+            StringLookup.concretizeDict(shownResponse.metavars),
+            cleanMetavarsFilterFunc,
+          ),
         }
       : {};
 
-    // console.log("**************shownResponse", shownResponse);
     return Object.entries(combined_vars_metavars).map(([varname, val]) => (
       <div key={varname} className="grade-resp-var-container">
         <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
@@ -109,7 +106,7 @@ const GradingView: React.FC<GradingViewProps> = ({
           </div>
 
           {/* Go forward to the next response */}
-          <Tooltip label={estimateGPTCalls()} withArrow>
+          <Tooltip label="To next response" withArrow>
             <Button variant="white" color="dark" onClick={gotoNextResponse}>
               <IconChevronRight />
             </Button>
@@ -156,62 +153,7 @@ const GradingView: React.FC<GradingViewProps> = ({
             </div>
           </div>
         </Flex>
-        <Flex direction="column">
-          <Flex justify="space-between" align="center">
-            <Text size="lg" weight={500} mb="sm">
-              LLM Activity
-            </Text>
-            {/* GPT Call Tally */}
-            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
-              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
-              GPT-3.5-Turbo-16k calls.
-            </Text>
-          </Flex>
-          <div
-            style={{
-              backgroundColor: "#f0f0f0",
-              color: "#333",
-              fontFamily: "monospace",
-              padding: "12px",
-              width: "calc(100% - 30px)",
-              height: "200px",
-              overflowY: "auto",
-              borderRadius: "8px",
-              border: "1px solid #ddd",
-              marginRight: "20px", // Space on the right
-            }}
-            ref={(el) => {
-              if (el) {
-                el.scrollTop = el.scrollHeight;
-              }
-            }}
-          >
-            {logs.map((log, index) => (
-              <div key={index}>
-                <span style={{ color: "#4A90E2" }}>
-                  {log.date.toLocaleString()} -{" "}
-                </span>
-                <span>{log.message}</span>
-              </div>
-            ))}
-          </div>
-        </Flex>
       </Box>
-      <div>
-        <Center>
-          <Button
-            leftIcon={<IconSparkles size={14} />}
-            variant="gradient"
-            gradient={{ from: "blue", to: "green", deg: 45 }}
-            onClick={() => {
-              // console.log("(3) gotoNextScreen", gotoNextScreen);
-              gotoNextScreen("report");
-            }}
-          >
-            I&apos;m done. Access EvalGen Report!
-          </Button>
-        </Center>
-      </div>
     </Stack>
   );
 };
diff --git a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
index 9ea7ca9b2..4902d05ec 100644
--- a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
+++ b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
@@ -64,11 +64,11 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
         wizard.)
       </List.Item>
     </List>
-    <Text>We have captured the following about your context:</Text>
+    {/* <Text>We have captured the following about your context:</Text>
     <ul>
       <li>…</li>
       <li>[x] Use this info when helping me think of evaluation criteria</li>
-    </ul>
+    </ul> */}
     <Text>
       After EvalGen finishes, the chosen evaluators appear in the MultiEval
       node. You can export evaluator details by right-clicking the node and
diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 9c450dcd1..88444038c 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -390,7 +390,6 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
       [],
     );
     const [shownResponseIdx, setShownResponseIdx] = useState(0);
-    // const [shownResponseUniqueIdx, setShownResponseUniqueIdx] = useState(0);
 
     const [annotation, setAnnotation] = useState<string | undefined>(undefined);
     const [holisticGrade, setHolisticGrade] = useState<
@@ -594,14 +593,6 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
       trigger,
     }));
 
-    // Add a criterion
-    const handleAddCriteria = (newCrit: EvalCriteria) => {
-      setCriteria((cs) => {
-        if (!newCrit.uid) newCrit.uid = uuid();
-        return [...cs, newCrit];
-      });
-    };
-
     const getLikelyPromptTemplateAsContext = (resps) => {
       // Attempt to infer the prompt template used to generate the responses:
       const prompts = new Set();
@@ -630,6 +621,14 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
       return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
     }
 
+    // Add a criterion
+    const handleAddCriteria = (newCrit: EvalCriteria) => {
+      setCriteria((cs) => {
+        if (!newCrit.uid) newCrit.uid = uuid();
+        return [...cs, newCrit];
+      });
+    };
+
     // Modify an existing criterion
     const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
       setCriteria((cs) => {
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 9520e7525..8c0fff363 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -252,6 +252,8 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   const [lastRunSuccess, setLastRunSuccess] = useState(true);
   const [showDrawer, setShowDrawer] = useState(false);
 
+  const [pulledInputs, setPulledInputs] = useState<LLMResponse[]>([]);
+
   // Debounce helpers
   const debounceTimeoutRef = useRef(null);
   const debounce = genDebounceFunc(debounceTimeoutRef);
@@ -321,7 +323,6 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   const onFinalReportsReady = (reports: EvalGenReport) => {
     // Placeholder for process the final reports returned from EvalGenModel
     console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final reports", reports);
-    // let kkk = 1;
     for (const crit of reports.criteria) {
       // setTimeout(() => {
       // console.log("crit", crit);
@@ -603,6 +604,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   // EvalGen Wizard
   const [evalGenOpened, setEvalGenOpened] = useState(false);
   const openEvalGen = useCallback(() => {
+    setPulledInputs(handlePullInputs());
     setEvalGenOpened(true);
   }, []);
   const handleEvalGenComplete = (evaluationData: EvalGenReport) => {
@@ -634,7 +636,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
         opened={evalGenOpened}
         onClose={() => setEvalGenOpened(false)}
         onComplete={handleEvalGenComplete}
-        responses={[]}
+        responses={pulledInputs}
       />
       {/* <EvalGenModal ref={evalGenModalRef} /> */}
 

From b6746e794a69bbf975755b09f612bfd319ccc0da Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Mon, 17 Mar 2025 22:10:11 -0400
Subject: [PATCH 18/35] wip

---
 .../src/EvalGen/EvalGenWizard.tsx             |  21 +++-
 .../react-server/src/EvalGen/FeedbackStep.tsx | 118 ++++++++++++++++--
 .../react-server/src/EvalGen/GradingView.tsx  |   7 +-
 chainforge/react-server/src/backend/typing.ts |   2 +-
 4 files changed, 128 insertions(+), 20 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index 4b2852ea7..37b11e48c 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -1,4 +1,4 @@
-import React, { useCallback, useState } from "react";
+import React, { useCallback, useMemo, useState } from "react";
 import { EvalCriteria, EvalGenReport } from "../backend/evalgen/typing";
 import { LLMResponse } from "../backend/typing";
 import useStore from "../store";
@@ -11,6 +11,7 @@ import FeedbackStep from "./FeedbackStep";
 import PickCriteriaStep from "./PickCriteriaStep";
 import ReportCardStep from "./ReportCardStep";
 import GradingResponsesStep from "./GradeResponsesStep";
+import { batchResponsesByUID } from "../backend/utils";
 
 // Main wizard component props
 interface EvalGenWizardProps {
@@ -26,8 +27,15 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   onComplete,
   responses, // The LLM responses to operate over
 }) => {
+  // The active screen (stage) of EvalGen
   const [active, setActive] = useState(0);
 
+  // Regroup input responses by batch UID, whenever jsonResponses changes
+  const batchedResponses = useMemo(
+    () => (responses ? batchResponsesByUID(responses) : []),
+    [responses],
+  );
+
   // Criteria the user defines across the stages
   const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
   const [onNextCallback, setOnNextCallback] = useState(() => () => {});
@@ -73,7 +81,8 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
 
   async function genCriteriaFromContext(responses: LLMResponse[]) {
     // Get the context from the input responses
-    const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
+    const inputPromptTemplate =
+      getLikelyPromptTemplateAsContext(batchedResponses);
 
     if (inputPromptTemplate === null) {
       console.error("No context found. Cannot proceed.");
@@ -116,7 +125,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         <FeedbackStep
           onNext={handleNext}
           onPrevious={handlePrevious}
-          responses={responses}
+          responses={batchedResponses}
           setOnNextCallback={setOnNextCallback}
         />
       )}
@@ -127,7 +136,9 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
           onPrevious={handlePrevious}
           criteria={criteria}
           setCriteria={setCriteria}
-          genCriteriaFromContext={() => genCriteriaFromContext(responses ?? [])}
+          genCriteriaFromContext={() =>
+            genCriteriaFromContext(batchedResponses)
+          }
           setOnNextCallback={setOnNextCallback}
         />
       )}
@@ -136,7 +147,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         <GradingResponsesStep
           onNext={handleNext}
           onPrevious={handlePrevious}
-          responses={responses}
+          responses={batchedResponses}
           criteria={criteria}
           setCriteria={setCriteria}
           setOnNextCallback={setOnNextCallback}
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
index d6610cf12..6ad4926f2 100644
--- a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -1,7 +1,20 @@
-import React, { useCallback, useEffect, useState } from "react";
-import { LLMResponse } from "../backend/typing";
-import { Button, Group, Stack, Text, Title } from "@mantine/core";
+import React, { useCallback, useEffect, useMemo, useState } from "react";
+import { Dict, LLMResponse, RatingDict } from "../backend/typing";
+import {
+  Button,
+  Center,
+  Flex,
+  Stack,
+  Text,
+  Textarea,
+  Title,
+} from "@mantine/core";
 import GradingView from "./GradingView";
+import { IconThumbDown, IconThumbUp } from "@tabler/icons-react";
+import { getRatingKeyForResponse } from "../ResponseRatingToolbar";
+import useStore from "../store";
+import { deepcopy } from "../backend/utils";
+import StorageCache from "../backend/cache";
 
 interface FeedbackStepProps {
   onNext: () => void;
@@ -19,14 +32,52 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
   const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
     undefined,
   );
-  const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
-    [],
-  );
   const [shownResponseIdx, setShownResponseIdx] = useState(0);
 
+  // Global state
+  const storeState = useStore<Dict<RatingDict>>((store) => store.state);
+  const setStoreState = useStore((store) => store.setState);
+
+  // Grade stored in the global store for the shown response;
+  // null when nothing is shown or no grade has been saved yet.
+  const grade = useMemo(() => {
+    if (!shownResponse) return null;
+    const key = getRatingKeyForResponse(shownResponse.uid, "grade");
+    const g = storeState[key];
+    // Only slot 0 is read; a missing/undefined slot counts as ungraded.
+    return g?.[0] ?? null;
+  }, [shownResponse, storeState]);
+  // Free-text note stored for the shown response; always a string.
+  const annotation = useMemo(() => {
+    if (!shownResponse) return "";
+    const key = getRatingKeyForResponse(shownResponse.uid, "note");
+    const a = storeState[key];
+    // A missing or null note reads as "" rather than undefined.
+    return a?.[0]?.toString() ?? "";
+  }, [shownResponse, storeState]);
+
+  // Set the rating in the global store, which *should* update the above.
+  const setRating = useCallback(
+    (
+      uid: string | undefined,
+      label: string,
+      payload: boolean | string | null,
+    ) => {
+      if (!uid) return;
+      const key = getRatingKeyForResponse(uid, label);
+      setStoreState(key, { 0: payload }); // TODO: This will erase any feedback given on n>1 responses in the input.
+      StorageCache.store(key, { 0: payload });
+    },
+    [setStoreState],
+  );
+  const setGrade = (val: boolean | null) =>
+    setRating(shownResponse?.uid, "grade", val);
+  const setAnnotation = (val: string) =>
+    setRating(shownResponse?.uid, "note", val);
+
   useEffect(() => {
     if (!responses || responses.length === 0) return;
-    setShownResponse(responses[0]);
+    setShownResponse(responses[0]); // We only show the first response if n>1 resps per prompt, for simplicity's sake
     setShownResponseIdx(0);
   }, [responses]);
 
@@ -58,11 +109,54 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
         gotoPrevResponse={prevResponse}
       />
 
-      {/* TODO: Implement thumbs up/down feedback UI with written comments */}
-      <Text>
-        TODO: Display LLM responses with thumbs up/down controls and comment
-        field
-      </Text>
+      <Flex justify="center" gap="50px" mb="xl">
+        <Button
+          color={grade === true ? "gray" : "red"}
+          variant={grade !== false ? "outline" : "filled"}
+          onClick={() => {
+            setGrade(grade !== false ? false : null);
+          }}
+        >
+          <IconThumbDown />
+          &nbsp;Bad!
+        </Button>
+        <Button
+          color={grade === false ? "gray" : "green"}
+          variant={grade !== true ? "outline" : "filled"}
+          onClick={() => {
+            setGrade(grade !== true ? true : null);
+          }}
+        >
+          <IconThumbUp />
+          &nbsp;Good!
+        </Button>
+      </Flex>
+      <Center mb={100}>
+        <Stack spacing="xs" w="80%">
+          <Text>What&apos;s the reason for your score?</Text>
+          <Flex align="center" justify="space-around" gap="lg">
+            <Textarea
+              value={annotation}
+              onChange={(e) => setAnnotation(e.currentTarget.value)}
+              disabled={grade === null}
+              autoFocus
+              w="100%"
+              onKeyDown={(e) => {
+                if (e.key === "Enter") {
+                  e.preventDefault();
+                  nextResponse();
+                }
+              }}
+            />
+            <Button
+              onClick={nextResponse}
+              disabled={grade === null || !annotation}
+            >
+              Submit and Next
+            </Button>
+          </Flex>
+        </Stack>
+      </Center>
     </Stack>
   );
 };
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index 766352979..1640c48fb 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -45,7 +45,10 @@ const GradingView: React.FC<GradingViewProps> = ({
     [shownResponse],
   );
 
-  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
+  const prompt = useMemo(
+    () => StringLookup.get(shownResponse?.prompt) ?? "",
+    [shownResponse],
+  );
   const varsDivs = useMemo(() => {
     const combined_vars_metavars = shownResponse
       ? {
@@ -66,7 +69,7 @@ const GradingView: React.FC<GradingViewProps> = ({
   }, [shownResponse]);
 
   return (
-    <Stack justify="space-between" mih={500}>
+    <Stack justify="space-between">
       <Box>
         {/* Top header */}
         <Flex justify="center">
diff --git a/chainforge/react-server/src/backend/typing.ts b/chainforge/react-server/src/backend/typing.ts
index 02b5691a1..83a7d70a4 100644
--- a/chainforge/react-server/src/backend/typing.ts
+++ b/chainforge/react-server/src/backend/typing.ts
@@ -277,4 +277,4 @@ export type TabularDataColType = {
 
 export type PythonInterpreter = "flask" | "pyodide";
 
-export type RatingDict = Record<number, boolean | string | undefined>;
+export type RatingDict = Record<number, boolean | string | null | undefined>;

From 10cc2108b1a006a33db5d045b82d86ec21fcfce9 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 19 Mar 2025 13:50:18 -0400
Subject: [PATCH 19/35] wip make executor use custom provider from global CF
 settings

---
 .../src/EvalGen/EvalGenWizard.tsx             | 147 ++++++++++++-
 .../src/EvalGen/GradeResponsesStep.tsx        | 199 +++++++++++++++---
 .../react-server/src/EvalGen/GradingView.tsx  |   2 +-
 .../src/EvalGen/PickCriteriaStep.tsx          |  32 +--
 chainforge/react-server/src/EvalGenModal.tsx  |  15 --
 .../src/backend/evalgen/executor.ts           |  59 +++---
 .../react-server/src/backend/evalgen/utils.ts |  47 +++--
 .../react-server/src/text-fields-node.css     |  11 +-
 8 files changed, 386 insertions(+), 126 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index 37b11e48c..d29df0086 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -1,9 +1,9 @@
-import React, { useCallback, useMemo, useState } from "react";
+import React, { useCallback, useEffect, useMemo, useState } from "react";
 import { EvalCriteria, EvalGenReport } from "../backend/evalgen/typing";
-import { LLMResponse } from "../backend/typing";
+import { Dict, LLMResponse, RatingDict } from "../backend/typing";
 import useStore from "../store";
 import { escapeBraces } from "../backend/template";
-import { StringLookup } from "../backend/cache";
+import StorageCache, { StringLookup } from "../backend/cache";
 import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
 import { Button, Flex, Modal, Stepper } from "@mantine/core";
 import WelcomeStep from "./WelcomeStep";
@@ -11,7 +11,10 @@ import FeedbackStep from "./FeedbackStep";
 import PickCriteriaStep from "./PickCriteriaStep";
 import ReportCardStep from "./ReportCardStep";
 import GradingResponsesStep from "./GradeResponsesStep";
-import { batchResponsesByUID } from "../backend/utils";
+import { batchResponsesByUID, deepcopy, sampleRandomElements } from "../backend/utils";
+import { getRatingKeyForResponse } from "../ResponseRatingToolbar";
+import EvaluationFunctionExecutor from "../backend/evalgen/executor";
+import { getAIFeaturesModels } from "../backend/ai";
 
 // Main wizard component props
 interface EvalGenWizardProps {
@@ -30,18 +33,129 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   // The active screen (stage) of EvalGen
   const [active, setActive] = useState(0);
 
+  // From global state
+  const apiKeys = useStore((state) => state.apiKeys);
+  const genAIFeaturesProvider = useStore((state) => state.aiFeaturesProvider);
+  // The provider chosen for AI features exposes a large and a small model;
+  // EvalGen refers to them as the "strong" and "weak" model respectively.
+  // Recomputed only when the provider changes.
+  const genAIModelNames = useMemo(() => {
+    const { large, small } = getAIFeaturesModels(genAIFeaturesProvider);
+    return { strong: large, weak: small };
+  }, [genAIFeaturesProvider]);
+
   // Regroup input responses by batch UID, whenever jsonResponses changes
   const batchedResponses = useMemo(
     () => (responses ? batchResponsesByUID(responses) : []),
     [responses],
   );
 
+  // Saves a rating payload for a response: the same key/value is written to the global store and to StorageCache
+  const setState = useStore((store) => store.setState);
+  const updateGlobalRating = useCallback(
+    (uid: string, label: string, payload: RatingDict) => {
+      const key = getRatingKeyForResponse(uid, label);
+      const safe_payload = deepcopy(payload); // presumably detaches the stored value from the caller's object; verify deepcopy
+      setState(key, safe_payload);
+      StorageCache.store(key, safe_payload);
+    },
+    [setState],
+  );
+
   // Criteria the user defines across the stages
   const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
   const [onNextCallback, setOnNextCallback] = useState(() => () => {});
 
-  // Global state
-  const apiKeys = useStore((state) => state.apiKeys);
+  // Per-criteria grades (indexed by uid of response, then uid of criteria)
+  const [perCriteriaGrades, setPerCriteriaGrades] = useState<Dict<Dict<boolean | undefined>>>({});
+  const [annotation, setAnnotation] = useState<string | undefined>(undefined);
+  const setPerCriteriaGrade = (
+    responseUID: string,
+    criteriaUID: string,
+    newGrade: boolean | undefined,
+  ) => {
+    setPerCriteriaGrades((grades) => {
+      // Build new objects rather than mutating the previous state in place.
+      const updated = { ...grades[responseUID], [criteriaUID]: newGrade };
+      updateGlobalRating(responseUID, "perCriteriaGrades", updated);
+      return { ...grades, [responseUID]: updated };
+    });
+  };
+  // Number of responses for which the user has graded at least one criterion
+  // (a grade counts once it is neither undefined nor null).
+  const numResponsesGraded = useMemo(
+    () =>
+      Object.values(perCriteriaGrades).filter((perCriterion) =>
+        Object.values(perCriterion).some((g) => g !== undefined && g !== null),
+      ).length,
+    [perCriteriaGrades],
+  );
+  // Grading thresholds: half of the responses (at most 10) to finish this step,
+  // and a quarter of them (at most 5) before the executor is created.
+  const [minNumToGrade, minNumToGradeToStartExecutor] = useMemo(() => {
+    const total = batchedResponses.length;
+    return [Math.min(10, Math.ceil(total * 0.5)), Math.min(5, Math.ceil(total * 0.25))];
+  }, [batchedResponses]);
+
+  // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
+  // :: Used on screen 4 (when `active` === 3).
+  const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
+    null,
+  );
+
+  // Logs and state from the EvalGen backend
+  const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
+  const [numCallsMade, setNumCallsMade] = useState({ strong: 0, weak: 0 });
+
+  // The samples to pass the executor / grading responses features. This will be bounded
+  // by maxNumSamplesForExecutor, instead of the whole dataset.
+  const samplesForExecutor = useMemo(() => {
+    // The max number of samples (responses) to pass the executor. This controls how many requests will
+    // need to be sent off and how many evaluation function executions are performed.
+    // TODO: Give the user some control over this.
+    const maxNumSamplesForExecutor = 16;
+
+    // Sample from the batched responses (the same list that is length-checked), if needed:
+    if (batchedResponses.length > maxNumSamplesForExecutor)
+      return sampleRandomElements(batchedResponses, maxNumSamplesForExecutor);
+    else return batchedResponses.slice();
+  }, [batchedResponses]);
+
+  // Create the executor once enough responses are graded; later, sync criteria.
+  useEffect(() => {
+    if (criteria.length === 0 || numResponsesGraded < minNumToGradeToStartExecutor) return;
+    if (!executor) {
+      const addLog = (message: string) => {
+        setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
+      };
+
+      const ex = new EvaluationFunctionExecutor(
+        // The constructor takes the small/large models as its first argument.
+        { small: genAIModelNames.weak, large: genAIModelNames.strong },
+        getLikelyPromptTemplateAsContext(samplesForExecutor) ?? "",
+        samplesForExecutor,
+        criteria,
+        (strong, weak) => {
+          // Add to the LLM call tallies without mutating the previous state.
+          setNumCallsMade((n_calls) => ({
+            strong: n_calls.strong + strong,
+            weak: n_calls.weak + weak,
+          }));
+        },
+        addLog,
+        undefined, // don't pass any holistic grades at this stage
+        perCriteriaGrades,
+      );
+      setExecutor(ex);
+
+      // ex.start((progress) => {
+      //   setExecProgress(progress?.success ?? 0);
+      // });
+    } else {
+      // Update criteria in executor
+      executor.addCriteria(criteria);
+    }
+  }, [criteria, samplesForExecutor, numResponsesGraded, minNumToGradeToStartExecutor]);
 
   const handleNext = useCallback(() => {
     setActive((current) => Math.min(4, current + 1));
@@ -97,7 +211,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     <Modal
       opened={opened}
       onClose={onClose}
-      title="EvalGen Wizard"
+      // title="EvalGen Wizard"
       size="90%"
       padding="md"
       // keepMounted
@@ -107,6 +221,11 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         inner: {
           padding: "5%", // This creates space around the modal (10% total)
         },
+        header: {
+          padding: "0px",
+          backgroundColor: "transparent",
+          // borderBottom: "1px solid black",
+        },
         content: {
           height: "100%", // Fill the available space
           maxHeight: "90vh", // Limit to 90% of viewport height
@@ -147,9 +266,13 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         <GradingResponsesStep
           onNext={handleNext}
           onPrevious={handlePrevious}
-          responses={batchedResponses}
+          executor={executor} logs={logs}
+          genAIModelNames={genAIModelNames} numCallsMade={numCallsMade}
+          responses={samplesForExecutor}  // This is deliberately not the entire list of responses, for now. 
           criteria={criteria}
           setCriteria={setCriteria}
+          grades={perCriteriaGrades}
+          setPerCriteriaGrade={setPerCriteriaGrade}
           setOnNextCallback={setOnNextCallback}
         />
       )}
@@ -181,8 +304,12 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
             &lt; Back
           </Button>
 
-          <Button onClick={handleNext} disabled={active === 4}>
-            Next &gt;
+          <Button 
+            color={active === 3 ? "green" : "blue"}
+            onClick={handleNext} 
+            disabled={active === 4 || (active === 3 && numResponsesGraded < minNumToGrade)}
+          >
+            {active === 3 ? (numResponsesGraded >= minNumToGrade ? "I think I'm done" : `Grade at least ${minNumToGrade - numResponsesGraded} more`) : "Next >"}
           </Button>
         </Flex>
       </div>
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
index 83ba89671..de0d9a19a 100644
--- a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -1,6 +1,6 @@
 import React, { useCallback, useEffect, useState } from "react";
 import { EvalCriteria } from "../backend/evalgen/typing";
-import { LLMResponse } from "../backend/typing";
+import { Dict, LLMResponse, RatingDict } from "../backend/typing";
 import {
   ActionIcon,
   Button,
@@ -9,8 +9,10 @@ import {
   Flex,
   Grid,
   Group,
+  Popover,
   Radio,
   rem,
+  ScrollArea,
   Skeleton,
   Stack,
   Text,
@@ -30,8 +32,9 @@ import {
   IconThumbUp,
   IconTrash,
 } from "@tabler/icons-react";
-import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
+import { generateLLMEvaluationCriteria, getPromptForGenEvalCriteriaFromDesc } from "../backend/evalgen/utils";
 import useStore from "../store";
+import EvaluationFunctionExecutor from "../backend/evalgen/executor";
 
 const ThumbUpDownButtons = ({
   grade,
@@ -42,6 +45,10 @@ const ThumbUpDownButtons = ({
   onChangeGrade: (newGrade: boolean | undefined) => void;
   getGradeCount: (grade: boolean | undefined) => number;
 }) => {
+
+  const true_count = getGradeCount(true);
+  const false_count = getGradeCount(false);
+
   return (
     <>
       {/* Thumbs up/down buttons */}
@@ -56,8 +63,8 @@ const ThumbUpDownButtons = ({
         }}
       >
         <div className="gradeContainer">
-          <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
-          <div className="gradeUpCount">{getGradeCount(true)}</div>
+          <IconThumbUp size="20pt" fill={grade === true ? "#aea" : "white"} />
+          {true_count > 0 && <div className="gradeUpCount">{true_count}</div>}
         </div>
       </Button>
       <Button
@@ -72,10 +79,10 @@ const ThumbUpDownButtons = ({
       >
         <div className="gradeContainer">
           <IconThumbDown
-            size="14pt"
+            size="20pt"
             fill={grade === false ? "pink" : "white"}
           />
-          <div className="gradeDownCount">{getGradeCount(false)}</div>
+          {false_count > 0 && <div className="gradeDownCount">{false_count}</div>}
         </div>
       </Button>
     </>
@@ -188,6 +195,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
       <Textarea
         value={criterion.criteria}
         placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+        size="xs"
         ml={38}
         onChange={(e) => {
           criterion.criteria = e.target.value;
@@ -202,6 +210,8 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
             color: "#444",
             background: "transparent",
             lineHeight: 1.1,
+            paddingTop: "4px !important",
+            paddingBottom: "4px !important",
           },
         }}
         autosize
@@ -218,18 +228,30 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
 interface GradingResponsesStepProps {
   onNext: () => void;
   onPrevious: () => void;
+  executor: EvaluationFunctionExecutor | null;
+  logs: { date: Date; message: string }[];
+  genAIModelNames: { strong: string; weak: string };
+  numCallsMade: { strong: number; weak: number };
   responses: LLMResponse[];
   criteria: EvalCriteria[];
   setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
+  grades: Dict<Dict<boolean | undefined>>;  // per-criteria grades
+  setPerCriteriaGrade: (responseUID: string, criteriaUID: string, newGrade: boolean | undefined) => void;
   setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
 }
 
 const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
   onNext,
   onPrevious,
+  executor,
+  logs,
+  genAIModelNames,
+  numCallsMade,
   responses,
   criteria,
   setCriteria,
+  grades,
+  setPerCriteriaGrade,
   setOnNextCallback,
 }) => {
   const apiKeys = useStore((state) => state.apiKeys);
@@ -241,6 +263,22 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
   );
   const [shownResponseIdx, setShownResponseIdx] = useState(0);
 
+  const [newCriteriaDesc, setNewCriteriaDesc] = useState("");
+
+  const getStateValue = (stateId: number) => { // NOTE(review): looks like a placeholder; stateId is ignored
+    return Math.floor(Math.random() * 30 + 6); // random integer in [6, 35]
+  };
+  // Count how many responses currently hold exactly `grade` for the criterion.
+  const getGradeCount = (
+    criteriaUID: string,
+    grade: boolean | undefined,
+  ) =>
+    Object.values(grades).reduce(
+      (total, perCriterion) =>
+        total + (perCriterion[criteriaUID] === grade ? 1 : 0),
+      0,
+    );
+
   useEffect(() => {
     if (!responses || responses.length === 0) return;
     setShownResponse(responses[0]);
@@ -348,8 +386,36 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
         }
         // Remove a loading Skeleton
         setIsLoadingCriteria((num) => num - 1);
+        // setNumGPT4Calls((num) => num + 1);
+      })
+      .catch((err) => {
+        console.error(err);
+        setIsLoadingCriteria((num) => num - 1);
+      });
+  };
 
-        setNumGPT4Calls((num) => num + 1);
+  const addCriteria = (desc: string) => {
+    // Add a loading Skeleton
+    setIsLoadingCriteria((num) => num + 1);
+    // Make async LLM call to expand criteria
+    generateLLMEvaluationCriteria(
+      "",
+      apiKeys,
+      getPromptForGenEvalCriteriaFromDesc(desc), // prompt
+      null, // system_msg
+    )
+      .then((evalCrits) => {
+        // Take only the first suggested by the model, if any
+        const firstCrit = evalCrits[0];
+        if (firstCrit) {
+          setCriteria((crit) =>
+            crit.concat([{ ...firstCrit, uid: uuid() }]),
+          );
+        } else {
+          console.warn("No criteria could be generated from the description.");
+        }
+        // Remove a loading Skeleton
+        setIsLoadingCriteria((num) => num - 1);
       })
       .catch((err) => {
         console.error(err);
@@ -358,7 +424,7 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
   };
 
   return (
-    <Grid h={window?.innerHeight * 0.8}>
+    <Grid h="100%">
       <Grid.Col span={8}>
         <Stack justify="space-between">
           {/* View showing the response the user is currently grading */}
@@ -370,6 +436,47 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
             gotoPrevResponse={prevResponse}
           />
 
+          <Flex direction="column">
+              <Flex justify="space-between" align="center">
+                <Text size="lg" weight={500} mb="sm">
+                  LLM Activity
+                </Text>
+                {/* GPT Call Tally */}
+                <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+                  Executed {numCallsMade.strong} {genAIModelNames.strong} calls and {numCallsMade.weak}{" "}
+                  {genAIModelNames.weak} calls.
+                </Text>
+              </Flex>
+              <div
+                style={{
+                  backgroundColor: "#f0f0f0",
+                  color: "#333",
+                  fontFamily: "monospace",
+                  padding: "12px",
+                  width: "calc(100% - 30px)",
+                  height: "200px",
+                  overflowY: "auto",
+                  borderRadius: "8px",
+                  border: "1px solid #ddd",
+                  marginRight: "20px", // Space on the right
+                }}
+                ref={(el) => {
+                  if (el) {
+                    el.scrollTop = el.scrollHeight;
+                  }
+                }}
+              >
+                {logs.map((log, index) => (
+                  <div key={index}>
+                    <span style={{ color: "#4A90E2" }}>
+                      {log.date.toLocaleString()} -{" "}
+                    </span>
+                    <span>{log.message}</span>
+                  </div>
+                ))}
+              </div>
+            </Flex>
+
           {/* Progress bar */}
           {/* <Flex justify="left" align="center" gap="md">
                     <Stack w="100%" spacing={4}>
@@ -381,28 +488,30 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                   </Flex> */}
         </Stack>
       </Grid.Col>
-      <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
+      <Grid.Col span={4} bg="#eee" pt="16px" h="100%" style={{ boxShadow: "-10px 0px 20px #aaa" }}>
         <Center>
           <Title order={3} ml={8} mt="sm" mb="md">
-            Rubric
+            Per-criteria grading
           </Title>
         </Center>
 
+        <ScrollArea h="75%" offsetScrollbars style={{ border: "1px solid #ccc" }}>
         <div
           style={{
             display: "flex",
             flexDirection: "column",
+            marginBottom: "40px"
           }}
         >
           <div style={{ flex: 2, overflowY: "auto" }}>
-            {criteriaForDisplay.map((e) => (
+            {criteria.map((e) => (
               <CriteriaCard
                 criterion={e}
                 key={e.uid}
                 onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
                 onDelete={() => handleDeleteCriteria(e.uid)}
                 grade={
-                  shownResponse ? grades[shownResponse.uid][e.uid] : undefined
+                  (shownResponse && grades[shownResponse.uid]) ? grades[shownResponse.uid][e.uid] : undefined
                 }
                 getGradeCount={(grade) => {
                   return shownResponse
@@ -431,25 +540,38 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
             ) : (
               <></>
             )}
+          </div>
 
-            <div className="criteriaButtons">
+          <div className="criteriaButtons">
+            {/* <Popover withArrow>
+              <Popover.Target>
               <Button
                 leftIcon={<IconPencil size={14} />}
                 variant="subtle"
                 color="gray"
                 // gradient={{ from: "blue", to: "green", deg: 90 }}
-                onClick={() => {
-                  handleAddCriteria({
-                    shortname: "New Criteria",
-                    criteria: "",
-                    eval_method: "code",
-                    priority: 0,
-                    uid: uuid(),
-                  });
-                }}
+                // onClick={() => {
+                //   handleAddCriteria({
+                //     shortname: "New Criteria",
+                //     criteria: "",
+                //     eval_method: "code",
+                //     priority: 0,
+                //     uid: uuid(),
+                //   });
+                // }}
               >
                 Add a new criteria
               </Button>
+              </Popover.Target>
+              <Popover.Dropdown>
+                <Flex justify="space-around" align="center" gap="md">
+                  <Textarea label="Describe the criteria:">Hello</Textarea>
+                  <Button>Submit</Button>
+                </Flex>
+                
+              </Popover.Dropdown>
+            </Popover> */}
+              
               {/* <Button
                 leftIcon={<IconSparkles size={14} />}
                 variant="subtle"
@@ -462,20 +584,22 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                 Suggest Criteria
               </Button> */}
             </div>
-          </div>
+          
+
+
 
-          <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+          {/* <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
             <Divider mt="lg" />
             <Title mb="0px" order={4}>
-              Suggest New Criteria Based on the Feedback
+              Suggest New Criteria
             </Title>
             <Textarea
               value={annotation}
               onChange={(e) => setAnnotation(e.target.value)}
               description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
               mb="sm"
-            />
-            <Radio.Group
+            /> */}
+            {/* <Radio.Group
               name="favoriteFramework"
               label="Rate the response holistically:"
               value={holisticGrade}
@@ -510,9 +634,26 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                   + Submit Feedback
                 </Button>
               </Group>
-            </Radio.Group>
-          </Stack>
+            </Radio.Group> */}
+          {/* </Stack> */}
         </div>
+
+        <Textarea value={newCriteriaDesc} onChange={(e) => setNewCriteriaDesc(e.currentTarget.value)} label="Add new criteria:" placeholder="Describe the criteria to add." ml="md" mr="md"></Textarea>
+        <Group position="right" mr="md" mt="sm">
+        <Button
+          color="green"
+          variant="filled"
+          disabled={newCriteriaDesc?.trim().length === 0 || isLoadingCriteria > 0}
+          onClick={() => {
+            addCriteria(newCriteriaDesc);
+            setNewCriteriaDesc("");
+          }}
+        >
+          + Add criteria
+        </Button>
+        </Group>
+
+        </ScrollArea>
       </Grid.Col>
     </Grid>
   );
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index 1640c48fb..e26927992 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -110,7 +110,7 @@ const GradingView: React.FC<GradingViewProps> = ({
 
           {/* Go forward to the next response */}
           <Tooltip label="To next response" withArrow>
-            <Button variant="white" color="dark" onClick={gotoNextResponse}>
+            <Button variant="white" color="dark" bg="transparent" onClick={gotoNextResponse}>
               <IconChevronRight />
             </Button>
           </Tooltip>
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index 0b1867d42..27d2fae36 100644
--- a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -33,7 +33,7 @@ import {
 } from "@tabler/icons-react";
 import useStore from "../store";
 import { accuracyToColor, cmatrixTextAnnotations } from "../backend/utils";
-import { generateLLMEvaluationCriteria } from "../backend/evalgen/utils";
+import { generateLLMEvaluationCriteria, getPromptForGenEvalCriteriaFromDesc } from "../backend/evalgen/utils";
 import { v4 as uuid } from "uuid";
 import Plot from "react-plotly.js";
 
@@ -402,14 +402,7 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
     generateLLMEvaluationCriteria(
       "",
       apiKeys,
-      `I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. 
-
-CRITERIA: 
-\`\`\`
-${addCriteriaValue}
-\`\`\`
-
-Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
+      getPromptForGenEvalCriteriaFromDesc(addCriteriaValue), // prompt
       null, // system_msg
     )
       .then((evalCrits) => {
@@ -430,6 +423,7 @@ Your response should contain a short title for the criteria ("shortname"), a des
         setIsLoadingCriteria((num) => num - 1);
       });
   };
+  
   const updateCriteria = (
     newValue: string,
     critIdx: number,
@@ -449,7 +443,7 @@ Your response should contain a short title for the criteria ("shortname"), a des
   };
 
   return (
-    <Stack spacing="lg">
+    <Stack spacing="lg" p="xl">
       <Title order={3}>Define Evaluation Criteria</Title>
 
       <div>
@@ -467,10 +461,10 @@ Your response should contain a short title for the criteria ("shortname"), a des
 
         <Flex align="center" gap="lg">
           <TextInput
-            label="Type a new criteria to add, then press Enter:"
+            label="Describe a new criterion to add, then press Enter:"
             value={addCriteriaValue}
             onChange={(evt) => setAddCriteriaValue(evt.currentTarget.value)}
-            placeholder="the response is valid JSON"
+            placeholder="e.g., the response is valid JSON"
             mb="lg"
             pl="sm"
             pr="sm"
@@ -485,6 +479,16 @@ Your response should contain a short title for the criteria ("shortname"), a des
           />
           <Button
             variant="filled"
+            disabled={addCriteriaValue?.trim().length === 0}
+            onClick={() => {
+              addCriteria();
+              setAddCriteriaValue("");
+            }}
+          >
+            Generate
+          </Button>
+          <Button
+            variant="outline"
             onClick={() => {
               if (isLoadingCriteria > 0) return;
               setIsLoadingCriteria(3);
@@ -540,7 +544,7 @@ Your response should contain a short title for the criteria ("shortname"), a des
         </ScrollArea>
       </div>
 
-      <Group position="apart" mt="xl">
+      {/* <Group position="apart" mt="xl">
         <Button variant="default" onClick={onPrevious}>
           Back
         </Button>
@@ -557,7 +561,7 @@ Your response should contain a short title for the criteria ("shortname"), a des
             Ready to Grade!
           </Button>
         </Tooltip>
-      </Group>
+      </Group> */}
     </Stack>
   );
 };
diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 88444038c..9ea550273 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -428,21 +428,6 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
         count += grade === grades[respUid][criteriaUID] ? 1 : 0;
       }
       return count;
-
-      // if (grades[responseUID]) {
-      //   let count = 0;
-      //   for (const critUid in grades[responseUID]) {
-      //     count += grades[responseUID][critUid] ? 1 : 0;
-      //   }
-      //   // return grade === grades[responseUID][criteriaUID] ? 1 : 0; // this needs to be changed after the grading feature is fully implemented on server side.
-      //   return count;
-      //   // return 10;
-      // }
-
-      // if (grades[responseUID]) {
-      //   return grade === grades[responseUID][criteriaUID] ? 1 : 0; // this needs to be changed after the grading feature is fully implemented on server side.
-      // }
-      // return 0;
     };
 
     // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 09d9e0d8e..07532cd5d 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -11,7 +11,7 @@ import {
   EvalFunctionSetReport,
   EvalCriteriaUID,
 } from "./typing";
-import { LLMResponse, ResponseUID, QueryProgress, Dict } from "../typing";
+import { LLMResponse, ResponseUID, QueryProgress, Dict, LLMSpec } from "../typing";
 import { EventEmitter } from "events";
 
 /**
@@ -66,6 +66,7 @@ export default class EvaluationFunctionExecutor {
   private scores: Map<ResponseUID, number>;
   // Cache function results for each example
   private resultsCache: Map<EvalFunction, Map<ResponseUID, EvalFunctionResult>>;
+  private llms: { small: string | LLMSpec, large: string | LLMSpec };
   private grades: Map<ResponseUID, boolean>; // Grades for all examples
   private perCriteriaGrades: Dict<Dict<boolean | undefined>>; // Grades per criteria
   private annotations: Dict<string>; // Annotations for each response
@@ -77,22 +78,23 @@ export default class EvaluationFunctionExecutor {
   private backgroundTaskPromise: Promise<void> | null = null; // To keep track of the background task for generating and executing evaluation functions
   private criteriaQueue: EvalCriteria[] = []; // Queue for new criteria to be processed
   private processing = false; // To keep track of whether we are currently processing a criteria
-  private updateGPTCalls: (numGPT4Calls: number, numGPT35Calls: number) => void;
+  private updateNumLLMCalls: (numStrongModelCalls: number, numWeakModelCalls: number) => void;
   private logFunction: (logMessage: string) => void;
 
   /**
    * Initializes a new instance of the EvaluationFunctionExecutor class.
    *
    * @param evalCriteria The criteria used to generate evaluation functions. Provided/confirmed by the developer.
-   * @param promptTemplate The prompt demplate for the developer's LLM chain. This is useful for GPT-4 to generate correct evaluation functions.
+   * @param promptTemplate The prompt template for the developer's LLM chain. This is useful for the LLM to generate correct evaluation functions.
    * @param examples A set of variable-prompt-response triples that we want the developer to grade (and use for filtering incorrect evaluation functions).
    * @param existingGrades Optional. A dict in format {uid: grade}, containing existing grades.
    */
   constructor(
+    genAIModels: { small: string | LLMSpec, large: string | LLMSpec },
     promptTemplate: string,
     examples: LLMResponse[],
     evalCriteria: EvalCriteria[] = [],
-    updateGPTCalls: (numGPT4Calls: number, numGPT35Calls: number) => void,
+    updateNumLLMCalls: (numStrongModelCalls: number, numWeakModelCalls: number) => void,
     addLog: (log: string) => void,
     existingGrades?: Record<ResponseUID, boolean>,
     existingPerCriteriaGrades?: Dict<Dict<boolean | undefined>>,
@@ -108,6 +110,7 @@ export default class EvaluationFunctionExecutor {
     this.examples = examples;
     this.evalCriteria = evalCriteria;
     this.promptTemplate = promptTemplate;
+    this.llms = genAIModels;
 
     // Set scores and grades to default values of 0
     this.scores = new Map<ResponseUID, number>();
@@ -141,7 +144,7 @@ export default class EvaluationFunctionExecutor {
     this.criteriaQueue = [];
     this.processing = false;
 
-    this.updateGPTCalls = updateGPTCalls;
+    this.updateNumLLMCalls = updateNumLLMCalls;
     this.logFunction = addLog;
   }
 
@@ -216,14 +219,15 @@ export default class EvaluationFunctionExecutor {
 
           const result = await funcToExecute(
             evalFunction,
+            this.llms.small,
             example,
             randomPositiveExample,
             randomNegativeExample,
           );
 
-          // Update GPT-3.5 call count by 1 if the eval method is expert
+          // Update weak model call count by 1 if the eval method is expert
           if (evalFunction.evalCriteria.eval_method === "expert") {
-            this.updateGPTCalls(0, 1);
+            this.updateNumLLMCalls(0, 1);
           }
 
           if (onProgress) {
@@ -263,8 +267,8 @@ export default class EvaluationFunctionExecutor {
       emitter,
       badExample,
     );
-    // Update GPT-4o call count by 1
-    this.updateGPTCalls(1, 0);
+    // Update strong model call count by 1
+    this.updateNumLLMCalls(1, 0);
 
     console.log(`Generated functions for criteria: ${criteria.shortname}`);
     console.log(
@@ -335,14 +339,15 @@ export default class EvaluationFunctionExecutor {
           // Run the function on the example and if there's an error, increment skipped
           const result = await funcToExecute(
             evalFunction,
+            this.llms.small,
             example,
             randomPositiveExample,
             randomNegativeExample,
           );
 
-          // Update GPT-3.5 call count by 1 if the eval method is expert
+          // Update weak model call count by 1 if the eval method is expert
           if (evalFunction.evalCriteria.eval_method === "expert") {
-            this.updateGPTCalls(0, 1);
+            this.updateNumLLMCalls(0, 1);
           }
 
           funcsExecuted++;
@@ -382,8 +387,8 @@ export default class EvaluationFunctionExecutor {
         emitter, // Pass the EventEmitter instance
       ).then(() => {
         emitter.emit("criteriaProcessed");
-        // Update GPT-4o call count by 1
-        this.updateGPTCalls(1, 0);
+        // Update strong model call count by 1
+        this.updateNumLLMCalls(1, 0);
       });
     });
 
@@ -438,9 +443,12 @@ export default class EvaluationFunctionExecutor {
    * @param criteria The new evaluation criteria to be added.
    */
   public addCriteria(criteriaList: EvalCriteria[]): void {
+    // Remove criteria that are no longer present in the incoming list
+    this.evalCriteria = this.evalCriteria.filter((c) => criteriaList.includes(c));
+
     // See if there are new criteria to add
     for (const criteria of criteriaList) {
-      if (this.evalCriteria.includes(criteria)) {
+      if (this.evalCriteria.includes(criteria)) {  // criteria already included
         continue;
       }
 
@@ -453,14 +461,6 @@ export default class EvaluationFunctionExecutor {
         this.processNextCriteria();
       }
     }
-
-    // See if there are criteria to remove
-    for (const criteria of this.evalCriteria) {
-      if (!criteriaList.includes(criteria)) {
-        console.log(`Removing criteria: ${criteria.shortname}`);
-        this.evalCriteria = this.evalCriteria.filter((c) => c !== criteria);
-      }
-    }
   }
 
   private async processNextCriteria() {
@@ -591,7 +591,7 @@ export default class EvaluationFunctionExecutor {
       this.grades.set(exampleId, boolHolistic);
     }
 
-    if (perCriteriaGrades !== null) {
+    if (perCriteriaGrades) {
       this.perCriteriaGrades[exampleId] = perCriteriaGrades;
 
       // If holisticGrade was null, set it based on the perCriteriaGrades---if all criteria in the perCriteriaGrades are true, set the holisticGrade to true, else false
@@ -603,7 +603,7 @@ export default class EvaluationFunctionExecutor {
       }
     }
 
-    if (annotation !== null) {
+    if (annotation) {
       this.annotations[exampleId] = annotation;
     }
 
@@ -662,13 +662,6 @@ export default class EvaluationFunctionExecutor {
     for (const example of examples) {
       this.scores.set(example.uid, 0);
     }
-
-    // Set grades if examples contain them
-    for (const example of examples) {
-      if (example.metavars.grade !== undefined) {
-        this.grades.set(example.uid, example.metavars.grade);
-      }
-    }
   }
 
   /**
@@ -775,7 +768,7 @@ export default class EvaluationFunctionExecutor {
           evalFunction.evalCriteria.eval_method === "code"
             ? execPyFunc
             : executeLLMEval;
-        const result = await funcToExecute(evalFunction, example);
+        const result = await funcToExecute(evalFunction, this.llms.small, example);
 
         // Put result in cache
         if (!this.resultsCache.has(evalFunction)) {
@@ -1013,7 +1006,7 @@ export default class EvaluationFunctionExecutor {
           evalFunction.evalCriteria.eval_method === "code"
             ? execPyFunc
             : executeLLMEval;
-        const result = await funcToExecute(evalFunction, example);
+        const result = await funcToExecute(evalFunction, this.llms.small, example);
 
         // Put result in cache
         if (!this.resultsCache.has(evalFunction)) {
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index e3e585ee4..6ea69676f 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -9,7 +9,7 @@ import {
   EvalFunctionResult,
   validEvalCriteriaFormat,
 } from "./typing";
-import { Dict, LLMResponse } from "../typing";
+import { Dict, LLMResponse, LLMSpec } from "../typing";
 import { executejs, executepy, simpleQueryLLM } from "../backend";
 import {
   getVarsAndMetavars,
@@ -48,6 +48,7 @@ function extractJSONBlocks(mdText: string): string[] | undefined {
  */
 export async function generateLLMEvaluationCriteria(
   prompt: string,
+  llm: string | LLMSpec,
   apiKeys?: Dict,
   promptTemplate?: string, // overrides prompt template used
   systemMsg?: string | null, // overrides default system message, if present. Use null to specify empty.
@@ -65,7 +66,7 @@ export async function generateLLMEvaluationCriteria(
   async function _query() {
     const result = await simpleQueryLLM(
       detailedPrompt, // prompt
-      "gpt-4o", // llm
+      typeof llm === "string" ? llm : [llm], // llm
       // spec, // llm
       systemMsg !== undefined
         ? systemMsg === null
@@ -114,11 +115,23 @@ export async function generateLLMEvaluationCriteria(
   return retryAsyncFunc(_query, 3);
 }
 
+export function getPromptForGenEvalCriteriaFromDesc(desc: string) {
+  return `I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. 
+
+CRITERIA: 
+\`\`\`
+${desc}
+\`\`\`
+
+Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`;
+}
+
 export async function executeLLMEval(
   evalFunction: EvalFunction,
+  llm: string | LLMSpec,
   example: LLMResponse,
-  positiveExample: LLMResponse,
-  negativeExample: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
 ): Promise<EvalFunctionResult> {
   // Construct call to an LLM to evaluate the example
   const evalPrompt =
@@ -128,30 +141,25 @@ export async function executeLLMEval(
     example.responses[0] +
     "\n```";
 
-  // Sleep a random number of seconds between 1 and 30
-  // const sleep = (ms: number) =>
-  //   new Promise((resolve) => setTimeout(resolve, ms));
-  // await sleep(Math.floor(Math.random() * 30) * 1000);
-
   // Query an LLM as an evaluator
   let systemMessage = "You are an expert evaluator.";
   if (
     positiveExample &&
-    positiveExample.responses[0] &&
+    positiveExample.responses.length > 0 &&
     negativeExample &&
-    negativeExample.responses[0]
+    negativeExample.responses.length > 0
   ) {
     systemMessage +=
-      " Please consider the following good example: " +
-      positiveExample.responses[0] +
-      " and bad example: " +
-      negativeExample.responses[0] +
-      " when making your evaluation.";
+      " Please consider the following GOOD example: \n" +
+      llmResponseDataToString(positiveExample.responses[0]) +
+      "\nand BAD example: \n" +
+      llmResponseDataToString(negativeExample.responses[0]) +
+      "\nwhen making your evaluation.";
   }
 
   const result = await simpleQueryLLM(
     evalPrompt, // prompt
-    "gpt-3.5-turbo-16k", // llm
+    typeof llm === "string" ? llm : [llm], // llm
     systemMessage, // system_msg
   );
   // Get the output
@@ -223,9 +231,10 @@ export async function execJSFunc(
  */
 export async function execPyFunc(
   evalFunction: EvalFunction,
+  llm: string | LLMSpec, // not used, but provided for consistency with the other exec func signature
   example: LLMResponse,
-  positiveExample: LLMResponse,
-  negativeExample: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
 ): Promise<EvalFunctionResult> {
   try {
     // We need to replace the function name with "evaluate", which is what is expected by backend:
diff --git a/chainforge/react-server/src/text-fields-node.css b/chainforge/react-server/src/text-fields-node.css
index 0069509e5..f83c0ea0f 100644
--- a/chainforge/react-server/src/text-fields-node.css
+++ b/chainforge/react-server/src/text-fields-node.css
@@ -1321,20 +1321,21 @@ th .content-editable-div {
 
 .gradeContainer {
   position: relative;
-  width: 20px;
+  overflow: visible;
+  /* width: 20px; */
 }
 
 .gradeUpCount {
   position: absolute;
-  left: 12px;
-  top: -5px;
+  right: 0px;
+  top: -3px;
   font-size: x-small;
 }
 
 .gradeDownCount {
   position: absolute;
-  left: 13px;
-  top: 13px;
+  right: 0px;
+  bottom: 0px;
   font-size: x-small;
 }
 

From 6b2b3cf7abc5f9af1c6e36cf879847129f286959 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 19 Mar 2025 19:34:29 -0400
Subject: [PATCH 20/35] wip getting executor to work

---
 .../src/EvalGen/EvalGenWizard.tsx             | 102 +++--
 .../react-server/src/EvalGen/FeedbackStep.tsx |   2 -
 .../src/EvalGen/GradeResponsesStep.tsx        | 266 ++++++------
 .../react-server/src/EvalGen/GradingView.tsx  |   7 +-
 .../src/EvalGen/PickCriteriaStep.tsx          |  10 +-
 chainforge/react-server/src/EvalGenModal.tsx  |   2 +-
 .../src/ResponseRatingToolbar.tsx             |   2 +-
 chainforge/react-server/src/backend/ai.ts     |   4 +-
 .../src/backend/evalgen/executor.ts           |  56 ++-
 .../src/backend/evalgen/oai_utils.ts          | 382 +++---------------
 .../react-server/src/backend/evalgen/utils.ts |  25 +-
 11 files changed, 359 insertions(+), 499 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index d29df0086..93633f45a 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -11,7 +11,11 @@ import FeedbackStep from "./FeedbackStep";
 import PickCriteriaStep from "./PickCriteriaStep";
 import ReportCardStep from "./ReportCardStep";
 import GradingResponsesStep from "./GradeResponsesStep";
-import { batchResponsesByUID, deepcopy, sampleRandomElements } from "../backend/utils";
+import {
+  batchResponsesByUID,
+  deepcopy,
+  sampleRandomElements,
+} from "../backend/utils";
 import { getRatingKeyForResponse } from "../ResponseRatingToolbar";
 import EvaluationFunctionExecutor from "../backend/evalgen/executor";
 import { getAIFeaturesModels } from "../backend/ai";
@@ -39,9 +43,9 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   const genAIModelNames = useMemo(() => {
     const models = getAIFeaturesModels(genAIFeaturesProvider);
     return {
-      strong: models.large,
-      weak: models.small,
-    }
+      large: models.large,
+      small: models.small,
+    };
   }, [genAIFeaturesProvider]);
 
   // Regroup input responses by batch UID, whenever jsonResponses changes
@@ -67,7 +71,9 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   const [onNextCallback, setOnNextCallback] = useState(() => () => {});
 
   // Per-criteria grades (indexed by uid of response, then uid of criteria)
-  const [perCriteriaGrades, setPerCriteriaGrades] = useState<Dict<Dict<boolean | undefined>>>({});
+  const [perCriteriaGrades, setPerCriteriaGrades] = useState<
+    Dict<Dict<boolean | undefined>>
+  >({});
   const [annotation, setAnnotation] = useState<string | undefined>(undefined);
   const setPerCriteriaGrade = (
     responseUID: string,
@@ -78,6 +84,12 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
       if (!grades[responseUID]) grades[responseUID] = {};
       grades[responseUID][criteriaUID] = newGrade;
       updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
+
+      // If the EvalGen executor is running, update the per-criteria grade for this sample:
+      executor?.setGradeForExample(
+        responseUID,
+        grades[responseUID]);
+
       return { ...grades };
     });
   };
@@ -85,16 +97,16 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     let count = 0;
     for (const uid in perCriteriaGrades) {
       const gs = perCriteriaGrades[uid];
-      if (Object.values(gs).some(v => (v !== undefined && v !== null)))
-        count += 1; 
+      if (Object.values(gs).some((v) => v !== undefined && v !== null))
+        count += 1;
     }
-    return count; 
+    return count;
   }, [perCriteriaGrades]);
   const minNumToGrade = useMemo(() => {
-    return Math.min(10, Math.ceil(batchedResponses.length * 0.5))
+    return Math.min(10, Math.ceil(batchedResponses.length * 0.5));
   }, [batchedResponses]);
   const minNumToGradeToStartExecutor = useMemo(() => {
-    return Math.min(5, Math.ceil(batchedResponses.length * 0.25))
+    return Math.min(5, Math.ceil(batchedResponses.length * 0.25));
   }, [batchedResponses]);
 
   // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
@@ -106,6 +118,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   // Logs and state from the EvalGen backend
   const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
   const [numCallsMade, setNumCallsMade] = useState({ strong: 0, weak: 0 });
+  const [execProgress, setExecProgress] = useState(0);
 
   // The samples to pass the executor / grading responses features. This will be bounded
   // by maxNumSamplesForExecutor, instead of the whole dataset.
@@ -121,15 +134,35 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     else return batchedResponses.slice();
   }, [batchedResponses]);
 
+  // When the user is done per-criteria grading
+  const handleDonePerCriteriaGrading = useCallback(async () => {
+    // Await completion of all gen + execution of eval funcs
+    await executor?.waitForCompletion();
+
+    // Filtering eval funcs by grades and present results
+    const filteredFunctions = await executor?.filterEvaluationFunctions(0.25);
+    console.log("Filtered Functions: ", filteredFunctions);
+
+    // Return selected implementations to caller
+    // TODO
+    console.warn(filteredFunctions);
+  }, [executor]);
+
   // Update executor whenever resps, grades, or criteria change
   useEffect(() => {
-    if (criteria.length === 0 || numResponsesGraded < minNumToGradeToStartExecutor) return; 
+    if (
+      criteria.length === 0 ||
+      numResponsesGraded < minNumToGradeToStartExecutor
+    )
+      return;
     if (!executor) {
       const addLog = (message: string) => {
         setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
       };
 
       const ex = new EvaluationFunctionExecutor(
+        genAIModelNames,
+        apiKeys,
         getLikelyPromptTemplateAsContext(samplesForExecutor) ?? "",
         samplesForExecutor,
         criteria,
@@ -138,24 +171,29 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
           setNumCallsMade((n_calls) => {
             n_calls.strong += strong;
             n_calls.weak += weak;
-            return {...n_calls};
+            return { ...n_calls };
           });
         },
         addLog,
-        undefined,  // don't pass any holistic grades at this stage
+        undefined, // don't pass any holistic grades at this stage
         perCriteriaGrades,
       );
       setExecutor(ex);
 
-      // ex.start((progress) => {
-      //   setExecProgress(progress?.success ?? 0);
-      // });
+      // Start executor process
+      ex.start((progress) => {
+        setExecProgress(progress?.success ?? 0);
+      });
     } else if (executor) {
       // Update criteria in executor
-      executor.addCriteria(criteria);
+      executor.updateCriteria(criteria);
     }
-
-  }, [criteria, samplesForExecutor, numResponsesGraded, minNumToGradeToStartExecutor]);
+  }, [
+    criteria,
+    samplesForExecutor,
+    numResponsesGraded,
+    minNumToGradeToStartExecutor,
+  ]);
 
   const handleNext = useCallback(() => {
     setActive((current) => Math.min(4, current + 1));
@@ -204,7 +242,11 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     }
 
     // Attempt to generate criteria using an LLM
-    return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
+    return await generateLLMEvaluationCriteria(
+      inputPromptTemplate,
+      genAIModelNames.large,
+      apiKeys,
+    );
   }
 
   return (
@@ -258,6 +300,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
           genCriteriaFromContext={() =>
             genCriteriaFromContext(batchedResponses)
           }
+          genAIModelNames={genAIModelNames}
           setOnNextCallback={setOnNextCallback}
         />
       )}
@@ -266,9 +309,11 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
         <GradingResponsesStep
           onNext={handleNext}
           onPrevious={handlePrevious}
+          genAIModelNames={genAIModelNames}
+          numCallsMade={numCallsMade}
           executor={executor}
           logs={logs}
-          responses={samplesForExecutor}  // This is deliberately not the entire list of responses, for now. 
+          responses={samplesForExecutor} // This is deliberately not the entire list of responses, for now.
           criteria={criteria}
           setCriteria={setCriteria}
           grades={perCriteriaGrades}
@@ -304,12 +349,19 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
             &lt; Back
           </Button>
 
-          <Button 
+          <Button
             color={active === 3 ? "green" : "blue"}
-            onClick={handleNext} 
-            disabled={active === 4 || (active === 3 && numResponsesGraded < minNumToGrade)}
+            onClick={active !== 3 ? handleNext : handleDonePerCriteriaGrading}
+            disabled={
+              active === 4 ||
+              (active === 3 && numResponsesGraded < minNumToGrade)
+            }
           >
-            {active === 3 ? (numResponsesGraded >= minNumToGrade ? "I think I'm done" : `Grade at least ${minNumToGrade - numResponsesGraded} more`) : "Next >"}
+            {active === 3
+              ? numResponsesGraded >= minNumToGrade
+                ? "I think I'm done"
+                : `Grade at least ${minNumToGrade - numResponsesGraded} more`
+              : "Next >"}
           </Button>
         </Flex>
       </div>
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
index 6ad4926f2..c86fbef85 100644
--- a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -43,7 +43,6 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
     if (!shownResponse) return null;
     const key = getRatingKeyForResponse(shownResponse?.uid, "grade");
     const g = storeState[key];
-    console.log(shownResponse?.uid);
     if (g) return g[0];
     else return null;
   }, [shownResponse, storeState]);
@@ -51,7 +50,6 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
     if (!shownResponse) return "";
     const key = getRatingKeyForResponse(shownResponse?.uid, "note");
     const a = storeState[key];
-    console.log(shownResponse?.uid);
     if (a) return a[0]?.toString();
     else return "";
   }, [shownResponse, storeState]);
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
index de0d9a19a..078437738 100644
--- a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -1,16 +1,13 @@
 import React, { useCallback, useEffect, useState } from "react";
 import { EvalCriteria } from "../backend/evalgen/typing";
-import { Dict, LLMResponse, RatingDict } from "../backend/typing";
+import { Dict, LLMResponse } from "../backend/typing";
 import {
   ActionIcon,
   Button,
   Center,
-  Divider,
   Flex,
   Grid,
   Group,
-  Popover,
-  Radio,
   rem,
   ScrollArea,
   Skeleton,
@@ -25,14 +22,16 @@ import GradingView from "./GradingView";
 import { useDisclosure } from "@mantine/hooks";
 import { v4 as uuid } from "uuid";
 import {
-  IconPencil,
   IconRobot,
   IconTerminal2,
   IconThumbDown,
   IconThumbUp,
   IconTrash,
 } from "@tabler/icons-react";
-import { generateLLMEvaluationCriteria, getPromptForGenEvalCriteriaFromDesc } from "../backend/evalgen/utils";
+import {
+  generateLLMEvaluationCriteria,
+  getPromptForGenEvalCriteriaFromDesc,
+} from "../backend/evalgen/utils";
 import useStore from "../store";
 import EvaluationFunctionExecutor from "../backend/evalgen/executor";
 
@@ -45,7 +44,6 @@ const ThumbUpDownButtons = ({
   onChangeGrade: (newGrade: boolean | undefined) => void;
   getGradeCount: (grade: boolean | undefined) => number;
 }) => {
-
   const true_count = getGradeCount(true);
   const false_count = getGradeCount(false);
 
@@ -82,7 +80,9 @@ const ThumbUpDownButtons = ({
             size="20pt"
             fill={grade === false ? "pink" : "white"}
           />
-          {false_count > 0 && <div className="gradeDownCount">{false_count}</div>}
+          {false_count > 0 && (
+            <div className="gradeDownCount">{false_count}</div>
+          )}
         </div>
       </Button>
     </>
@@ -230,13 +230,17 @@ interface GradingResponsesStepProps {
   onPrevious: () => void;
   executor: EvaluationFunctionExecutor | null;
   logs: { date: Date; message: string }[];
-  genAIModelNames: { strong: string; weak: string };
+  genAIModelNames: { large: string; small: string };
   numCallsMade: { strong: number; weak: number };
   responses: LLMResponse[];
   criteria: EvalCriteria[];
   setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
-  grades: Dict<Dict<boolean | undefined>>;  // per-criteria grades
-  setPerCriteriaGrade: (responseUID: string, criteriaUID: string, newGrade: boolean | undefined) => void;
+  grades: Dict<Dict<boolean | undefined>>; // per-criteria grades
+  setPerCriteriaGrade: (
+    responseUID: string,
+    criteriaUID: string,
+    newGrade: boolean | undefined,
+  ) => void;
   setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
 }
 
@@ -268,10 +272,7 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
   const getStateValue = (stateId: number) => {
     return Math.floor(Math.random() * 30 + 6);
   };
-  const getGradeCount = (
-    criteriaUID: string,
-    grade: boolean | undefined,
-  ) => {
+  const getGradeCount = (criteriaUID: string, grade: boolean | undefined) => {
     let count = 0;
     for (const respUid in grades) {
       count += grade === grades[respUid][criteriaUID] ? 1 : 0;
@@ -346,6 +347,7 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
 
     generateLLMEvaluationCriteria(
       "",
+      genAIModelNames.large,
       apiKeys,
       `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below. 
   
@@ -400,6 +402,7 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
     // Make async LLM call to expand criteria
     generateLLMEvaluationCriteria(
       "",
+      genAIModelNames.large,
       apiKeys,
       getPromptForGenEvalCriteriaFromDesc(desc), // prompt
       null, // system_msg
@@ -437,45 +440,45 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
           />
 
           <Flex direction="column">
-              <Flex justify="space-between" align="center">
-                <Text size="lg" weight={500} mb="sm">
-                  LLM Activity
-                </Text>
-                {/* GPT Call Tally */}
-                <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
-                  Executed {numCallsMade.strong} {genAIModelNames.strong} calls and {numCallsMade.weak}{" "}
-                  {genAIModelNames.weak} calls.
-                </Text>
-              </Flex>
-              <div
-                style={{
-                  backgroundColor: "#f0f0f0",
-                  color: "#333",
-                  fontFamily: "monospace",
-                  padding: "12px",
-                  width: "calc(100% - 30px)",
-                  height: "200px",
-                  overflowY: "auto",
-                  borderRadius: "8px",
-                  border: "1px solid #ddd",
-                  marginRight: "20px", // Space on the right
-                }}
-                ref={(el) => {
-                  if (el) {
-                    el.scrollTop = el.scrollHeight;
-                  }
-                }}
-              >
-                {logs.map((log, index) => (
-                  <div key={index}>
-                    <span style={{ color: "#4A90E2" }}>
-                      {log.date.toLocaleString()} -{" "}
-                    </span>
-                    <span>{log.message}</span>
-                  </div>
-                ))}
-              </div>
+            <Flex justify="space-between" align="center">
+              <Text size="lg" weight={500} mb="sm">
+                LLM Activity
+              </Text>
+              {/* LLM call tally */}
+              <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+                Executed {numCallsMade.strong} {genAIModelNames.large} calls and{" "}
+                {numCallsMade.weak} {genAIModelNames.small} calls.
+              </Text>
             </Flex>
+            <div
+              style={{
+                backgroundColor: "#f0f0f0",
+                color: "#333",
+                fontFamily: "monospace",
+                padding: "12px",
+                width: "calc(100% - 30px)",
+                height: "200px",
+                overflowY: "auto",
+                borderRadius: "8px",
+                border: "1px solid #ddd",
+                marginRight: "20px", // Space on the right
+              }}
+              ref={(el) => {
+                if (el) {
+                  el.scrollTop = el.scrollHeight;
+                }
+              }}
+            >
+              {logs.map((log, index) => (
+                <div key={index}>
+                  <span style={{ color: "#4A90E2" }}>
+                    {log.date.toLocaleString()} -{" "}
+                  </span>
+                  <span>{log.message}</span>
+                </div>
+              ))}
+            </div>
+          </Flex>
 
           {/* Progress bar */}
           {/* <Flex justify="left" align="center" gap="md">
@@ -488,62 +491,74 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                   </Flex> */}
         </Stack>
       </Grid.Col>
-      <Grid.Col span={4} bg="#eee" pt="16px" h="100%" style={{ boxShadow: "-10px 0px 20px #aaa" }}>
+      <Grid.Col
+        span={4}
+        bg="#eee"
+        pt="16px"
+        h="100%"
+        style={{ boxShadow: "-10px 0px 20px #aaa" }}
+      >
         <Center>
           <Title order={3} ml={8} mt="sm" mb="md">
             Per-criteria grading
           </Title>
         </Center>
 
-        <ScrollArea h="75%" offsetScrollbars style={{ border: "1px solid #ccc" }}>
-        <div
-          style={{
-            display: "flex",
-            flexDirection: "column",
-            marginBottom: "40px"
-          }}
+        <ScrollArea
+          h="75%"
+          offsetScrollbars
+          style={{ border: "1px solid #ccc" }}
         >
-          <div style={{ flex: 2, overflowY: "auto" }}>
-            {criteria.map((e) => (
-              <CriteriaCard
-                criterion={e}
-                key={e.uid}
-                onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
-                onDelete={() => handleDeleteCriteria(e.uid)}
-                grade={
-                  (shownResponse && grades[shownResponse.uid]) ? grades[shownResponse.uid][e.uid] : undefined
-                }
-                getGradeCount={(grade) => {
-                  return shownResponse
-                    ? getGradeCount(
-                        // shownResponse.uid,
-                        e.uid,
-                        grade,
-                      )
-                    : 0;
-                }}
-                onChangeGrade={(newGrade) => {
-                  if (shownResponse)
-                    setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
-                }}
-                initiallyOpen={true}
-                getStateValue={(stateId) => getStateValue(stateId)}
-              />
-            ))}
-            {isLoadingCriteria > 0 ? (
-              Array.from(
-                { length: isLoadingCriteria },
-                (v: unknown, idx: number) => (
-                  <Skeleton key={idx} h={80} mb={4} />
-                ),
-              )
-            ) : (
-              <></>
-            )}
-          </div>
+          <div
+            style={{
+              display: "flex",
+              flexDirection: "column",
+              marginBottom: "40px",
+            }}
+          >
+            <div style={{ flex: 2, overflowY: "auto" }}>
+              {criteria.map((e) => (
+                <CriteriaCard
+                  criterion={e}
+                  key={e.uid}
+                  onChange={(newCrit) => handleChangeCriteria(newCrit, e.uid)}
+                  onDelete={() => handleDeleteCriteria(e.uid)}
+                  grade={
+                    shownResponse && grades[shownResponse.uid]
+                      ? grades[shownResponse.uid][e.uid]
+                      : undefined
+                  }
+                  getGradeCount={(grade) => {
+                    return shownResponse
+                      ? getGradeCount(
+                          // shownResponse.uid,
+                          e.uid,
+                          grade,
+                        )
+                      : 0;
+                  }}
+                  onChangeGrade={(newGrade) => {
+                    if (shownResponse)
+                      setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade);
+                  }}
+                  initiallyOpen={true}
+                  getStateValue={(stateId) => getStateValue(stateId)}
+                />
+              ))}
+              {isLoadingCriteria > 0 ? (
+                Array.from(
+                  { length: isLoadingCriteria },
+                  (v: unknown, idx: number) => (
+                    <Skeleton key={idx} h={80} mb={4} />
+                  ),
+                )
+              ) : (
+                <></>
+              )}
+            </div>
 
-          <div className="criteriaButtons">
-            {/* <Popover withArrow>
+            <div className="criteriaButtons">
+              {/* <Popover withArrow>
               <Popover.Target>
               <Button
                 leftIcon={<IconPencil size={14} />}
@@ -571,7 +586,7 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                 
               </Popover.Dropdown>
             </Popover> */}
-              
+
               {/* <Button
                 leftIcon={<IconSparkles size={14} />}
                 variant="subtle"
@@ -584,11 +599,8 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                 Suggest Criteria
               </Button> */}
             </div>
-          
-
 
-
-          {/* <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+            {/* <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
             <Divider mt="lg" />
             <Title mb="0px" order={4}>
               Suggest New Criteria
@@ -635,24 +647,32 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                 </Button>
               </Group>
             </Radio.Group> */}
-          {/* </Stack> */}
-        </div>
-
-        <Textarea value={newCriteriaDesc} onChange={(e) => setNewCriteriaDesc(e.currentTarget.value)} label="Add new criteria:" placeholder="Describe the criteria to add." ml="md" mr="md"></Textarea>
-        <Group position="right" mr="md" mt="sm">
-        <Button
-          color="green"
-          variant="filled"
-          disabled={newCriteriaDesc?.trim().length === 0 || isLoadingCriteria > 0}
-          onClick={() => {
-            addCriteria(newCriteriaDesc);
-            setNewCriteriaDesc("");
-          }}
-        >
-          + Add criteria
-        </Button>
-        </Group>
+            {/* </Stack> */}
+          </div>
 
+          <Textarea
+            value={newCriteriaDesc}
+            onChange={(e) => setNewCriteriaDesc(e.currentTarget.value)}
+            label="Add new criteria:"
+            placeholder="Describe the criteria to add."
+            ml="md"
+            mr="md"
+          ></Textarea>
+          <Group position="right" mr="md" mt="sm">
+            <Button
+              color="green"
+              variant="filled"
+              disabled={
+                newCriteriaDesc?.trim().length === 0 || isLoadingCriteria > 0
+              }
+              onClick={() => {
+                addCriteria(newCriteriaDesc);
+                setNewCriteriaDesc("");
+              }}
+            >
+              + Add criteria
+            </Button>
+          </Group>
         </ScrollArea>
       </Grid.Col>
     </Grid>
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index e26927992..41f3767f4 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -110,7 +110,12 @@ const GradingView: React.FC<GradingViewProps> = ({
 
           {/* Go forward to the next response */}
           <Tooltip label="To next response" withArrow>
-            <Button variant="white" color="dark" bg="transparent" onClick={gotoNextResponse}>
+            <Button
+              variant="white"
+              color="dark"
+              bg="transparent"
+              onClick={gotoNextResponse}
+            >
               <IconChevronRight />
             </Button>
           </Tooltip>
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index 27d2fae36..000aadb4b 100644
--- a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -33,7 +33,10 @@ import {
 } from "@tabler/icons-react";
 import useStore from "../store";
 import { accuracyToColor, cmatrixTextAnnotations } from "../backend/utils";
-import { generateLLMEvaluationCriteria, getPromptForGenEvalCriteriaFromDesc } from "../backend/evalgen/utils";
+import {
+  generateLLMEvaluationCriteria,
+  getPromptForGenEvalCriteriaFromDesc,
+} from "../backend/evalgen/utils";
 import { v4 as uuid } from "uuid";
 import Plot from "react-plotly.js";
 
@@ -44,6 +47,7 @@ interface PickCriteriaStepProps {
   setCriteria: React.Dispatch<React.SetStateAction<EvalCriteria[]>>;
   genCriteriaFromContext: () => Promise<EvalCriteria[] | undefined>;
   setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
+  genAIModelNames: { large: string; small: string };
 }
 
 interface CriteriaCardProps {
@@ -375,6 +379,7 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
   criteria,
   setCriteria,
   genCriteriaFromContext,
+  genAIModelNames,
 }) => {
   // State for criteria cards
   const [addCriteriaValue, setAddCriteriaValue] = useState("");
@@ -401,6 +406,7 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
     // Make async LLM call to expand criteria
     generateLLMEvaluationCriteria(
       "",
+      genAIModelNames.large,
       apiKeys,
       getPromptForGenEvalCriteriaFromDesc(addCriteriaValue), // prompt
       null, // system_msg
@@ -423,7 +429,7 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
         setIsLoadingCriteria((num) => num - 1);
       });
   };
-  
+
   const updateCriteria = (
     newValue: string,
     critIdx: number,
diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 9ea550273..b35bd7d38 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -499,7 +499,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
         // });
       } else if (executor) {
         // Update criteria in executor
-        executor.addCriteria(criteria);
+        executor.updateCriteria(criteria);
       }
 
       updateCriteriaForDisplay();
diff --git a/chainforge/react-server/src/ResponseRatingToolbar.tsx b/chainforge/react-server/src/ResponseRatingToolbar.tsx
index 8e40e6e91..f07d77839 100644
--- a/chainforge/react-server/src/ResponseRatingToolbar.tsx
+++ b/chainforge/react-server/src/ResponseRatingToolbar.tsx
@@ -123,7 +123,7 @@ const ResponseRatingToolbar: React.FC<ResponseRatingToolbarProps> = ({
 
   // Override the text in the internal textarea whenever upstream annotation changes.
   useEffect(() => {
-    setNoteText(note !== undefined ? note.toString() : "");
+    setNoteText(note != null ? note.toString() : "");
   }, [note]);
 
   // The label for the pop-up comment box.
diff --git a/chainforge/react-server/src/backend/ai.ts b/chainforge/react-server/src/backend/ai.ts
index 389866441..cb89e4918 100644
--- a/chainforge/react-server/src/backend/ai.ts
+++ b/chainforge/react-server/src/backend/ai.ts
@@ -25,8 +25,8 @@ export type Row = string;
 const AIFeaturesLLMs = [
   {
     provider: "OpenAI",
-    small: { value: "gpt-4o", label: "OpenAI GPT4o" },
-    large: { value: "gpt-4", label: "OpenAI GPT4" },
+    small: { value: "gpt-4o-mini", label: "OpenAI GPT4o-mini" },
+    large: { value: "gpt-4o", label: "OpenAI GPT4o" },
   },
   {
     provider: "Bedrock",
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 07532cd5d..7f429ac3e 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -11,7 +11,13 @@ import {
   EvalFunctionSetReport,
   EvalCriteriaUID,
 } from "./typing";
-import { LLMResponse, ResponseUID, QueryProgress, Dict, LLMSpec } from "../typing";
+import {
+  LLMResponse,
+  ResponseUID,
+  QueryProgress,
+  Dict,
+  LLMSpec,
+} from "../typing";
 import { EventEmitter } from "events";
 
 /**
@@ -66,7 +72,8 @@ export default class EvaluationFunctionExecutor {
   private scores: Map<ResponseUID, number>;
   // Cache function results for each example
   private resultsCache: Map<EvalFunction, Map<ResponseUID, EvalFunctionResult>>;
-  private llms: { small: string | LLMSpec, large: string | LLMSpec };
+  private llms: { small: string | LLMSpec; large: string | LLMSpec };
+  private apiKeys: Dict;
   private grades: Map<ResponseUID, boolean>; // Grades for all examples
   private perCriteriaGrades: Dict<Dict<boolean | undefined>>; // Grades per criteria
   private annotations: Dict<string>; // Annotations for each response
@@ -78,7 +85,11 @@ export default class EvaluationFunctionExecutor {
   private backgroundTaskPromise: Promise<void> | null = null; // To keep track of the background task for generating and executing evaluation functions
   private criteriaQueue: EvalCriteria[] = []; // Queue for new criteria to be processed
   private processing = false; // To keep track of whether we are currently processing a criteria
-  private updateNumLLMCalls: (numStrongModelCalls: number, numWeakModelCalls: number) => void;
+  private updateNumLLMCalls: (
+    numStrongModelCalls: number,
+    numWeakModelCalls: number,
+  ) => void;
+
   private logFunction: (logMessage: string) => void;
 
   /**
@@ -90,11 +101,15 @@ export default class EvaluationFunctionExecutor {
    * @param existingGrades Optional. A dict in format {uid: grade}, containing existing grades.
    */
   constructor(
-    genAIModels: { small: string | LLMSpec, large: string | LLMSpec },
+    genAIModels: { small: string | LLMSpec; large: string | LLMSpec },
+    apiKeys: Dict,
     promptTemplate: string,
     examples: LLMResponse[],
     evalCriteria: EvalCriteria[] = [],
-    updateNumLLMCalls: (numStrongModelCalls: number, numWeakModelCalls: number) => void,
+    updateNumLLMCalls: (
+      numStrongModelCalls: number,
+      numWeakModelCalls: number,
+    ) => void,
     addLog: (log: string) => void,
     existingGrades?: Record<ResponseUID, boolean>,
     existingPerCriteriaGrades?: Dict<Dict<boolean | undefined>>,
@@ -111,6 +126,7 @@ export default class EvaluationFunctionExecutor {
     this.evalCriteria = evalCriteria;
     this.promptTemplate = promptTemplate;
     this.llms = genAIModels;
+    this.apiKeys = apiKeys;
 
     // Set scores and grades to default values of 0
     this.scores = new Map<ResponseUID, number>();
@@ -262,10 +278,12 @@ export default class EvaluationFunctionExecutor {
 
     await generateFunctionsForCriteria(
       criteria,
+      this.llms.large,
       this.promptTemplate,
       this.examples[Math.floor(Math.random() * this.examples.length)],
       emitter,
       badExample,
+      this.apiKeys,
     );
     // Update LLM call count by 1
     this.updateNumLLMCalls(1, 0);
@@ -382,9 +400,12 @@ export default class EvaluationFunctionExecutor {
       console.log(criteria);
       generateFunctionsForCriteria(
         criteria,
+        this.llms.large,
         this.promptTemplate,
         this.examples[Math.floor(Math.random() * this.examples.length)],
         emitter, // Pass the EventEmitter instance
+        undefined,
+        this.apiKeys,
       ).then(() => {
         emitter.emit("criteriaProcessed");
         // Update LLM call count by 1
@@ -435,20 +456,23 @@ export default class EvaluationFunctionExecutor {
   }
 
   /**
-   * Adds another evaluation criteria and triggers the generation and execution of evaluation functions for the new criteria.
+   * Updates the set of evaluation criteria and triggers the generation and execution of evaluation functions for any new criteria.
    * This method allows the client to add new evaluation criteria after the executor has been initialized.
    * The new criteria will be processed in parallel with the existing criteria.
    * The method returns immediately, allowing the client to continue with other tasks.
    *
-   * @param criteria The new evaluation criteria to be added.
+   * @param criteriaList The new state of the evaluation criteria list.
    */
-  public addCriteria(criteriaList: EvalCriteria[]): void {
+  public updateCriteria(criteriaList: EvalCriteria[]): void {
     // See if there are criteria to remove
-    this.evalCriteria = this.evalCriteria.filter((c) => (!criteriaList.includes(c)));
+    this.evalCriteria = this.evalCriteria.filter(
+      (c) => !criteriaList.includes(c),
+    );
 
     // See if there are new criteria to add
     for (const criteria of criteriaList) {
-      if (this.evalCriteria.includes(criteria)) {  // criteria already included
+      if (this.evalCriteria.includes(criteria)) {
+        // criteria already included
         continue;
       }
 
@@ -768,7 +792,11 @@ export default class EvaluationFunctionExecutor {
           evalFunction.evalCriteria.eval_method === "code"
             ? execPyFunc
             : executeLLMEval;
-        const result = await funcToExecute(evalFunction, this.llms.small, example);
+        const result = await funcToExecute(
+          evalFunction,
+          this.llms.small,
+          example,
+        );
 
         // Put result in cache
         if (!this.resultsCache.has(evalFunction)) {
@@ -1006,7 +1034,11 @@ export default class EvaluationFunctionExecutor {
           evalFunction.evalCriteria.eval_method === "code"
             ? execPyFunc
             : executeLLMEval;
-        const result = await funcToExecute(evalFunction, this.llms.small, example);
+        const result = await funcToExecute(
+          evalFunction,
+          this.llms.small,
+          example,
+        );
 
         // Put result in cache
         if (!this.resultsCache.has(evalFunction)) {
diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
index b9f23ce18..1f8634f32 100644
--- a/chainforge/react-server/src/backend/evalgen/oai_utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
@@ -1,339 +1,81 @@
 // import { env as process_env } from "process";
 import { EventEmitter } from "events";
 // import { AzureKeyCredential, OpenAIClient } from "@azure/openai";
-import { get_openai_api_key } from "../utils";
-type ContentType = "criteria" | "python_fn" | "llm_eval";
+import { llmResponseDataToString } from "../utils";
+import { simpleQueryLLM } from "../backend";
+import { Dict, LLMSpec } from "../typing";
+import { extractMdBlocks } from "./utils";
+type ContentType = "python_fn" | "llm_eval";
 
-export class OpenAIStreamer extends EventEmitter {
-  private buffer = "";
-  private isJsonContentStarted = false;
-  private isPythonContentStarted = false;
-  private pythonBlockBuffer = "";
-  // private client;
-  private openai_api_key;
+export class EvalGenAssertionEmitter extends EventEmitter {
+  private apiKeys: Dict | undefined;
 
-  constructor() {
+  constructor(apiKeys?: Dict) {
     super();
-
-    const OPENAI_API_KEY = get_openai_api_key();
-    this.openai_api_key = OPENAI_API_KEY;
-
-    // this.client = new OpenAIClient(
-    //   process?.env?.AZURE_OPENAI_ENDPOINT ?? AZURE_OPENAI_ENDPOINT ?? "",
-    //   new AzureKeyCredential(
-    //     process?.env?.AZURE_OPENAI_KEY ?? AZURE_OPENAI_KEY ?? "",
-    //   ),
-    // );
-
-    // this.client = new OpenAIApi(configuration);
-  }
-
-  private buildMessages(prompt: string): any[] {
-    return [
-      {
-        content:
-          "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.",
-        role: "system",
-      },
-      { role: "user", content: prompt },
-    ];
-  }
-
-  private resetBuffer(): void {
-    this.buffer = "";
-    this.isJsonContentStarted = false;
-    this.isPythonContentStarted = false;
-    this.pythonBlockBuffer = "";
+    this.apiKeys = apiKeys;
   }
 
   async generate(
     prompt: string,
-    model: string,
-    type: ContentType,
+    llm: string | LLMSpec,
+    contentType: ContentType,
   ): Promise<void> {
-    this.resetBuffer();
-    const messages = this.buildMessages(prompt);
-
-    // const events = await this.client.listChatCompletions(model, messages, {});
-
-    // for await (const event of events) {
-    //   for (const choice of event.choices) {
-    //     const delta = choice.delta?.content;
-    //     if (delta !== undefined) {
-    //       if (type === "criteria") {
-    //         this.processCriteriaDelta(delta);
-    //       } else if (type === "llm_eval") {
-    //         this.processStringDelta(delta);
-    //       } else if (type === "python_fn") {
-    //         this.processFunctionDelta(delta);
-    //       } else {
-    //         throw new Error("Invalid type");
-    //       }
-    //     }
-    //   }
-    // }
-
-    // Used restapi as here: https://stackoverflow.com/questions/76137987/openai-completion-stream-with-node-js-and-express-js
-
-    const streamRes = await fetch(
-      "https://api.openai.com/v1/chat/completions",
-      {
-        method: "POST",
-        headers: {
-          Authorization: `Bearer ${this.openai_api_key}`,
-          "Content-Type": "application/json",
-        },
-        body: JSON.stringify({
-          model,
-          messages,
-          stream: true,
-        }),
-      },
+    const emit_prompt = ((p: string) => this.emit("function", p)).bind(this);
+
+    const result = await simpleQueryLLM(
+      prompt, // prompt
+      typeof llm === "string" ? llm : [llm], // llm
+      // spec, // llm
+      "You are an expert Python programmer and helping me write assertions for my LLM pipeline. An LLM pipeline accepts an example and prompt template, fills the template's placeholders with the example, and generates a response.", // system_msg
+      this.apiKeys, // API keys (if any)
     );
 
-    const reader = streamRes.body?.getReader();
-    if (!reader) {
-      console.error("Error initializing reader for OpenAI requests.");
-      return;
-    }
-
-    let done = false;
-    let concenattedJsonStrn = "";
-
-    while (!done) {
-      const { value, done: readerDone } = await reader.read();
-      done = readerDone;
-      const buffer = Buffer.from(value as ArrayBuffer);
-      const textPayload = buffer.toString();
-      concenattedJsonStrn += textPayload;
-      if (
-        !concenattedJsonStrn.includes(`data: `) ||
-        !concenattedJsonStrn.includes(`\n\n`)
-      ) {
-        continue;
-      }
-      const payloads = concenattedJsonStrn.toString().split("\n\n");
-      concenattedJsonStrn = "";
-
-      for (const payload of payloads) {
-        if (payload.includes("[DONE]")) return;
-        if (payload.startsWith("data:")) {
-          try {
-            const data = JSON.parse(payload.replace("data: ", ""));
-            const delta: undefined | string = data.choices[0].delta?.content;
-            if (delta !== undefined) {
-              if (type === "criteria") {
-                this.processCriteriaDelta(delta);
-              } else if (type === "llm_eval") {
-                this.processStringDelta(delta);
-              } else if (type === "python_fn") {
-                this.processFunctionDelta(delta);
-              } else {
-                throw new Error("Invalid type");
-              }
-            }
-          } catch (error) {
-            console.log(`Error with JSON.parse and ${payload}.\n${error}`);
-            concenattedJsonStrn += payload;
-          }
-        }
-      }
-    }
-
-    this.emit("end"); // Signal that streaming is complete
-  }
-
-  private processCriteriaDelta(delta: string): void {
-    this.buffer += delta;
-    if (!this.isJsonContentStarted) {
-      const startIndex = this.buffer.indexOf("```json\n");
-      if (startIndex !== -1) {
-        this.isJsonContentStarted = true;
-        this.buffer = this.buffer.substring(startIndex + 8); // Skip the '```json \n' part
-      }
-      // Trim the buffer to avoid whitespace at beginning and end
-      this.buffer = this.buffer.trim();
-    }
-
-    if (this.isJsonContentStarted) {
-      this.tryEmitEvalCriteria();
-    }
-  }
-
-  private tryEmitEvalCriteria(): void {
-    let braceCount = 0;
-    let lastIndex = 0; // Track start of the next JSON object
-
-    // Detect and handle the start of an array
-    if (this.buffer.trim().startsWith("[")) {
-      this.buffer = this.buffer.trim().substring(1); // Remove the leading '['
-    }
-
-    // Remove leading commas if they exist right before a JSON object
-    this.buffer = this.buffer.replace(/^\s*,\s*/, "");
-
-    for (let i = 0; i < this.buffer.length; i++) {
-      const char = this.buffer[i];
-      if (char === "{") {
-        braceCount++;
-      } else if (char === "}") {
-        braceCount--;
-      }
-
-      // When a complete JSON object is detected
-      if (braceCount === 0 && char === "}") {
-        const jsonStr = this.buffer.substring(lastIndex, i + 1).trim();
-        lastIndex = i + 1; // Update for potential next object
-
-        // Remove any leading comma for the next object
-        if (this.buffer[lastIndex] === ",") {
-          lastIndex++; // Skip the comma for the next object
-        }
-
-        try {
-          const jsonObj = JSON.parse(jsonStr);
-          this.emit("evalCriteria", jsonObj);
-        } catch (error) {
-          console.error("Error parsing JSON:", error);
-        }
-      }
-    }
-
-    // Keep any incomplete JSON for the next delta
-    this.buffer = this.buffer.substring(lastIndex).trim();
-  }
-
-  private processStringDelta(delta: string): void {
-    this.buffer += delta;
-    if (!this.isJsonContentStarted) {
-      const startIndex = this.buffer.indexOf("```json\n");
-      if (startIndex !== -1) {
-        this.isJsonContentStarted = true;
-        this.buffer = this.buffer.substring(startIndex + 8); // Skip the '```json\n' part
-      }
-    }
-
-    if (this.isJsonContentStarted) {
-      this.tryEmitStrings();
-    }
-  }
-
-  private tryEmitStrings(): void {
-    let quoteCount = 0;
-    let lastIndex = 0; // Track the start of the next string
-
-    // Detect and handle the start of an array
-    if (this.buffer.startsWith("[")) {
-      this.buffer = this.buffer.substring(1); // Remove the leading '['
-    }
-
-    // Remove leading commas and whitespace that might be right before a JSON string
-    this.buffer = this.buffer.replace(/^\s*,\s*/, "");
-
-    for (let i = 0; i < this.buffer.length; i++) {
-      const char = this.buffer[i];
-
-      // Toggle quote count on encountering quotes, ignoring escaped quotes
-      if (char === '"' && (i === 0 || this.buffer[i - 1] !== "\\")) {
-        quoteCount++;
-      }
-
-      // When a complete string is detected (every second quote)
-      if (quoteCount === 2) {
-        const jsonString = this.buffer.substring(lastIndex, i + 1); // Include the closing quote
-        lastIndex = i + 1; // Update for the potential next string
-
-        // Remove any leading comma for the next string
-        if (this.buffer[lastIndex] === ",") {
-          lastIndex++; // Skip the comma for the next string
-        }
-
-        quoteCount = 0; // Reset for the next string
-
-        // Extract the string value from JSON
-        try {
-          const strValue = JSON.parse(jsonString);
-          this.emit("function", strValue);
-        } catch (error) {
-          console.error("Error parsing JSON string:", error);
-        }
-      }
-    }
-
-    // Keep any incomplete JSON string for the next delta
-    this.buffer = this.buffer.substring(lastIndex).trim();
-  }
-
-  private processFunctionDelta(delta: string): void {
-    this.buffer += delta;
-    if (!this.isPythonContentStarted) {
-      let startIndex = this.buffer.indexOf("```python");
-      if (startIndex === -1) startIndex = this.buffer.indexOf("```");
-      if (startIndex !== -1) {
-        this.isPythonContentStarted = true;
-        this.buffer = this.buffer.substring(startIndex);
-      }
-    } else {
-      const endIndex = this.buffer.indexOf("```", 8); // Look for end marker after the start
-      if (endIndex !== -1) {
-        // Extract Python code block
-        const pythonCode = this.buffer
-          .replace("```python", "")
-          .replaceAll("```", "")
-          .trim();
-        this.pythonBlockBuffer += pythonCode;
-        this.buffer = this.buffer.substring(endIndex + 3);
-        this.isPythonContentStarted = false;
-        // Now process the Python code block for functions
-        this.tryEmitFunctionCriteria();
-      }
-    }
-  }
-
-  private tryEmitFunctionCriteria(): void {
-    // Split the buffer into lines
-    const lines = this.pythonBlockBuffer.split("\n");
-    let collecting = false;
-    let functionBody: string[] = [];
-    let baseIndentation = 0;
-
-    for (const line of lines) {
-      if (!collecting) {
-        // Check if the line is a function definition
-        if (line.trim().startsWith("def ")) {
-          collecting = true;
-          functionBody = [line];
-          // Determine the base indentation level
-          baseIndentation = line.indexOf("def");
-        }
+    if (result.errors && Object.keys(result.errors).length > 0)
+      throw new Error(Object.values(result.errors as Dict)[0].toString());
+
+    // Get output (text from LLM response)
+    const output = llmResponseDataToString(result.responses[0].responses[0]);
+    console.log("Streamer: LLM said: ", output); // for debugging
+
+    // Attempt to extract output depending on content type
+    if (contentType === "llm_eval") {
+      // Expected output is a ```json block that is just a list of three strings representing the prompts, i.e. ["str1", "str2", "str3"]
+      // Attempt to extract JSON blocks (strings) from output
+      const json_blocks = extractMdBlocks(output, "json");
+      if (json_blocks === undefined || json_blocks.length === 0)
+        throw new Error(
+          "EvalGen: Could not parse LLM response into evaluation prompt: No JSON detected in output.",
+        );
+
+      // If we passed, this should be a list of strings:
+      const prompts = json_blocks.flatMap((b) => JSON.parse(b));
+      // Verify format:
+      if (prompts.every((p) => typeof p === "string")) {
+        // If these are all strings, we are good to go--
+        // Emit all the LLM eval prompt candidates in one burst
+        prompts.forEach(emit_prompt);
       } else {
-        // Check if the line returns to the base indentation level or lower, indicating the end of the function
-        const currentIndentation = line.search(/\S|$/); // Find first non-space character or end of line
-        if (currentIndentation <= baseIndentation) {
-          // Emit the collected function body
-          this.emit("function", functionBody.join("\n"));
-          functionBody = []; // Reset for the next function
-          collecting = false;
-
-          // If the current line is another function definition, start collecting again
-          if (line.trim().startsWith("def ")) {
-            collecting = true;
-            functionBody = [line];
-            baseIndentation = line.indexOf("def");
-          }
-        } else if (collecting) {
-          // Continue collecting the function body
-          functionBody.push(line);
-        }
+        console.error(
+          "Unexpected output type after JSON parsing: At least generated LLM eval prompt is not a string.",
+          prompts,
+        );
+        throw new Error("Unexpected output type after JSON parsing");
       }
+    } else if (contentType === "python_fn") {
+      // Expected output has ~3 Python codeblocks within ```python markers
+      // Attempt to extract code blocks from output
+      const code_blocks = extractMdBlocks(output, "python");
+      if (code_blocks === undefined || code_blocks.length === 0)
+        throw new Error(
+          "EvalGen: Could not parse LLM response into Python function: No code detected in output.",
+        );
+
+      // If we passed, this should be a list of Python code functions. We assume it is OK, and treat them separately:
+      code_blocks.forEach(emit_prompt);
+    } else {
+      throw new Error("Unknown content type: " + contentType);
     }
 
-    // Check if there's a function body collected at the end of the buffer without returning to the base indentation
-    if (collecting && functionBody.length > 0) {
-      this.emit("function", functionBody.join("\n"));
-    }
-
-    // Clear the buffer after processing
-    this.pythonBlockBuffer = "";
+    this.emit("end"); // Signal that streaming is complete
   }
 }
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 6ea69676f..fdc47d170 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -17,19 +17,22 @@ import {
   retryAsyncFunc,
 } from "../utils";
 import { v4 as uuid } from "uuid";
-import { OpenAIStreamer } from "./oai_utils";
+import { EvalGenAssertionEmitter } from "./oai_utils";
 import {
   buildContextPromptForVarsMetavars,
   buildGenEvalCodePrompt,
 } from "../../AiPopover";
 
 /**
- * Extracts substrings within "```json" and "```" ticks. Excludes the ticks from return.
+ * Extracts substrings within "```<blockName>" (e.g. "```json") and "```" ticks. Excludes the ticks from return.
  * @param mdText
  * @returns
  */
-function extractJSONBlocks(mdText: string): string[] | undefined {
-  const regex = /```json(.*?)```/gs;
+export function extractMdBlocks(
+  mdText: string,
+  blockName: string,
+): string[] | undefined {
+  const regex = new RegExp(`\`\`\`${blockName}(.*?)\`\`\``, "gs");
   const matches = mdText.match(regex);
   if (matches)
     return matches.map((s) => s.replace("```json", "").replace("```", ""));
@@ -84,7 +87,7 @@ export async function generateLLMEvaluationCriteria(
     // console.log("LLM said: ", output); // for debuggging
 
     // Attempt to extract JSON blocks (strings) from input
-    const json_blocks = extractJSONBlocks(output);
+    const json_blocks = extractMdBlocks(output, "json");
     if (json_blocks === undefined || json_blocks.length === 0)
       throw new Error(
         "EvalGen: Could not parse LLM response into evaluation critera: No JSON detected in output.",
@@ -138,7 +141,7 @@ export async function executeLLMEval(
     "Evaluate the text below according to this criteria: " +
     evalFunction.code +
     ' Only return "yes" or "no", nothing else.\n\n```\n' +
-    example.responses[0] +
+    llmResponseDataToString(example.responses[0]) +
     "\n```";
 
   // Query an LLM as an evaluator
@@ -286,10 +289,12 @@ export async function execPyFunc(
 
 export async function generateFunctionsForCriteria(
   criteria: EvalCriteria,
+  llm: string | LLMSpec,
   promptTemplate: string,
   example: LLMResponse,
   emitter: EventEmitter,
   badExample?: LLMResponse,
+  apiKeys?: Dict,
 ): Promise<void> {
   const functionGenPrompt = buildFunctionGenPrompt(
     criteria,
@@ -300,7 +305,7 @@ export async function generateFunctionsForCriteria(
   console.log("Function generation prompt:", functionGenPrompt);
 
   try {
-    const streamer = new OpenAIStreamer();
+    const streamer = new EvalGenAssertionEmitter(apiKeys);
 
     streamer.on("function", (functionDefinition: string) => {
       processAndEmitFunction(criteria, functionDefinition, emitter);
@@ -308,7 +313,7 @@ export async function generateFunctionsForCriteria(
 
     const modelType =
       criteria.eval_method === "expert" ? "llm_eval" : "python_fn";
-    await streamer.generate(functionGenPrompt, "gpt-4o", modelType);
+    await streamer.generate(functionGenPrompt, llm, modelType);
   } catch (error) {
     console.error("Error generating function for criteria:", error);
     throw new Error(
@@ -328,7 +333,7 @@ function buildFunctionGenPrompt(
     badExampleSection = `
     Here is an example response that DOES NOT meet the criteria:
     \`\`\`
-    ${badExample.responses[0]}
+    ${llmResponseDataToString(badExample.responses[0])}
     \`\`\`
     `;
   }
@@ -343,7 +348,7 @@ function buildFunctionGenPrompt(
     ${badExampleSection}
     Create 3 implementations of the criterion.
     ${buildGenEvalCodePrompt("python", buildContextPromptForVarsMetavars(getVarsAndMetavars([example])), criteria.criteria, true)}
-    Be creative in your implementations. Our goal is to explore diverse approaches to evaluate LLM responses effectively. Try to avoid using third-party libraries for code-based evaluation methods. Include the full implementation of each function. Each function should return only True or False.`;
+    Be creative in your implementations. Our goal is to explore diverse approaches to evaluate LLM responses effectively. Try to avoid using third-party libraries for code-based evaluation methods. Include the full implementation of each function in separate "\`\`\`python" blocks. Each function should return only True or False.`;
 
     return prompt;
   }

From bc453a973c7872254590b2c6f03238e17db9ee9a Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sat, 22 Mar 2025 09:55:50 -0400
Subject: [PATCH 21/35] wip

---
 chainforge/react-server/src/EvalGen/EvalGenWizard.tsx    | 4 +---
 chainforge/react-server/src/backend/evalgen/oai_utils.ts | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index 93633f45a..fc2684d12 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -86,9 +86,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
       updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
 
       // If the EvalGen executor is running, update the per-criteria grade for this sample:
-      executor?.setGradeForExample(
-        responseUID,
-        grades[responseUID]);
+      executor?.setGradeForExample(responseUID, grades[responseUID]);
 
       return { ...grades };
     });
diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
index 1f8634f32..840789119 100644
--- a/chainforge/react-server/src/backend/evalgen/oai_utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
@@ -20,7 +20,7 @@ export class EvalGenAssertionEmitter extends EventEmitter {
     llm: string | LLMSpec,
     contentType: ContentType,
   ): Promise<void> {
-    const emit_prompt = ((p: string) => this.emit("function", p)).bind(this);
+    const emit_prompt = (p: string) => this.emit("function", p);
 
     const result = await simpleQueryLLM(
       prompt, // prompt

From 2a0d6c42b04911ab49677dd549d915cee0aa5219 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Tue, 25 Mar 2025 23:37:04 -0400
Subject: [PATCH 22/35] Fixed bug in executor (whew)

---
 .../react-server/src/backend/backend.ts       |  80 ++++++---
 .../src/backend/evalgen/executor.ts           | 165 ++----------------
 .../react-server/src/backend/evalgen/utils.ts |  69 ++++++--
 chainforge/react-server/src/backend/utils.ts  |  49 ++++++
 4 files changed, 180 insertions(+), 183 deletions(-)

diff --git a/chainforge/react-server/src/backend/backend.ts b/chainforge/react-server/src/backend/backend.ts
index 988c837fa..00dd413e7 100644
--- a/chainforge/react-server/src/backend/backend.ts
+++ b/chainforge/react-server/src/backend/backend.ts
@@ -17,6 +17,7 @@ import {
   LLMResponseData,
   PromptVarType,
   StringOrHash,
+  ChatHistory,
 } from "./typing";
 import { LLM, LLMProvider, getEnumName, getProvider } from "./models";
 import {
@@ -31,6 +32,7 @@ import {
   llmResponseDataToString,
   extendArray,
   extendArrayDict,
+  stripWrappingQuotes,
 } from "./utils";
 import StorageCache, { StringLookup } from "./cache";
 import { PromptPipeline } from "./query";
@@ -1280,41 +1282,45 @@ export async function executepy(
  *
  * @param id a unique ID to refer to this information. Used when cache'ing evaluation results.
  * @param llm the LLM to query (as an LLM specification dict)
- * @param root_prompt the prompt template to use as the scoring function. Should include exactly one template var, {input}, where input responses will be put.
+ * @param root_prompt the prompt template to use as the scoring function. Should include exactly one template var, {__input}, where input responses will be put.
  * @param response_ids the cache'd response to run on, which must be a unique ID or list of unique IDs of cache'd data
  * @param api_keys optional. any api keys to set before running the LLM
  */
 export async function evalWithLLM(
   id: string,
-  llm: LLMSpec,
+  llm: string | LLMSpec,
   root_prompt: string,
-  response_ids: string | string[],
+  response_ids: string | string[] | LLMResponse[],
   api_keys?: Dict,
   progress_listener?: (progress: { [key: symbol]: any }) => void,
   cancel_id?: string | number,
+  system_msg?: string,
 ): Promise<{ responses?: LLMResponse[]; errors: string[] }> {
   // Check format of response_ids
   if (!Array.isArray(response_ids)) response_ids = [response_ids];
-  response_ids = response_ids as Array<string>;
+  if (response_ids.length === 0) return { responses: [], errors: [] };
+
+  const load_resps_from_cache = typeof response_ids[0] === "string";
+  const system_message: ChatHistoryInfo[] | undefined = system_msg
+    ? [
+        {
+          messages: [{ role: "system", content: system_msg }],
+          fill_history: {},
+        },
+      ]
+    : undefined;
 
   if (api_keys !== undefined) set_api_keys(api_keys);
 
   // Load all responses with the given ID:
   let all_evald_responses: LLMResponse[] = [];
   let all_errors: string[] = [];
-  for (const cache_id of response_ids) {
-    const fname = `${cache_id}.json`;
-    if (!StorageCache.has(fname))
-      throw new Error(`Did not find cache file for id ${cache_id}`);
-
-    // Load the raw responses from the cache + clone them all:
-    const resp_objs = (load_cache_responses(fname) as LLMResponse[]).map((r) =>
-      JSON.parse(JSON.stringify(r)),
-    ) as LLMResponse[];
-
-    if (resp_objs.length === 0) continue;
 
-    console.log(resp_objs);
+  const _runOverResponses = async (
+    resp_objs: LLMResponse[],
+    cache_id?: string,
+  ) => {
+    console.log("Running LLM evaluator over response objects:", resp_objs);
 
     // We need to keep track of the index of each response in the response object.
     // We can generate var dicts with metadata to store the indices:
@@ -1338,16 +1344,16 @@ export async function evalWithLLM(
 
     // Now run all inputs through the LLM grader!:
     const { responses, errors } = await queryLLM(
-      `eval-${id}-${cache_id}`,
+      `eval-${id}-${cache_id ?? "provided"}`,
       [llm],
       1,
       root_prompt,
       { __input: inputs },
-      undefined,
+      system_message, // if there's a sys_message, we pass it in chat history format
       undefined,
       undefined,
       progress_listener,
-      false,
+      !cache_id, // if there's no cache_id, we don't want to cache the responses
       cancel_id,
     );
 
@@ -1371,7 +1377,34 @@ export async function evalWithLLM(
       }
     });
 
-    all_evald_responses = all_evald_responses.concat(resp_objs);
+    return resp_objs;
+  };
+
+  // Run over cache'd response data
+  if (load_resps_from_cache) {
+    for (const cache_id of response_ids) {
+      const fname = `${cache_id}.json`;
+      if (!StorageCache.has(fname))
+        throw new Error(`Did not find cache file for id ${cache_id}`);
+
+      // Load the raw responses from the cache + clone them all:
+      const resp_objs = (load_cache_responses(fname) as LLMResponse[]).map(
+        (r) => JSON.parse(JSON.stringify(r)),
+      ) as LLMResponse[];
+      if (resp_objs.length === 0) continue;
+
+      const evald_resp_objs = await _runOverResponses(
+        resp_objs,
+        cache_id as string,
+      );
+
+      all_evald_responses = all_evald_responses.concat(evald_resp_objs);
+    }
+  } else {
+    // Run over provided response objects
+    const resp_objs = response_ids as LLMResponse[];
+    const evald_resp_objs = await _runOverResponses(resp_objs); // no cache
+    all_evald_responses = all_evald_responses.concat(evald_resp_objs);
   }
 
   // Do additional processing to check if all evaluations are
@@ -1381,7 +1414,9 @@ export async function evalWithLLM(
     if (!resp_obj.eval_res) continue;
     for (const score of resp_obj.eval_res.items) {
       if (score !== undefined)
-        all_eval_res.add(score.toString().trim().toLowerCase());
+        all_eval_res.add(
+          stripWrappingQuotes(score.toString().trim().toLowerCase()),
+        );
     }
   }
 
@@ -1421,7 +1456,8 @@ export async function evalWithLLM(
   }
 
   // Store the evaluated responses in a new cache json:
-  StorageCache.store(`${id}.json`, all_evald_responses);
+  if (load_resps_from_cache)
+    StorageCache.store(`${id}.json`, all_evald_responses);
 
   return { responses: all_evald_responses, errors: all_errors };
 }
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 7f429ac3e..98915d56c 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -194,8 +194,9 @@ export default class EvaluationFunctionExecutor {
    */
   public async waitForCompletion(): Promise<void> {
     if (this.backgroundTaskPromise) {
-      await this.backgroundTaskPromise;
+      const promise = this.backgroundTaskPromise;
       this.backgroundTaskPromise = null;
+      await promise;
     }
   }
 
@@ -214,6 +215,10 @@ export default class EvaluationFunctionExecutor {
     const functionExecutionPromises: Promise<any>[] = [];
 
     emitter.on("functionGenerated", (evalFunction) => {
+      this.logFunction(
+        `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
+      );
+
       const executionPromise = (async () => {
         this.evalFunctions.push(evalFunction);
         const executionPromises = this.examples.map(async (example) => {
@@ -233,6 +238,7 @@ export default class EvaluationFunctionExecutor {
               ? execPyFunc
               : executeLLMEval;
 
+          // Run the function on the example and if there's an error, increment skipped
           const result = await funcToExecute(
             evalFunction,
             this.llms.small,
@@ -285,6 +291,7 @@ export default class EvaluationFunctionExecutor {
       badExample,
       this.apiKeys,
     );
+
     // Update LLM call count by 1
     this.updateNumLLMCalls(1, 0);
 
@@ -306,152 +313,17 @@ export default class EvaluationFunctionExecutor {
   public async generateAndExecuteEvaluationFunctions(
     onProgress?: (progress: QueryProgress) => void,
   ): Promise<void> {
-    const emitter = new EventEmitter();
-    const numCriteriaToProcess = this.evalCriteria.length;
-
-    // Since we don't know how many implementations the LLM will suggest,
-    // we must estimate it here so we can use this information to stream
-    // "progress" updates back to the client:
-    let funcsExecuted = 0;
-    const estimatedFuncsToExecute =
-      numCriteriaToProcess +
-      this.evalCriteria.length * 5 * this.examples.length;
-
-    let criteriaProcessed = 0; // Track the number of criteria processed
-    let resolveAllFunctionsGenerated: any; // To be called when all functions are generated and executed
-    const functionExecutionPromises: Promise<any>[] = []; // Track execution promises for function executions
-
-    // This promise resolves when the 'allFunctionsGenerated' event is emitted
-    const allFunctionsGeneratedPromise = new Promise<void>((resolve) => {
-      resolveAllFunctionsGenerated = resolve;
-    });
-
-    // Listen for generated functions and execute them as they come in
-    emitter.on("functionGenerated", (evalFunction) => {
-      this.logFunction(
-        `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
-      );
-
-      // Capture the execution promise of each function
-      const executionPromise = (async () => {
-        // Add the eval function to the list of functions
-        this.evalFunctions.push(evalFunction);
-
-        const executionPromises = this.examples.map(async (example) => {
-          // Get random positive and negative examples for this criteria using the perCriteriaGrades
-          const criteriaId = evalFunction.evalCriteria.uid;
-          const randomPositiveExample = this.examples.find(
-            (example) =>
-              this.perCriteriaGrades[criteriaId]?.[example.uid] === true,
-          );
-          const randomNegativeExample = this.examples.find(
-            (example) =>
-              this.perCriteriaGrades[criteriaId]?.[example.uid] === false,
-          );
-
-          const funcToExecute =
-            evalFunction.evalCriteria.eval_method === "code"
-              ? execPyFunc
-              : executeLLMEval;
-
-          // Run the function on the example and if there's an error, increment skipped
-          const result = await funcToExecute(
-            evalFunction,
-            this.llms.small,
-            example,
-            randomPositiveExample,
-            randomNegativeExample,
-          );
-
-          // Update weak model call count by 1 if the eval method is expert
-          if (evalFunction.evalCriteria.eval_method === "expert") {
-            this.updateNumLLMCalls(0, 1);
-          }
-
-          funcsExecuted++;
-          if (onProgress) {
-            onProgress({
-              success: (100 * funcsExecuted) / estimatedFuncsToExecute,
-              error: 0,
-            });
-          }
-
-          // Put result in cache
-          if (!this.resultsCache.has(evalFunction)) {
-            this.resultsCache.set(evalFunction, new Map());
-          }
-          this.resultsCache.get(evalFunction)?.set(example.uid, result);
-
-          // Update the score if the result is false
-          if (result === EvalFunctionResult.FAIL) {
-            this.updateScore(example.uid, evalFunction);
-          }
-        });
-
-        await Promise.all(executionPromises);
-        // console.log(`Function ${evalFunction.name} executed on all examples.`);
-      })();
-
-      functionExecutionPromises.push(executionPromise);
-    });
-
-    // Generate functions for each criterion
-    this.evalCriteria.forEach((criteria) => {
-      console.log(criteria);
-      generateFunctionsForCriteria(
-        criteria,
-        this.llms.large,
-        this.promptTemplate,
-        this.examples[Math.floor(Math.random() * this.examples.length)],
-        emitter, // Pass the EventEmitter instance
-        undefined,
-        this.apiKeys,
-      ).then(() => {
-        emitter.emit("criteriaProcessed");
-        // Update LLM call count by 1
-        this.updateNumLLMCalls(1, 0);
-      });
-    });
-
-    // Listen for a custom 'criteriaProcessed' event to track when each criterion's functions have been generated
-    emitter.on("criteriaProcessed", () => {
-      criteriaProcessed++;
-      if (criteriaProcessed === this.evalCriteria.length) {
-        // Ensure all function executions have completed before emitting 'allFunctionsGenerated'
-        Promise.all(functionExecutionPromises).then(() => {
-          console.log(
-            "All evaluation functions have been generated and executed.",
-          );
-          this.logFunction(
-            "All initially-generated evaluation functions have been generated and executed.",
-          );
-          if (resolveAllFunctionsGenerated) {
-            resolveAllFunctionsGenerated(); // Resolve the promise when all functions have been generated and executed
-          }
-        });
-
-        if (onProgress)
-          onProgress({
-            success: 100,
-            error: 0,
-          });
+    // Enter a continuous monitoring loop for new criteria
+    while (this.backgroundTaskPromise !== null) {
+      // Check if there are any criteria in the queue to process
+      if (this.criteriaQueue.length > 0 && !this.processing) {
+        // Pop a criteria off the queue and process it
+        // TODO: use worker pool to parallelize this
+        await this.processNextCriteria();
       }
-    });
-
-    // Wait for the 'allFunctionsGenerated' event, which now waits for all executions
-    await allFunctionsGeneratedPromise;
-  }
 
-  public generateNewImplementationsForCriteria(
-    criteriaID: EvalCriteriaUID,
-  ): void {
-    const crit = this.evalCriteria.find((c) => c.uid === criteriaID);
-    if (!crit) {
-      throw new Error(`Criteria with ID ${criteriaID} not found.`);
-    }
-    this.criteriaQueue.push(crit);
-    if (!this.processing) {
-      this.processNextCriteria();
+      // Sleep for a short time before checking again (prevents CPU hogging)
+      await new Promise((resolve) => setTimeout(resolve, 500));
     }
   }
 
@@ -488,11 +360,12 @@ export default class EvaluationFunctionExecutor {
   }
 
   private async processNextCriteria() {
-    // TODO: use worker pool to parallelize this
     this.processing = true;
     while (this.criteriaQueue.length > 0) {
       const criteria = this.criteriaQueue.shift();
       if (criteria) {
+        // Log the processing of new criteria
+        this.logFunction(`Processing new criteria: ${criteria.shortname}`);
         await this.generateAndExecuteFunctionsForCriteria(criteria);
       }
     }
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index fdc47d170..7bcfb0718 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -10,9 +10,16 @@ import {
   validEvalCriteriaFormat,
 } from "./typing";
 import { Dict, LLMResponse, LLMSpec } from "../typing";
-import { executejs, executepy, simpleQueryLLM } from "../backend";
+import {
+  evalWithLLM,
+  executejs,
+  executepy,
+  queryLLM,
+  simpleQueryLLM,
+} from "../backend";
 import {
   getVarsAndMetavars,
+  hashtagTemplateVars,
   llmResponseDataToString,
   retryAsyncFunc,
 } from "../utils";
@@ -136,42 +143,74 @@ export async function executeLLMEval(
   positiveExample?: LLMResponse,
   negativeExample?: LLMResponse,
 ): Promise<EvalFunctionResult> {
+  // The LLM eval prompt might include template vars. We need to add
+  // a hashtag to indicate to ChainForge that it should use the
+  // fill_history in the provided `example` LLMResponse.
+  const candidateCriteriaPrompt = hashtagTemplateVars(evalFunction.code);
+
   // Construct call to an LLM to evaluate the example
   const evalPrompt =
     "Evaluate the text below according to this criteria: " +
-    evalFunction.code +
+    candidateCriteriaPrompt +
     ' Only return "yes" or "no", nothing else.\n\n```\n' +
-    llmResponseDataToString(example.responses[0]) +
+    "{__input}" +
     "\n```";
 
   // Query an LLM as an evaluator
-  let systemMessage = "You are an expert evaluator.";
+  let systemMessage;
   if (
     positiveExample &&
     positiveExample.responses.length > 0 &&
     negativeExample &&
     negativeExample.responses.length > 0
   ) {
-    systemMessage +=
-      " Please consider the following GOOD example: \n" +
+    systemMessage =
+      "You are an expert evaluator. Please consider the following GOOD example:\n" +
       llmResponseDataToString(positiveExample.responses[0]) +
-      "\nand BAD example: \n" +
+      "\n\nand BAD example:\n" +
       llmResponseDataToString(negativeExample.responses[0]) +
-      "\nwhen making your evaluation.";
+      "\n\nwhen making your evaluation.";
   }
 
-  const result = await simpleQueryLLM(
-    evalPrompt, // prompt
-    typeof llm === "string" ? llm : [llm], // llm
-    systemMessage, // system_msg
+  // We use ChainForge's infrastructure for running LLM evaluators
+  // to score responses based on the criteria.
+  const { responses, errors } = await evalWithLLM(
+    Date.now().toString(), // id to refer to this query
+    llm, // llm
+    evalPrompt,
+    [example], // we pass in a single example
+    undefined,
+    undefined,
+    undefined,
+    systemMessage,
   );
+
+  if (
+    !responses ||
+    responses.length === 0 ||
+    !responses[0].eval_res ||
+    responses[0].eval_res.items.length === 0
+  ) {
+    console.error(
+      "Error executing LLM eval candidate:",
+      errors,
+      evalFunction.code,
+    );
+    return EvalFunctionResult.SKIP;
+  }
+
   // Get the output
-  const output = llmResponseDataToString(result.responses[0].responses[0]);
+  const output = responses[0].eval_res?.items[0];
+  // This should be a boolean... but we need to parse it
+  const is_pass =
+    output === true || (typeof output === "string" && output.includes("yes"));
+  const is_fail =
+    output === false || (typeof output === "string" && output.includes("no"));
 
   // Parse the response to determine the boolean value to return
-  if (output.toLowerCase().includes("yes")) {
+  if (is_pass) {
     return EvalFunctionResult.PASS;
-  } else if (output.toLowerCase().includes("no")) {
+  } else if (is_fail) {
     return EvalFunctionResult.FAIL;
   } else {
     // throw new EvalExecutionError(
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index 823bda765..e512fdbb1 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -2488,3 +2488,52 @@ export const cmatrixTextAnnotations = (
   }
   return annotations as Partial<Annotations>[];
 };
+
+/**
+ * Adds a hashtag prefix to template variables in a string.
+ * Converts unescaped templates of the form {template} to {#template}.
+ * Ignores escaped braces like \{ and \}.
+ *
+ * @param input - The input string containing templates
+ * @returns The string with templates converted to hashtagged form
+ */
+export function hashtagTemplateVars(input: string): string {
+  let result = "";
+  let i = 0;
+
+  while (i < input.length) {
+    // Check for escaped braces
+    if (
+      input[i] === "\\" &&
+      i + 1 < input.length &&
+      (input[i + 1] === "{" || input[i + 1] === "}")
+    ) {
+      // Add the escape character and the brace
+      result += input[i] + input[i + 1];
+      i += 2;
+    }
+    // Check for opening brace of a template (that isn't already hashtagged)
+    else if (input[i] === "{" && i + 1 < input.length && input[i + 1] !== "#") {
+      // Add the opening brace and the hashtag
+      result += "{#";
+      i++;
+    }
+    // Regular character
+    else {
+      result += input[i];
+      i++;
+    }
+  }
+
+  return result;
+}
+
+export function stripWrappingQuotes(s: string): string {
+  if (s.startsWith('"') && s.endsWith('"')) {
+    return s.slice(1, -1);
+  }
+  if (s.startsWith("'") && s.endsWith("'")) {
+    return s.slice(1, -1);
+  }
+  return s;
+}

From 8910d78e7e905edcc86872e5bb33c104a66e7f1b Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Thu, 27 Mar 2025 22:44:24 -0400
Subject: [PATCH 23/35] wip

---
 chainforge/react-server/src/backend/evalgen/executor.ts | 1 +
 1 file changed, 1 insertion(+)

diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 98915d56c..6846a6c03 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -716,6 +716,7 @@ export default class EvaluationFunctionExecutor {
 
         // Calculate alignment for this function based on the graded examples
         for (const example of gradedExamples) {
+          // TODO: Change this to use perCriteriaGrades !! 
           const result = gradedResultMap.get(example.uid)?.get(evalFunction);
           const grade = this.grades.get(example.uid)
             ? EvalFunctionResult.PASS

From 1d207f13cc5c10476922bdf0401c1aa2deb4ddfc Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 30 Mar 2025 16:57:14 -0400
Subject: [PATCH 24/35] Began refactoring for executor to use
 perCriteriaGrades. Changed 'alignment' to three options: F1, MCC, and Cohen's
 kappa.

---
 .../src/EvalGen/EvalGenWizard.tsx             |  17 +-
 .../src/EvalGen/PickCriteriaStep.tsx          |   5 +-
 .../src/EvalGen/ReportCardStep.tsx            | 117 ++++++++--
 .../src/backend/evalgen/executor.ts           | 221 +++++++++++-------
 .../src/backend/evalgen/typing.ts             |   6 +-
 .../react-server/src/backend/evalgen/utils.ts |  77 ++++++
 6 files changed, 337 insertions(+), 106 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index fc2684d12..07d205b72 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -1,5 +1,9 @@
 import React, { useCallback, useEffect, useMemo, useState } from "react";
-import { EvalCriteria, EvalGenReport } from "../backend/evalgen/typing";
+import {
+  EvalCriteria,
+  EvalFunctionSetReport,
+  EvalGenReport,
+} from "../backend/evalgen/typing";
 import { Dict, LLMResponse, RatingDict } from "../backend/typing";
 import useStore from "../store";
 import { escapeBraces } from "../backend/template";
@@ -112,6 +116,8 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
   const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
     null,
   );
+  const [evalGenReport, setEvalGenReport] =
+    useState<EvalFunctionSetReport | null>(null);
 
   // Logs and state from the EvalGen backend
   const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
@@ -138,12 +144,16 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     await executor?.waitForCompletion();
 
     // Filtering eval funcs by grades and present results
-    const filteredFunctions = await executor?.filterEvaluationFunctions(0.25);
+    const filteredFunctions =
+      (await executor?.filterEvaluationFunctions(0.25)) ?? null;
     console.log("Filtered Functions: ", filteredFunctions);
 
     // Return selected implementations to caller
     // TODO
     console.warn(filteredFunctions);
+
+    setActive(4); // Move to the report card step
+    setEvalGenReport(filteredFunctions);
   }, [executor]);
 
   // Update executor whenever resps, grades, or criteria change
@@ -323,9 +333,10 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
       {active === 4 && (
         <ReportCardStep
           onPrevious={handlePrevious}
-          onComplete={handleComplete}
+          onFinish={handleComplete}
           criteria={criteria}
           setOnNextCallback={setOnNextCallback}
+          report={evalGenReport}
         />
       )}
 
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index 000aadb4b..e6c775a9a 100644
--- a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -50,7 +50,7 @@ interface PickCriteriaStepProps {
   genAIModelNames: { large: string; small: string };
 }
 
-interface CriteriaCardProps {
+export interface CriteriaCardProps {
   title: string;
   description: string;
   evalMethod: string;
@@ -64,7 +64,7 @@ interface CriteriaCardProps {
   otherFuncs?: EvalFunctionReport[];
 }
 
-const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
+export const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
   title,
   description,
   evalMethod,
@@ -118,6 +118,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
       />
     );
   }, [evalFuncReport]);
+
   const reportAccuracyRing = useMemo(() => {
     if (!evalFuncReport) return undefined;
     return {
diff --git a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
index 43dbf9355..9e1deea20 100644
--- a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
+++ b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
@@ -1,39 +1,116 @@
-import React from "react";
-import { Button, Group, Stack, Text, Title } from "@mantine/core";
-import { EvalCriteria } from "../backend/evalgen/typing";
+import React, { useMemo } from "react";
+import {
+  Button,
+  Card,
+  Flex,
+  Group,
+  ScrollArea,
+  SimpleGrid,
+  Stack,
+  Text,
+} from "@mantine/core";
+import { EvalCriteria, EvalFunctionSetReport } from "../backend/evalgen/typing";
+import { CriteriaCard } from "./PickCriteriaStep";
 
 interface ReportCardStepProps {
-  onPrevious: () => void;
-  onComplete: () => void;
   criteria: EvalCriteria[];
+  report: EvalFunctionSetReport | null;
+  onFinish: (reports: EvalFunctionSetReport) => void;
+  onPrevious: () => void;
   setOnNextCallback: React.Dispatch<React.SetStateAction<() => unknown>>;
 }
 
 const ReportCardStep: React.FC<ReportCardStepProps> = ({
+  report,
+  onFinish,
   onPrevious,
-  onComplete,
 }) => {
-  // TODO: Calculate alignment scores based on criteria and grading data
-  const alignmentScores = {};
+  const cards = useMemo(() => {
+    if (!report) return null;
+    const cards = [];
+
+    // Iterate through selected eval functions and create cards
+    for (const selectedFunc of report.selectedEvalFunctions) {
+      const c = selectedFunc.evalCriteria;
+      // Find corresponding report in allEvalFunctionReports map from criteria to list
+      const evalFuncReports = report.allEvalFunctionReports.get(c);
+      const evalFuncReport = evalFuncReports?.find(
+        (rep) => rep.evalFunction === selectedFunc,
+      );
+      // Get the functions that were not selected for this criteria
+      const otherFuncs = evalFuncReports?.filter(
+        (rep) => rep.evalFunction !== selectedFunc,
+      );
+
+      cards.push(
+        <CriteriaCard
+          reportMode
+          title={c.shortname}
+          description={c.criteria}
+          evalMethod={c.eval_method}
+          key={c.uid}
+          evalFuncReport={evalFuncReport}
+          otherFuncs={otherFuncs}
+        />,
+      );
+    }
+    return cards;
+  }, [report]);
 
   return (
     <Stack spacing="lg">
-      <Title order={3}>Evaluation Results</Title>
-      <Text>
-        Here&apos;s how well each evaluation criteria aligns with your grades:
+      <Text align="center" size="lg" pl="sm" mb="lg">
+        Chosen Functions and Alignment
       </Text>
 
-      {/* TODO: Display alignment scores */}
-      <Text>TODO: Show alignment scores for each criteria</Text>
+      {/* Show coverage and false failure rate numbers */}
+      <Flex justify="center" gap="md" mb="lg">
+        <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
+          <Card
+            shadow="sm"
+            padding="md"
+            radius="md"
+            style={{ backgroundColor: "#f0f0f0" }}
+          >
+            <Text weight={500} size="md">
+              Coverage of Bad Responses
+            </Text>
+            <Text color="blue" weight={700} size="md">
+              {report?.failureCoverage.toFixed(2)}%
+            </Text>
+          </Card>
+          <Card
+            shadow="sm"
+            padding="md"
+            radius="md"
+            style={{ backgroundColor: "#f0f0f0" }}
+          >
+            <Text weight={500} size="md">
+              False Failure Rate
+            </Text>
+            <Text color="red" weight={700} size="md">
+              {report?.falseFailureRate.toFixed(2)}%
+            </Text>
+          </Card>
+        </Group>
+      </Flex>
 
-      <Group position="apart" mt="xl">
-        <Button variant="default" onClick={onPrevious}>
-          Back
-        </Button>
-        <Button onClick={onComplete} color="green">
-          Done
+      <ScrollArea mih={300} h={500} mah={500}>
+        <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+          {cards}
+        </SimpleGrid>
+      </ScrollArea>
+
+      <Flex justify="center" gap={12} mt="xs">
+        <Button
+          onClick={() => {
+            if (!report) return;
+            onFinish(report);
+          }}
+        >
+          Finish with selected evaluators
         </Button>
-      </Group>
+      </Flex>
     </Stack>
   );
 };
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 6846a6c03..68c699272 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -1,4 +1,7 @@
 import {
+  calculateCohensKappa,
+  calculateF1Score,
+  calculateMCC,
   execPyFunc,
   executeLLMEval,
   generateFunctionsForCriteria,
@@ -629,6 +632,114 @@ export default class EvaluationFunctionExecutor {
     return this.getExampleForId(ungraded[pickIndex]);
   }
 
+  /**
+   * Given an eval function and the results of that function against the examples (LLM responses),
+   * computes the alignment statistics between the eval function and the user grades.
+   * @param evalFunc The eval function to score; its cached results are compared against the user grades stored for its criteria.
+   * @returns A Report, assuming the function has been executed over some examples and the user has provided grades for those examples. If there are no cached results or no grades for the criteria, returns undefined.
+   */
+  public computeAlignmentStats(
+    evalFunc: EvalFunction,
+  ): EvalFunctionReport | undefined {
+    // Get the eval function results from the cache
+    const results = this.resultsCache.get(evalFunc);
+    if (results === undefined) {
+      console.warn(
+        "No cache results found for this eval function. First ensure that the function has been executed over some examples.",
+      );
+      return undefined;
+    }
+
+    // Get a reference to the perCriteria grades for this eval function
+    const criteriaId = evalFunc.evalCriteria.uid;
+    if (!(criteriaId in this.perCriteriaGrades)) {
+      console.warn(
+        "No user grades found for this eval criteria. You must first grade some examples against this criteria (thumbs up/down) before we can compute alignment.",
+      );
+      return undefined;
+    }
+    // The perCriteriaGrades is a map of ResponseUID to boolean (user grade true/false)
+    // or undefined (no user grade for that example).
+    const userGradedExamples = this.perCriteriaGrades[criteriaId];
+
+    // Now `results` is a Map<ResponseUID, EvalFunctionResult>.
+    // We can compute the alignment stats across all graded examples.
+    // First, create a report for this function with all counters at zero
+    const report: EvalFunctionReport = {
+      evalFunction: evalFunc,
+      true_pass: 0,
+      true_fail: 0,
+      false_pass: 0,
+      false_fail: 0,
+      skipped: 0,
+    };
+
+    // Tally the confusion-matrix counters for this function based on the graded examples
+    Object.entries(userGradedExamples).forEach(([exampleId, grade]) => {
+      if (grade === undefined) return; // Skip if user provides no grade for this example
+      const result = results.get(exampleId);
+      const userGrade = grade
+        ? EvalFunctionResult.PASS
+        : EvalFunctionResult.FAIL;
+
+      if (result !== undefined) {
+        // The eval result agrees with the user grade: a true pass or a true fail
+        if (result === userGrade) {
+          if (result === EvalFunctionResult.PASS) {
+            report.true_pass++;
+          } else if (result === EvalFunctionResult.FAIL) {
+            report.true_fail++;
+          }
+        } else {
+          if (result === EvalFunctionResult.PASS) {
+            report.false_pass++;
+          } else if (result === EvalFunctionResult.FAIL) {
+            report.false_fail++;
+          } else {
+            report.skipped++;
+          }
+        }
+      }
+    });
+
+    // Calculate alignment in different ways ("positive" here means PASS)
+    // NOTE: A score is left undefined when it cannot be computed (MCC and kappa do so when their denominator is 0).
+    report.f1 = calculateF1Score(
+      report.true_pass,
+      report.false_pass,
+      report.false_fail,
+    );
+    report.mcc = calculateMCC(
+      report.true_pass,
+      report.true_fail,
+      report.false_pass,
+      report.false_fail,
+    );
+    report.cohens_kappa = calculateCohensKappa(
+      report.true_pass,
+      report.true_fail,
+      report.false_pass,
+      report.false_fail,
+    );
+
+    // Calculate failure coverage: the fraction of user-failed examples that the function also failed
+    const failureCoverage =
+      report.true_fail + report.false_pass > 0
+        ? report.true_fail / (report.true_fail + report.false_pass)
+        : 0.0; // 0.0 if there are no failures to detect
+
+    // Calculate false failure rate: the fraction of user-passed examples that the function failed
+    const falseFailureRate =
+      report.true_pass + report.false_fail > 0
+        ? report.false_fail / (report.true_pass + report.false_fail)
+        : 0.0; // Default to 0.0 if there are no examples that could trigger false failures
+
+    report.failureCoverage = failureCoverage;
+    report.falseFailureRate = falseFailureRate;
+
+    return report;
+  }
+
   /**
    * Filters out evaluation functions that are incorrect based on the grades provided by the developer.
    *
@@ -682,12 +793,6 @@ export default class EvaluationFunctionExecutor {
       gradedResultMap.set(example.uid, row);
     }
 
-    const numFailGrades = gradedExamples.filter(
-      (example) => !this.grades.get(example.uid),
-    ).length;
-    const numPassGrades = gradedExamples.filter((example) =>
-      this.grades.get(example.uid),
-    ).length;
     const bestEvalFunctions: EvalFunction[] = [];
     const evalFunctionReport: Map<EvalCriteria, EvalFunctionReport[]> =
       new Map();
@@ -695,7 +800,7 @@ export default class EvaluationFunctionExecutor {
     // Iterate through each criteria
     // For each criteria, select the function with the highest alignment rate
     for (const criteria of this.evalCriteria) {
-      let scoredFunctions = [];
+      const scoredFunctions = [];
 
       for (const evalFunction of this.evalFunctions) {
         // Skip functions that don't match the criteria
@@ -704,60 +809,8 @@ export default class EvaluationFunctionExecutor {
         }
 
         // Create a report for this function
-        const report: EvalFunctionReport = {
-          evalFunction,
-          true_pass: 0,
-          true_fail: 0,
-          false_pass: 0,
-          false_fail: 0,
-          alignment: 0,
-          skipped: 0,
-        };
-
-        // Calculate alignment for this function based on the graded examples
-        for (const example of gradedExamples) {
-          // TODO: Change this to use perCriteriaGrades !! 
-          const result = gradedResultMap.get(example.uid)?.get(evalFunction);
-          const grade = this.grades.get(example.uid)
-            ? EvalFunctionResult.PASS
-            : EvalFunctionResult.FAIL;
-
-          if (result !== undefined) {
-            // Handle true positives and true negatives
-            if (result === grade) {
-              if (result === EvalFunctionResult.PASS) {
-                report.true_pass++;
-              } else if (result === EvalFunctionResult.FAIL) {
-                report.true_fail++;
-              }
-            } else {
-              if (result === EvalFunctionResult.PASS) {
-                report.false_pass++;
-              } else if (result === EvalFunctionResult.FAIL) {
-                report.false_fail++;
-              } else {
-                report.skipped++;
-              }
-            }
-          }
-        }
-
-        // Calculate coverage
-        const failureCoverage =
-          numFailGrades > 0
-            ? report.true_fail / (report.true_fail + report.false_pass)
-            : 1.0;
-
-        // Calculate false failure rate
-        const falseFailureRate =
-          report.false_fail / (report.true_pass + report.false_fail);
-
-        // The alignment is the F1 score of failure coverage and 1 - false failure rate
-        report.alignment =
-          numFailGrades > 0 || numPassGrades > 0
-            ? (2 * failureCoverage * (1 - falseFailureRate)) /
-              (failureCoverage + (1 - falseFailureRate))
-            : undefined;
+        const report: EvalFunctionReport | undefined =
+          this.computeAlignmentStats(evalFunction);
 
         // Save the report for this function
         if (!evalFunctionReport.has(criteria)) {
@@ -768,33 +821,41 @@ export default class EvaluationFunctionExecutor {
 
         scoredFunctions.push({
           evalFunction,
-          failureCoverage,
-          falseFailureRate:
-            report.false_fail / (report.true_pass + report.false_fail),
+          report,
         });
       }
 
-      // See if we can filter out functions with ffr > threshold
-      const numFunctionsBelowThreshold = scoredFunctions.filter(
-        (func) => func.falseFailureRate <= falseFailureRateThreshold,
-      ).length;
-      if (numFunctionsBelowThreshold > 0) {
-        // Filter out functions with ffr > threshold
-        scoredFunctions = scoredFunctions.filter(
-          (func) => func.falseFailureRate <= falseFailureRateThreshold,
-        );
-      }
-
-      // Save the best function for this criteria
-      // Maximize failure coverage and minimize false failure rate
+      // Sort the functions by "alignment"
+      // Here, we are using MCC as the alignment metric, where higher is better.
       scoredFunctions.sort((a, b) => {
-        if (a.failureCoverage === b.failureCoverage) {
-          return a.falseFailureRate - b.falseFailureRate;
+        const a_mcc = a.report?.mcc ?? -1; // If undefined, set to -1, which is lowest possible.
+        const b_mcc = b.report?.mcc ?? -1;
+        if (a_mcc === b_mcc) {
+          // If MCC is the same or not present, sort by false failure rate
+          return (
+            (a.report?.falseFailureRate ?? 0) -
+            (b.report?.falseFailureRate ?? 0)
+          );
         }
-        return b.failureCoverage - a.failureCoverage;
+        return b_mcc - a_mcc; // Sort by MCC descending
       });
 
+      // // See if we can filter out functions with ffr > threshold
+      // const funcsBelowThreshold = scoredFunctions.filter(
+      //   (func) => func.report?.falseFailureRate !== undefined && func.report?.falseFailureRate <= falseFailureRateThreshold,
+      // );
+
+      // // Save the best function for this criteria
+      // // Maximize failure coverage and minimize false failure rate
+      // funcsBelowThreshold.sort((a, b) => {
+      //   if (a.report?.failureCoverage === b.report?.failureCoverage) {
+      //     return a.report?.falseFailureRate - b.report?.falseFailureRate;
+      //   }
+      //   return b.failureCoverage - a.failureCoverage;
+      // });
+
       if (scoredFunctions.length > 0) {
+        // The top result is the 'best' / most aligned function
         bestEvalFunctions.push(scoredFunctions[0].evalFunction);
       }
     }
diff --git a/chainforge/react-server/src/backend/evalgen/typing.ts b/chainforge/react-server/src/backend/evalgen/typing.ts
index e9e6cd24d..ef15d1551 100644
--- a/chainforge/react-server/src/backend/evalgen/typing.ts
+++ b/chainforge/react-server/src/backend/evalgen/typing.ts
@@ -45,7 +45,11 @@ export interface EvalFunctionReport {
   false_pass: number;
   false_fail: number;
   skipped: number;
-  alignment?: number;
+  mcc?: number; // Matthews correlation coefficient, which is a measure of the quality of binary classifications
+  f1?: number; // F1 score, which is the harmonic mean of precision and recall
+  cohens_kappa?: number; // Cohen's kappa, which is a measure of inter-rater agreement
+  failureCoverage?: number; // The percentage of failures that were covered by the eval function
+  falseFailureRate?: number; // The percentage of false failures
 }
 
 export interface EvalFunctionSetReport {
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 7bcfb0718..054a50b2c 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -421,3 +421,80 @@ function processAndEmitFunction(
 
   emitter.emit("functionGenerated", evalFunction);
 }
+
+/**
+ * Calculates the F1 score based on true positives, false positives, and false negatives.
+ * The F1 score is the harmonic mean of precision and recall.
+ * Precision = TP / (TP + FP)
+ * Recall = TP / (TP + FN)
+ * F1 = 2 * (Precision * Recall) / (Precision + Recall)
+ * @param true_positive The number of true positive predictions
+ * @param false_positive The number of false positive predictions
+ * @param false_negative The number of false negative predictions
+ * @returns The F1 score, or undefined (never NaN) if any denominator above is zero
+ */
+export function calculateF1Score(
+  true_positive: number,
+  false_positive: number,
+  false_negative: number,
+): number | undefined {
+  const precision = true_positive / (true_positive + false_positive);
+  const recall = true_positive / (true_positive + false_negative);
+  if (!(precision + recall > 0)) return undefined; // Sum is 0, or NaN from 0/0
+  return (2 * precision * recall) / (precision + recall);
+}
+
+/**
+ * Computes the Matthews correlation coefficient (MCC) of a binary confusion matrix:
+ * ```
+ *  MCC = (TP * TN - FP * FN) / sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
+ * ```
+ * @param true_positive Count of true positive predictions
+ * @param true_negative Count of true negative predictions
+ * @param false_positive Count of false positive predictions
+ * @param false_negative Count of false negative predictions
+ * @returns The MCC, or undefined when the square-root term is zero
+ */
+export function calculateMCC(
+  true_positive: number,
+  true_negative: number,
+  false_positive: number,
+  false_negative: number,
+): number | undefined {
+  // The four sums that appear under the square root, in formula order
+  const predPos = true_positive + false_positive;
+  const actualPos = true_positive + false_negative;
+  const actualNeg = true_negative + false_positive;
+  const predNeg = true_negative + false_negative;
+  const covariance =
+    true_positive * true_negative - false_positive * false_negative;
+  const norm = Math.sqrt(predPos * actualPos * actualNeg * predNeg);
+  if (norm === 0) return undefined; // Avoid division by zero
+  return covariance / norm;
+}
+
+/**
+ * Computes Cohen's Kappa for a 2x2 confusion matrix. By definition,
+ * Kappa = (Po - Pe) / (1 - Pe), with Po the observed agreement and Pe the
+ * expected agreement; the code evaluates the ratio directly from the counts.
+ * @param TP Count of true positive predictions
+ * @param TN Count of true negative predictions
+ * @param FP Count of false positive predictions
+ * @param FN Count of false negative predictions
+ * @returns Cohen's Kappa, or undefined if the denominator is zero
+ */
+export function calculateCohensKappa(
+  TP: number,
+  TN: number,
+  FP: number,
+  FN: number,
+): number | undefined {
+  const leftTerm = (TP + FP) * (FP + TN);
+  const rightTerm = (TP + FN) * (FN + TN);
+  const numerator = 2 * (TP * TN - FP * FN);
+  const denominator = leftTerm + rightTerm;
+  if (denominator === 0) {
+    return undefined; // Nothing to divide by
+  }
+  return numerator / denominator;
+}

From 16fbaa6def356a55e579ff4adeebe495337b114d Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Sun, 11 May 2025 20:55:05 -0400
Subject: [PATCH 25/35] cleanup

---
 chainforge/react-server/src/ItemsNode.tsx     | 17 ++++++-----------
 chainforge/react-server/src/ResponseBoxes.tsx |  3 +--
 chainforge/react-server/src/backend/utils.ts  | 17 +++++++++++++++--
 3 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/chainforge/react-server/src/ItemsNode.tsx b/chainforge/react-server/src/ItemsNode.tsx
index 7d007e0a3..d1aad2c0c 100644
--- a/chainforge/react-server/src/ItemsNode.tsx
+++ b/chainforge/react-server/src/ItemsNode.tsx
@@ -12,7 +12,12 @@ import NodeLabel from "./NodeLabelComponent";
 import { IconForms, IconTransform } from "@tabler/icons-react";
 import { Handle, Node, Position } from "reactflow";
 import BaseNode from "./BaseNode";
-import { DebounceRef, genDebounceFunc, processCSV } from "./backend/utils";
+import {
+  DebounceRef,
+  genDebounceFunc,
+  processCSV,
+  stripWrappingQuotes,
+} from "./backend/utils";
 import { AIGenReplaceItemsPopover } from "./AiPopover";
 import { cleanEscapedBraces, escapeBraces } from "./backend/template";
 import { TextFieldsNodeProps } from "./TextFieldsNode";
@@ -22,16 +27,6 @@ const wrapInQuotesIfContainsComma = (str: string) =>
   str.includes(",") ? `"${str}"` : str;
 export const makeSafeForCSLFormat = (str: string) =>
   wrapInQuotesIfContainsComma(replaceDoubleQuotesWithSingle(str));
-const stripWrappingQuotes = (str: string) => {
-  if (
-    typeof str === "string" &&
-    str.length >= 2 &&
-    str.charAt(0) === '"' &&
-    str.charAt(str.length - 1) === '"'
-  )
-    return str.substring(1, str.length - 1);
-  else return str;
-};
 export const prepareItemsNodeData = (text: string) => ({
   text,
   fields: processCSV(text).map(stripWrappingQuotes).map(escapeBraces),
diff --git a/chainforge/react-server/src/ResponseBoxes.tsx b/chainforge/react-server/src/ResponseBoxes.tsx
index 786e90e89..74962da39 100644
--- a/chainforge/react-server/src/ResponseBoxes.tsx
+++ b/chainforge/react-server/src/ResponseBoxes.tsx
@@ -20,11 +20,10 @@ import {
   LLMResponse,
   LLMResponseData,
 } from "./backend/typing";
-import StorageCache, { StringLookup } from "./backend/cache";
+import StorageCache, { MediaLookup } from "./backend/cache";
 import { IconCheck, IconChecks, IconX } from "@tabler/icons-react";
 import { getRatingKeyForResponse } from "./ResponseRatingToolbar";
 import useStore from "./store";
-import { MediaLookup } from "./backend/cache";
 
 // Lazy load the response toolbars
 const ResponseRatingToolbar = lazy(() => import("./ResponseRatingToolbar"));
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index 810d7169a..a8dc2013d 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -1485,7 +1485,9 @@ export async function call_ollama_provider(
   } else {
     // Text-only models
     query.prompt = prompt;
-    query.images = (await imagesToBase64(images ?? [])).map(getBase64DataFromDataURL);
+    query.images = (await imagesToBase64(images ?? [])).map(
+      getBase64DataFromDataURL,
+    );
     url += "generate";
   }
 
@@ -2764,7 +2766,7 @@ export function dataURLToBlob(dataURL: string): Blob {
  * Extracts the MIME type from a Data URL.
  * @param dataUrl The Data URL to extract the MIME type from.
  * @returns The MIME type as a string, or null if not found.
-*/
+ */
 function getMimeTypeFromDataURL(dataUrl: string): string | null {
   const match = dataUrl.match(/^data:([^;,]+)[;,]/);
   return match ? match[1] : null;
@@ -2841,3 +2843,14 @@ export const __http_url_to_base64 = (url: string) => {
     xhr.send();
   });
 };
+
+/**
+ * Removes one pair of double quotes wrapping `str`, if present.
+ * @param str The text to unwrap; non-string values pass through untouched
+ * @returns `str` without its outer quotes, or `str` itself when not wrapped
+ */
+export const stripWrappingQuotes = (str: string) => {
+  if (typeof str === "string" && /^"[\s\S]*"$/.test(str))
+    return str.substring(1, str.length - 1);
+  else return str;
+};

From ed86c9999686fea8dcdc21abecf1dcfbc17cd8b2 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Tue, 13 May 2025 11:53:55 -0400
Subject: [PATCH 26/35] wip

---
 chainforge/react-server/src/backend/evalgen/executor.ts | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 68c699272..d38fd1f9d 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -812,6 +812,11 @@ export default class EvaluationFunctionExecutor {
         const report: EvalFunctionReport | undefined =
           this.computeAlignmentStats(evalFunction);
 
+        if (!report) {
+          console.warn("Could not compute alignment stats for an eval function. Skipping.");
+          continue;
+        }
+
         // Save the report for this function
         if (!evalFunctionReport.has(criteria)) {
           evalFunctionReport.set(criteria, []);

From 65c24e149de4ebcf78bfa12c33e6282406e43b8a Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Tue, 13 May 2025 11:57:56 -0400
Subject: [PATCH 27/35] wip

---
 .../src/backend/evalgen/executor.ts           |  4 +-
 .../react-server/src/backend/evalgen/utils.ts |  1 +
 chainforge/react-server/src/backend/utils.ts  | 74 +++++++++++++++++++
 3 files changed, 78 insertions(+), 1 deletion(-)

diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index d38fd1f9d..7ffc30639 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -813,7 +813,9 @@ export default class EvaluationFunctionExecutor {
           this.computeAlignmentStats(evalFunction);
 
         if (!report) {
-          console.warn("Could not compute alignment stats for an eval function. Skipping.");
+          console.warn(
+            "Could not compute alignment stats for an eval function. Skipping.",
+          );
           continue;
         }
 
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index c0c3dd6a5..054a50b2c 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -19,6 +19,7 @@ import {
 } from "../backend";
 import {
   getVarsAndMetavars,
+  hashtagTemplateVars,
   llmResponseDataToString,
   retryAsyncFunc,
 } from "../utils";
diff --git a/chainforge/react-server/src/backend/utils.ts b/chainforge/react-server/src/backend/utils.ts
index a8dc2013d..887f8b22c 100644
--- a/chainforge/react-server/src/backend/utils.ts
+++ b/chainforge/react-server/src/backend/utils.ts
@@ -54,6 +54,7 @@ import {
 } from "@mirai73/bedrock-fm";
 import StorageCache, { StringLookup, MediaLookup } from "./cache";
 import Compressor from "compressorjs";
+import { Annotations } from "plotly.js";
 // import { Models } from "@mirai73/bedrock-fm/lib/bedrock";
 
 const ANTHROPIC_HUMAN_PROMPT = "\n\nHuman:";
@@ -2854,3 +2855,76 @@ export const stripWrappingQuotes = (str: string) => {
     return str.substring(1, str.length - 1);
   else return str;
 };
+
+/** Maps an accuracy to a color name, with cutoffs above 0.9, 0.7 and 0.5. */
+export const accuracyToColor = (acc: number) => {
+  if (acc > 0.9) return "green";
+  if (acc > 0.7) return "yellow";
+  return acc > 0.5 ? "orange" : "red";
+};
+
+/**
+ * Builds one Plotly text annotation per cell of the matrix z, in row-major order.
+ * @param x Column labels; x[j] is the x position of column j
+ * @param y Row labels; y[i] is the y position of row i
+ * @param z Cell values, indexed as z[row][column]
+ * @returns Annotations whose text is white below the largest z value, else black
+ */
+export const cmatrixTextAnnotations = (
+  x: string[],
+  y: string[],
+  z: number[][],
+) => {
+  const midVal = Math.max(...z.flat());
+  const annotations = y.flatMap((yLabel, i) =>
+    x.map((xLabel, j) => ({
+      xref: "x1",
+      yref: "y1",
+      x: xLabel,
+      y: yLabel,
+      text: z[i][j].toString(),
+      font: { color: z[i][j] < midVal ? "white" : "black" },
+      showarrow: false,
+    })),
+  );
+  return annotations as Partial<Annotations>[];
+};
+
+/**
+ * Prefixes every unescaped template variable in a string with a hashtag,
+ * turning {template} into {#template}. Escaped braces (\{ and \}) are copied
+ * through verbatim, and an opening brace that is already followed by '#' or
+ * that ends the string is left untouched.
+ *
+ * @param input - The string that may contain templates
+ * @returns A copy of the input with its templates in hashtagged form
+ */
+export function hashtagTemplateVars(input: string): string {
+  // Only braces count as escapable characters after a backslash
+  const isBrace = (ch: string | undefined) => ch === "{" || ch === "}";
+  let result = "";
+
+  // Single left-to-right pass over the characters
+  for (let pos = 0; pos < input.length; pos++) {
+    const current = input[pos];
+    const next = input[pos + 1]; // undefined past the end of the string
+
+    if (current === "\\" && isBrace(next)) {
+      // Escaped brace: copy both characters through unchanged
+      result += current + next;
+      pos++;
+      continue;
+    }
+
+    if (current === "{" && next !== undefined && next !== "#") {
+      // Opening brace of a template that isn't hashtagged yet
+      result += "{#";
+      continue;
+    }
+
+    // Any other character is copied as-is
+    result += current;
+  }
+
+  return result;
+}

From ef3045b48a25e26a950de474be14796fbf2d82a4 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Tue, 13 May 2025 13:30:58 -0400
Subject: [PATCH 28/35] Bug and typing fixing

---
 .../src/EvalGen/EvalGenWizard.tsx             |   23 +-
 .../react-server/src/EvalGen/FeedbackStep.tsx |   55 +-
 .../src/EvalGen/GradeResponsesStep.tsx        |    6 +-
 .../react-server/src/EvalGen/GradingView.tsx  |   27 +-
 .../src/EvalGen/PickCriteriaStep.tsx          |    8 +-
 .../src/EvalGen/ReportCardStep.tsx            |    4 +-
 .../react-server/src/EvalGen/WelcomeStep.tsx  |   31 +-
 chainforge/react-server/src/EvalGenModal.tsx  | 3330 ++++++++---------
 chainforge/react-server/src/MultiEvalNode.tsx |   23 +-
 chainforge/react-server/src/ResponseBoxes.tsx |    2 +-
 .../src/backend/evalgen/executor.ts           |   85 +-
 chainforge/react-server/src/styles.css        |    5 +-
 12 files changed, 1830 insertions(+), 1769 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index 07d205b72..d2782986a 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -1,3 +1,23 @@
+/**
+ * EvalGen 2.0
+ *
+ * Ian Arawjo, Shreya Shankar, J.D. Zamfirescu, Helen Weixu Chen
+ *
+ * This file and its directory concern the front-end of the evaluation generator, EvalGen.
+ * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
+ *
+ * Specifically, the modal lets users:
+ *  - make and refine criteria to grade on (on the left)
+ *  - grade responses (on the right)
+ *  - while in the backend, an LLM is generating candidate assertions and selecting the ones most aligned with user grades
+ * As the user grades responses, they add/refine existing criteria.
+ * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
+ * This is because of **criteria drift,** a phenomenon identified observing users in EvalGen 1.0 (unreleased).
+ *
+ * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
+ * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
+ */
+
 import React, { useCallback, useEffect, useMemo, useState } from "react";
 import {
   EvalCriteria,
@@ -262,11 +282,10 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
       opened={opened}
       onClose={onClose}
       // title="EvalGen Wizard"
-      size="90%"
+      size="95%"
       padding="md"
       // keepMounted
       // closeOnClickOutside={true}
-      style={{ position: "relative", left: "-5%" }}
       styles={{
         inner: {
           padding: "5%", // This creates space around the modal (10% total)
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
index c86fbef85..2ba0b24c7 100644
--- a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -8,6 +8,7 @@ import {
   Text,
   Textarea,
   Title,
+  Tooltip,
 } from "@mantine/core";
 import GradingView from "./GradingView";
 import { IconThumbDown, IconThumbUp } from "@tabler/icons-react";
@@ -95,7 +96,7 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
   }, [shownResponseIdx, responses]);
 
   return (
-    <Stack spacing="lg">
+    <Stack spacing="sm">
       <Title order={3}>Provide Feedback on Some Model Outputs</Title>
 
       <GradingView
@@ -107,31 +108,35 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
         gotoPrevResponse={prevResponse}
       />
 
-      <Flex justify="center" gap="50px" mb="xl">
-        <Button
-          color={grade === true ? "gray" : "red"}
-          variant={grade !== false ? "outline" : "filled"}
-          onClick={() => {
-            setGrade(grade !== false ? false : null);
-          }}
-        >
-          <IconThumbDown />
-          &nbsp;Bad!
-        </Button>
-        <Button
-          color={grade === false ? "gray" : "green"}
-          variant={grade !== true ? "outline" : "filled"}
-          onClick={() => {
-            setGrade(grade !== true ? true : null);
-          }}
-        >
-          <IconThumbUp />
-          &nbsp;Good!
-        </Button>
+      <Flex justify="center" gap="50px">
+        <Tooltip label="This response is bad!" withinPortal withArrow>
+          <Button
+            color={grade === true ? "gray" : "red"}
+            variant={grade !== false ? "outline" : "filled"}
+            onClick={() => {
+              setGrade(grade !== false ? false : null);
+            }}
+          >
+            <IconThumbDown />
+            &nbsp;Bad!
+          </Button>
+        </Tooltip>
+        <Tooltip label="This response is good!" withinPortal withArrow>
+          <Button
+            color={grade === false ? "gray" : "green"}
+            variant={grade !== true ? "outline" : "filled"}
+            onClick={() => {
+              setGrade(grade !== true ? true : null);
+            }}
+          >
+            <IconThumbUp />
+            &nbsp;Good!
+          </Button>
+        </Tooltip>
       </Flex>
       <Center mb={100}>
         <Stack spacing="xs" w="80%">
-          <Text>What&apos;s the reason for your score?</Text>
+          <Text>What&apos;s the reason for your grade? Explain why:</Text>
           <Flex align="center" justify="space-around" gap="lg">
             <Textarea
               value={annotation}
@@ -148,7 +153,9 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
             />
             <Button
               onClick={nextResponse}
-              disabled={grade === null || !annotation}
+              color="dark"
+              disabled={grade === null || (grade === false && !annotation)}
+              h={54}
             >
               Submit and Next
             </Button>
diff --git a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
index 078437738..076b03755 100644
--- a/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
+++ b/chainforge/react-server/src/EvalGen/GradeResponsesStep.tsx
@@ -455,13 +455,15 @@ const GradingResponsesStep: React.FC<GradingResponsesStepProps> = ({
                 backgroundColor: "#f0f0f0",
                 color: "#333",
                 fontFamily: "monospace",
+                fontSize: "8pt",
                 padding: "12px",
-                width: "calc(100% - 30px)",
+                lineHeight: "1.2",
+                width: "calc(100% - 10px)",
                 height: "200px",
                 overflowY: "auto",
                 borderRadius: "8px",
                 border: "1px solid #ddd",
-                marginRight: "20px", // Space on the right
+                marginRight: "10px", // Space on the right
               }}
               ref={(el) => {
                 if (el) {
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index 41f3767f4..a6bed5753 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -82,16 +82,25 @@ const GradingView: React.FC<GradingViewProps> = ({
         {/* Middle response box with chevron buttons < and > for going back and forward a response */}
         <Flex justify="center" align="center" mb="sm">
           {/* Go back to previous response */}
-          <Button variant="white" color="dark" onClick={gotoPrevResponse}>
-            <IconChevronLeft />
-          </Button>
+          <Tooltip label="To previous response" withArrow>
+            <Button
+              variant="filled"
+              color="dark"
+              onClick={gotoPrevResponse}
+              h={84}
+              p="10px 4px"
+              mr={4}
+            >
+              <IconChevronLeft />
+            </Button>
+          </Tooltip>
 
           {/* The response one is currently grading */}
           <div
             className="response-box"
             style={{
               backgroundColor: "#eee",
-              width: "80%",
+              width: "90%",
               maxHeight: "340px",
               overflowY: "scroll",
               borderColor: "black",
@@ -111,10 +120,12 @@ const GradingView: React.FC<GradingViewProps> = ({
           {/* Go forward to the next response */}
           <Tooltip label="To next response" withArrow>
             <Button
-              variant="white"
+              variant="filled"
               color="dark"
-              bg="transparent"
               onClick={gotoNextResponse}
+              h={84}
+              p="10px 4px"
+              ml={4}
             >
               <IconChevronRight />
             </Button>
@@ -126,7 +137,7 @@ const GradingView: React.FC<GradingViewProps> = ({
             style={{
               backgroundColor: "#fff",
               padding: "12px",
-              width: "31%",
+              width: "45%",
               borderRadius: "12px",
               borderWidth: "1px",
               borderStyle: "solid",
@@ -142,7 +153,7 @@ const GradingView: React.FC<GradingViewProps> = ({
             style={{
               backgroundColor: "#fff",
               padding: "12px",
-              width: "41%",
+              width: "45%",
               borderRadius: "2px",
             }}
           >
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index e6c775a9a..b9e57c6c4 100644
--- a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -122,8 +122,8 @@ export const CriteriaCard: React.FC<CriteriaCardProps> = function CriteriaCard({
   const reportAccuracyRing = useMemo(() => {
     if (!evalFuncReport) return undefined;
     return {
-      percent: Math.floor((evalFuncReport.alignment ?? 0) * 100),
-      color: accuracyToColor(evalFuncReport.alignment ?? 0),
+      percent: Math.floor((evalFuncReport.f1 ?? 0) * 100),
+      color: accuracyToColor(evalFuncReport.f1 ?? 0),
     };
   }, [evalFuncReport]);
 
@@ -503,6 +503,10 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
                 .then((crit) =>
                   setCriteria(crit ? criteria.concat(crit) : criteria),
                 )
+                .catch((err) => {
+                  console.error(err);
+                  setIsLoadingCriteria(0);
+                })
                 .finally(() => setIsLoadingCriteria(0));
             }}
           >
diff --git a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
index 9e1deea20..8e421bcf0 100644
--- a/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
+++ b/chainforge/react-server/src/EvalGen/ReportCardStep.tsx
@@ -57,6 +57,8 @@ const ReportCardStep: React.FC<ReportCardStepProps> = ({
     return cards;
   }, [report]);
 
+  console.log(report);
+
   return (
     <Stack spacing="lg">
       <Text align="center" size="lg" pl="sm" mb="lg">
@@ -95,7 +97,7 @@ const ReportCardStep: React.FC<ReportCardStepProps> = ({
         </Group>
       </Flex>
 
-      <ScrollArea mih={300} h={500} mah={500}>
+      <ScrollArea mih={300} h={400} mah={400}>
         <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
           {cards}
         </SimpleGrid>
diff --git a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
index 4902d05ec..18fcfc965 100644
--- a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
+++ b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
@@ -10,7 +10,7 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
     <Title order={2}>Welcome to the EvalGen Wizard</Title>
     <Text>
       This wizard will guide you through creating automated evaluators for LLM
-      responses that are aligned with your preferences. You`&apos;ll look at
+      responses that are aligned with your preferences. You&apos;ll look at
       data, define what you care about, apply those criteria to grade data, and
       refine your criteria as you see more outputs. EvalGen then generates
       automated evaluators that implement each criteria, chooses implementations
@@ -24,9 +24,10 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
       >
         empirical research at UIST 2024
       </Anchor>
-      , and is inspired by similar inductive processes in grounded theory and
-      heuristic evaluation. Currently, Evalgen:
+      , and is inspired by inductive processes in UX research (heuristic
+      evaluation and grounded theory).
     </Text>
+    <Text>Currently, EvalGen:</Text>
     <List>
       <List.Item>
         Only generates <b>assertions (pass/fail tests)</b>. Numeric and
@@ -42,15 +43,15 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
         be powerful enough to generate code. (By default, it is OpenAI.)
       </List.Item>
       <List.Item>
-        Should be run on the outputs of <b>already-run</b> Prompt Nodes (LLM
-        responses).
-      </List.Item>
-      <List.Item>
-        EvalGen will send off many requests during usage. 🔔{" "}
-        <b>By using Evalgen, you take full responsibility for credit usage.</b>
+        Should be run on the outputs of <b>already-run</b> Prompt Nodes
+        (you&apos;ve already collected some LLM responses).
       </List.Item>
+      <List.Item>EvalGen will send off many requests during usage.</List.Item>
     </List>
-    <Text>Currently, EvalGen does NOT:</Text>
+    <Text>
+      🔔 <b>By using EvalGen, you take full responsibility for credit usage.</b>{" "}
+      Currently, EvalGen does NOT:
+    </Text>
     <List>
       <List.Item>
         Work on imported spreadsheets of data (although if you are interested in
@@ -58,9 +59,10 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
       </List.Item>
       <List.Item>
         Generate code that uses third-party libraries. For safety, LLM-generated
-        Python code is run sandboxed in the browser with pyodide. (If your eval
-        criteria implementation must use a third-party library, we suggest you
-        use ChainForge’s genAI features on the specific eval node, outside this
+        Python code is run sandboxed in the browser with pyodide. Pyodide does
+        not have access to many libraries out-of-the-box. (If your eval criteria
+        implementation must use a third-party library, we suggest you use
+        ChainForge’s genAI features on the specific eval node, outside this
         wizard.)
       </List.Item>
     </List>
@@ -71,8 +73,7 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
     </ul> */}
     <Text>
       After EvalGen finishes, the chosen evaluators appear in the MultiEval
-      node. You can export evaluator details by right-clicking the node and
-      selecting Copy Eval Specs.
+      node.
     </Text>
     <Text>
       EvalGen is in Beta. To improve it, provide feedback on our Github Issues
diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index b35bd7d38..097bb1bb9 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -1,1665 +1,1665 @@
-/**
- * EvalGen 2.0
- *
- * Ian Arawjo, Shreya Shankar, J.D. Zamf., Helen Weixu Chen
- *
- * This file concerns the front-end to evaluation generator, EvalGen.
- * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
- *
- * Specifically, the modal lets users:
- *  - make and refine criteria to grade on (on the left)
- *  - grade responses (on the right)
- *  - while in the backend, an LLM is generating candidate assertions and selected the ones most aligned with user grades
- * As the user grades responses, they add/refine existing criteria.
- * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
- * This is because of **criteria drift,** a phenomenon identified observing users in EvalGen 1.0 (unreleased).
- *
- * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
- * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
- */
-import React, {
-  ReactNode,
-  forwardRef,
-  useCallback,
-  useEffect,
-  useImperativeHandle,
-  useMemo,
-  useState,
-} from "react";
-import { v4 as uuid } from "uuid";
-import {
-  Accordion,
-  ActionIcon,
-  Box,
-  Button,
-  Card,
-  Center,
-  Checkbox,
-  Code,
-  Collapse,
-  Divider,
-  Flex,
-  Grid,
-  Group,
-  Menu,
-  Modal,
-  Radio,
-  RingProgress,
-  ScrollArea,
-  SimpleGrid,
-  Skeleton,
-  Stack,
-  Text,
-  TextInput,
-  Textarea,
-  Title,
-  Tooltip,
-  rem,
-} from "@mantine/core";
-import { useDisclosure } from "@mantine/hooks";
-import {
-  // CriteriaGradeCount,
-  Dict,
-  LLMResponse,
-  PromptVarsDict,
-  RatingDict,
-  ResponseUID,
-} from "./backend/typing";
-import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
-import {
-  IconChevronDown,
-  IconChevronLeft,
-  IconChevronRight,
-  IconDots,
-  IconRobot,
-  IconStarFilled,
-  IconTerminal2,
-  IconThumbDown,
-  IconThumbUp,
-  IconTrash,
-  IconFlagFilled,
-  IconPencil,
-  IconSparkles,
-} from "@tabler/icons-react";
-import {
-  cleanMetavarsFilterFunc,
-  deepcopy,
-  sampleRandomElements,
-  transformDict,
-} from "./backend/utils";
-import {
-  extractUIDFromRatingKey,
-  getRatingKeyForResponse,
-} from "./ResponseRatingToolbar";
-import useStore from "./store";
-import StorageCache from "./backend/cache";
-import EvaluationFunctionExecutor from "./backend/evalgen/executor";
-import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
-import { escapeBraces } from "./backend/template";
-import { update } from "lodash";
-// import "./EvalGenModel.css";
-
-const INIT_CRITERIA: EvalCriteria[] = [
-  {
-    shortname: "Grammatical",
-    criteria: "The text is grammatically correct.",
-    eval_method: "expert",
-    uid: uuid(),
-    priority: 0,
-  },
-  {
-    shortname: "Tweet-length",
-    criteria: "The text is less than 144 characters.",
-    eval_method: "code",
-    uid: uuid(),
-    priority: 0,
-  },
-  {
-    shortname: "Bombastic",
-    criteria: "The message will drive views because it's controversial.",
-    eval_method: "expert",
-    uid: uuid(),
-    priority: 0,
-  },
-];
-
-const Contributor = ({
-  getStateValue,
-  style = { size: 22, thickness: 4 },
-}: {
-  getStateValue: (id: number) => number;
-  style: { size: number; thickness: number };
-}) => {
-  return (
-    <RingProgress
-      size={style.size}
-      thickness={style.thickness}
-      // label=""
-      sections={[
-        {
-          value: getStateValue(1),
-          color: "cyan",
-          tooltip: "You have successfully contributed 7 responses.",
-        },
-        {
-          value: getStateValue(2),
-          color: "orange",
-          tooltip: "You have successfully contributed 20 responses.",
-        },
-        {
-          value: getStateValue(3),
-          color: "green",
-          tooltip: "You have gone to buffet 100 times.",
-        },
-        {
-          value: getStateValue(4),
-          color: "grape",
-          tooltip: "You have made 21 nightmare",
-        },
-      ]}
-    />
-  );
-};
-
-const ThumbUpDownButtons = ({
-  grade,
-  onChangeGrade,
-  getGradeCount,
-}: {
-  grade: boolean | undefined;
-  onChangeGrade: (newGrade: boolean | undefined) => void;
-  getGradeCount: (grade: boolean | undefined) => number;
-}) => {
-  // console.log(
-  //   "getGradeCount",
-  //   getGradeCount(true),
-  //   getGradeCount(false),
-  //   getGradeCount(undefined),
-  // );
-  return (
-    <>
-      {/* Thumbs up/down buttons */}
-      <Button
-        color={grade === true ? "green" : "gray"}
-        m={0}
-        p={0}
-        variant="subtle"
-        onClick={() => {
-          // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
-          if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
-        }}
-      >
-        <div className="gradeContainer">
-          <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
-          <div className="gradeUpCount">{getGradeCount(true)}</div>
-        </div>
-      </Button>
-      <Button
-        color={grade === false ? "red" : "gray"}
-        m={0}
-        p={0}
-        variant="subtle"
-        onClick={() => {
-          // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
-          if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
-        }}
-      >
-        <div className="gradeContainer">
-          <IconThumbDown
-            size="14pt"
-            fill={grade === false ? "pink" : "white"}
-          />
-          <div className="gradeDownCount">{getGradeCount(false)}</div>
-        </div>
-      </Button>
-    </>
-  );
-};
-
-export interface CriteriaCardProps {
-  criterion: EvalCriteria;
-  onChange: (changedCriteria: EvalCriteria) => void;
-  onDelete: () => void;
-  initiallyOpen?: boolean;
-  grade: boolean | undefined;
-  onChangeGrade: (newGrade: boolean | undefined) => void;
-  getGradeCount: (grade: boolean | undefined) => number;
-  getStateValue: (stateId: number) => number;
-}
-
-const CriteriaCard: React.FC<CriteriaCardProps> = ({
-  criterion,
-  onChange,
-  onDelete,
-  initiallyOpen,
-  grade,
-  getGradeCount,
-  onChangeGrade,
-  getStateValue,
-}) => {
-  const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
-  const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
-
-  return (
-    <Stack spacing={0} ml={8}>
-      <Flex align="center">
-        <Group spacing="0px">
-          {/* The arrow chevron user can click to collapse/expand
-            <Button
-              color="gray"
-              p={0}
-              m={0}
-              variant="subtle"
-              mr="4px"
-              onClick={toggle}
-            >
-              {opened ? (
-                <IconChevronDown size="14pt" />
-              ) : (
-                <IconChevronRight size="14pt" />
-              )}
-            </Button> */}
-
-          {/* Thumbs up/down buttons */}
-          <ThumbUpDownButtons
-            grade={grade}
-            onChangeGrade={onChangeGrade}
-            getGradeCount={getGradeCount}
-          />
-
-          {/* Title of the criteria */}
-          <TextInput
-            value={title}
-            onChange={(e) => setTitle(e.target.value)}
-            onBlur={(e) => {
-              criterion.shortname = e.target.value;
-              if (onChange) onChange(criterion);
-            }}
-            placeholder="Criteria name"
-            variant="unstyled"
-            size="md"
-            ml="xs"
-            className="nodrag nowheel"
-            styles={{
-              input: {
-                padding: "0px",
-                height: "14pt",
-                minHeight: "0pt",
-                fontWeight: 500,
-              },
-            }}
-          />
-        </Group>
-
-        <Group spacing="4px" ml="auto">
-          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
-          <Tooltip
-            label={
-              criterion.eval_method === "code"
-                ? "Change to an LLM evaluator"
-                : "Change to a code evaluator"
-            }
-            withinPortal
-            withArrow
-          >
-            <Text
-              color="#999"
-              size="sm"
-              mr="6px"
-              onClick={() => {
-                criterion.eval_method =
-                  criterion.eval_method === "code" ? "expert" : "code";
-                if (onChange) onChange(criterion);
-              }}
-            >
-              {criterion.eval_method === "code" ? (
-                <Flex style={{ userSelect: "none" }}>
-                  <IconTerminal2 size="14pt" />
-                  &nbsp;Python
-                </Flex>
-              ) : (
-                <Flex style={{ userSelect: "none" }}>
-                  <IconRobot size="14pt" />
-                  &nbsp;LLM
-                </Flex>
-              )}
-            </Text>
-          </Tooltip>
-
-          {/* <Contributor getStateValue={getStateValue} /> */}
-
-          {/* Delete button (and any other criterion-specific changes in the future) */}
-          <ActionIcon variant="subtle" color="red" onClick={onDelete}>
-            <IconTrash style={{ width: rem(16), height: rem(16) }} />
-          </ActionIcon>
-        </Group>
-      </Flex>
-
-      <Textarea
-        value={criterion.criteria}
-        placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
-        ml={38}
-        onChange={(e) => {
-          criterion.criteria = e.target.value;
-          if (onChange) onChange(criterion);
-        }}
-        onClickCapture={(e) => e.stopPropagation()}
-        styles={{
-          input: {
-            border: "none",
-            borderWidth: "0px",
-            margin: "0px",
-            color: "#444",
-            background: "transparent",
-            lineHeight: 1.1,
-          },
-        }}
-        autosize
-        minRows={2}
-        maxRows={5}
-        fz="sm"
-        mb="xs"
-        c="dimmed"
-      />
-    </Stack>
-  );
-};
-
-export interface EvalGenModalRef {
-  trigger: (
-    resps: LLMResponse[],
-    setFinalReports: (reports: EvalGenReport) => void,
-  ) => void;
-}
-
-const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
-  function EvalGenModal(props, ref) {
-    const [opened, { open, close }] = useDisclosure(false);
-    const apiKeys = useStore((state) => state.apiKeys);
-    const globalState = useStore((store) => store.state);
-    const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
-    const [criteriaForDisplay, setCriteriaForDisplay] = useState<
-      EvalCriteria[]
-    >([]);
-
-    const [responses, setResponses] = useState<LLMResponse[]>([]);
-    const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
-      undefined,
-    );
-    const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
-      [],
-    );
-    const [shownResponseIdx, setShownResponseIdx] = useState(0);
-
-    const [annotation, setAnnotation] = useState<string | undefined>(undefined);
-    const [holisticGrade, setHolisticGrade] = useState<
-      "good" | "bad" | undefined
-    >(undefined);
-
-    // Per-criteria grades (indexed by uid of response, then uid of criteria)
-    const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
-    const setPerCriteriaGrade = (
-      responseUID: string,
-      criteriaUID: string,
-      newGrade: boolean | undefined,
-    ) => {
-      setGrades((grades) => {
-        if (!grades[responseUID]) grades[responseUID] = {};
-        grades[responseUID][criteriaUID] = newGrade;
-        // grades[responseUID] = { ...grades[responseUID] };
-        // console.error("grades-2", grades);
-        return { ...grades };
-      });
-      updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
-    };
-    const getStateValue = (stateId: number) => {
-      return Math.floor(Math.random() * 30 + 6);
-    };
-    const getGradeCount = (
-      // responseUID: string,
-      criteriaUID: string,
-      grade: boolean | undefined,
-    ) => {
-      // console.log("getGradeCount", responseUID, criteriaUID, grade);
-      // console.log("getGradeCount", grades);
-
-      let count = 0;
-      for (const respUid in grades) {
-        count += grade === grades[respUid][criteriaUID] ? 1 : 0;
-      }
-      return count;
-    };
-
-    // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
-    const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
-      null,
-    );
-
-    const [execProgress, setExecProgress] = useState(0);
-
-    // State variables to keep track of GPT call counts
-    const [numGPT4Calls, setNumGPT4Calls] = useState(0);
-    const [numGPT35Calls, setNumGPT35Calls] = useState(0);
-    const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
-
-    // For updating the global human ratings state
-    const setState = useStore((store) => store.setState);
-    const updateGlobalRating = useCallback(
-      (uid: string, label: string, payload: RatingDict) => {
-        const key = getRatingKeyForResponse(uid, label);
-        const safe_payload = deepcopy(payload);
-        setState(key, safe_payload);
-        StorageCache.store(key, safe_payload);
-      },
-      [setState],
-    );
-
-    // console.error("criteria", criteria);
-
-    // Update executor whenever resps, grades, or criteria change
-    React.useEffect(() => {
-      if (criteria.length > 0 && !executor) {
-        const existingGrades = transformDict(
-          globalState,
-          (key) => key.startsWith("r.") && key.endsWith(".grade"),
-          extractUIDFromRatingKey,
-          (_, val) => {
-            // The grades are in { idx: grade } format. Take only the first,
-            // as we only take the first response in this iteration of EvalGen:
-            if (typeof val !== "object") return undefined;
-            const gs = Object.values(val);
-            if (gs.length === 0) return undefined;
-            return gs[0];
-          },
-        );
-
-        const addLog = (message: string) => {
-          setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
-        };
-
-        const ex = new EvaluationFunctionExecutor(
-          getLikelyPromptTemplateAsContext(responses),
-          responses,
-          criteria,
-          (gpt4Calls, gpt35Calls) => {
-            // Callback to update GPT call counts
-            setNumGPT4Calls((num) => num + gpt4Calls);
-            setNumGPT35Calls((num) => num + gpt35Calls);
-          },
-          addLog,
-          existingGrades,
-          grades,
-        );
-        setExecutor(ex);
-
-        setExecProgress(0);
-
-        // ex.start((progress) => {
-        //   setExecProgress(progress?.success ?? 0);
-        // });
-      } else if (executor) {
-        // Update criteria in executor
-        executor.updateCriteria(criteria);
-      }
-
-      updateCriteriaForDisplay();
-    }, [criteria]);
-
-    const generateCriteria = (resps) => {
-      // Create criteria
-      // setIsLoadingCriteria((num) => num + 3);
-      genCriteriaFromContext(resps)
-        .then((crits) => {
-          console.log("crits #1", crits);
-          crits = [...criteria, ...crits];
-          console.log("crits #2", crits);
-          setCriteria(crits.map((c) => ({ ...c, uid: uuid() })));
-        })
-        .catch((err) => {
-          console.error(err);
-        })
-        .finally(() => {
-          setIsLoadingCriteria((num) => num - 3);
-          setNumGPT4Calls((num) => num + 1);
-        });
-    };
-
-    // const defaultOnFinish = (reports: string) => {};
-    const [onFinish, setOnFinish] = useState({
-      setFinalRpts: (reports: EvalGenReport) => {
-        // console.log("");
-      },
-    });
-
-    // Open the EvalGen wizard
-    const trigger = (
-      resps: LLMResponse[],
-      setFinalReports: (reports: EvalGenReport) => void,
-    ) => {
-      // We pass the responses here manually to ensure they remain the same
-      // for the duration of one EvalGen operation.
-      setResponses(resps);
-      gotoNextScreen("response");
-      // setFinalReports("A plenty response");
-      setOnFinish({
-        setFinalRpts: (reports: EvalGenReport) => {
-          close();
-          setFinalReports(reports);
-        },
-      });
-
-      const firstGrades = resps.reduce(
-        (acc: Dict<Dict<boolean | undefined>>, curr) => {
-          if (!(curr.uid in acc)) acc[curr.uid] = {};
-          return acc;
-        },
-        grades,
-      );
-      setGrades(firstGrades);
-
-      console.log("*****************************resps", resps);
-      if (criteria && criteria.length === 0) {
-        generateCriteria(resps);
-      }
-
-      setShownResponseIdx(0);
-      if (resps.length > 0) {
-        const first_resp = sampleRandomElements(resps, 1)[0];
-        // setShownResponse(first_resp);
-        setPastShownResponses([first_resp]);
-      } else {
-        // setShownResponse(undefined);
-        setPastShownResponses([]);
-      }
-      setShownResponse(resps[shownResponseIdx]);
-      open();
-    };
-    useImperativeHandle(ref, () => ({
-      trigger,
-    }));
-
-    const getLikelyPromptTemplateAsContext = (resps) => {
-      // Attempt to infer the prompt template used to generate the responses:
-      const prompts = new Set();
-      for (const resp_obj of resps) {
-        if (resp_obj?.metavars?.__pt !== undefined) {
-          prompts.add(resp_obj.metavars.__pt);
-        }
-      }
-
-      if (prompts.size === 0) return null;
-
-      // Pick a prompt template at random to serve as context....
-      return escapeBraces(prompts.values().next().value);
-    };
-
-    async function genCriteriaFromContext(responses) {
-      // Get the context from the input responses
-      const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
-
-      if (inputPromptTemplate === null) {
-        console.error("No context found. Cannot proceed.");
-        return;
-      }
-
-      // Attempt to generate criteria using an LLM
-      return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
-    }
-
-    // Add a criterion
-    const handleAddCriteria = (newCrit: EvalCriteria) => {
-      setCriteria((cs) => {
-        if (!newCrit.uid) newCrit.uid = uuid();
-        return [...cs, newCrit];
-      });
-    };
-
-    // Modify an existing criterion
-    const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
-      setCriteria((cs) => {
-        const idx = cs.findIndex((c) => c.uid === uid);
-        if (idx === -1) {
-          console.error("Could not find criteria with uid", uid);
-          return cs;
-        }
-        cs[idx] = newCrit;
-        return [...cs];
-      });
-    };
-
-    // Delete a criterion
-    const handleDeleteCriteria = (uid: string) => {
-      setCriteria((cs) => {
-        return cs.filter((c) => c.uid !== uid);
-      });
-    };
-
-    // Synthesize a new criteria according to the feedback given for the shown response
-    const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
-    const synthNewCriteriaWithLLM = (
-      response: string,
-      feedback: string,
-      grade: "good" | "bad" | "unknown",
-    ) => {
-      // Add a loading Skeleton
-      setIsLoadingCriteria((num) => num + 1);
-      // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
-      const prettyCriteria = criteria
-        .map((crit) => {
-          return `${crit.shortname}: ${crit.criteria}`;
-        })
-        .join("\n");
-
-      generateLLMEvaluationCriteria(
-        "",
-        apiKeys,
-        `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below. 
-
-TEXT OUTPUT: 
-\`\`\`
-${response}
-\`\`\`
-
-EXISTING CRITERIA:
-\`\`\`
-${prettyCriteria}
-\`\`\`
-
-GRADE (whether text was good or bad):
-\`\`\`
-${grade}
-\`\`\`
-
-FEEDBACK: 
-\`\`\`
-${feedback}
-\`\`\`
-
-If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
-        "gpt-4o", // llm
-      )
-        .then((evalCrits) => {
-          // Take only the first if evalCrits has a nonempty list
-          if (evalCrits[0]) {
-            setCriteria((crit) =>
-              crit.concat([
-                {
-                  ...evalCrits[0],
-                  uid: uuid(),
-                },
-              ]),
-            );
-          }
-          // Remove a loading Skeleton
-          setIsLoadingCriteria((num) => num - 1);
-
-          setNumGPT4Calls((num) => num + 1);
-        })
-        .catch((err) => {
-          console.error(err);
-          setIsLoadingCriteria((num) => num - 1);
-        });
-    };
-
-    // Goto next response in the queue (skipping grading the current one)
-    const nextResponse = () => {
-      if (responses.length === 0) return;
-
-      // Update annotation for current response (if any)
-      // TODO: Fix this for generate case when num resp per prompt > 1
-
-      if (
-        grades[shownResponse.uid] ||
-        holisticGrade ||
-        (annotation && annotation.trim())
-      ) {
-        executor?.setGradeForExample(
-          shownResponse.uid,
-          grades[shownResponse.uid],
-          holisticGrade,
-          annotation ? annotation.trim() : null,
-        );
-      }
-
-      if (
-        shownResponse &&
-        annotation &&
-        typeof annotation === "string" &&
-        annotation.trim().length > 0
-      ) {
-        console.log(
-          "setting annotation for resp",
-          shownResponse.uid,
-          annotation,
-        );
-        updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
-        setAnnotation("");
-      }
-
-      if (shownResponse && holisticGrade) {
-        updateGlobalRating(shownResponse.uid, "grade", {
-          0: holisticGrade === "good",
-        });
-      }
-
-      if (shownResponse && grades[shownResponse.uid]) {
-        updateGlobalRating(
-          shownResponse.uid,
-          "perCriteriaGrades",
-          grades[shownResponse.uid],
-        );
-      }
-
-      // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
-      setHolisticGrade(null);
-
-      if (shownResponseIdx < pastShownResponses.length - 1) {
-        // If we are not at the end of the history of shown responses, then show the next response:
-        setShownResponse(pastShownResponses[shownResponseIdx + 1]);
-        setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
-      } else {
-        // We are at the end of the history; pick the next response off the stack:
-        // TODO: Make this unique (maybe by removing picked responses from the list!)
-        let num_tries = 3;
-        let next_resp = executor?.getNextExampleToGrade();
-        while (
-          num_tries > 0 &&
-          (!next_resp ||
-            pastShownResponses.some((r) => r.uid === next_resp?.uid))
-        ) {
-          // We're presenting a response that's already been shown. Try again.
-          // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
-          if (next_resp && num_tries === 3)
-            next_resp =
-              executor?.getNextExampleToGrade() ??
-              sampleRandomElements(responses, 1)[0];
-          // Otherwise we just choose a response at random:
-          else next_resp = sampleRandomElements(responses, 1)[0];
-          num_tries -= 1;
-        }
-        // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
-        // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
-        setShownResponse(next_resp ?? undefined);
-        if (next_resp)
-          setPastShownResponses(pastShownResponses.concat(next_resp));
-        setShownResponseIdx(pastShownResponses.length);
-      }
-      updateShownResponseUniqueIndex();
-    };
-
-    // Go back to previously shown response
-    const prevResponse = () => {
-      if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
-      setShownResponse(pastShownResponses[shownResponseIdx - 1]);
-      setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
-      updateShownResponseUniqueIndex();
-    };
-
-    const updateShownResponseUniqueIndex = () => {
-      let idx = 0;
-      for (const resp of responses) {
-        if (resp === shownResponse) {
-          setShownResponseUniqueIdx(idx);
-          break;
-        }
-        idx++;
-      }
-    };
-
-    const nextResponse2 = () => {
-      if (responses.length === 0) return;
-      if (shownResponseIdx < responses.length - 1) {
-        // setShownResponse(responses[shownResponseIdx + 1]);
-        setShownResponseIdx(shownResponseIdx + 1);
-      }
-    };
-
-    const prevResponse2 = () => {
-      if (shownResponseIdx > 0) {
-        // setShownResponse(responses[shownResponseIdx - 1]);
-        setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
-      }
-    };
-
-    React.useEffect(() => {
-      setShownResponse(responses[shownResponseIdx]);
-    }, [shownResponseIdx]);
-
-    const estimateGPTCalls = () => {
-      return executor
-        ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
-        : "# estimated GPT calls not available.";
-    };
-
-    const updateCriteriaForDisplay = () => {
-      const highCriteria = criteria.filter((c) => c.priority === 1);
-      const lowCriteria = criteria.filter((c) => c.priority === 0);
-      setCriteriaForDisplay(highCriteria.concat(lowCriteria));
-    };
-    useEffect(() => {
-      const highCriteria = criteria.filter((c) => c.priority === 1);
-      const lowCriteria = criteria.filter((c) => c.priority === 0);
-      setCriteriaForDisplay(highCriteria.concat(lowCriteria));
-    }, [criteria]);
-
-    const [screen, setScreen] = useState("");
-    const gotoNextScreen = (screenName: string) => {
-      setScreen(screenName);
-    };
-
-    // const [onFinish, setOnFinish] = useState(null);
-
-    return (
-      <Modal
-        size="95%"
-        keepMounted
-        opened={opened}
-        onClose={close}
-        closeOnClickOutside={true}
-        style={{ position: "relative", left: "-5%" }}
-      >
-        {screen === "response" && (
-          <Grid h={window?.innerHeight * 0.8}>
-            <Grid.Col span={8}>
-              <Stack justify="space-between">
-                {/* View showing the response the user is currently grading */}
-                <GradingView
-                  shownResponse={shownResponse}
-                  shownResponseIdx={shownResponseIdx}
-                  // shownResponseIdx={shownResponseUniqueIdx}
-                  responseCount={responses.length}
-                  numGPT4Calls={numGPT4Calls}
-                  numGPT35Calls={numGPT35Calls}
-                  logs={logs}
-                  gotoNextResponse={nextResponse2}
-                  gotoPrevResponse={prevResponse2}
-                  estimateGPTCalls={estimateGPTCalls}
-                  gotoNextScreen={gotoNextScreen}
-                />
-
-                {/* Progress bar */}
-                {/* <Flex justify="left" align="center" gap="md">
-                <Stack w="100%" spacing={4}>
-                  <Text color="#aaa" size="sm">
-                    {bottomBar.progressLabel}
-                  </Text>
-                  <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
-                </Stack>
-
-                <Button
-                  onClick={handleDone}
-                  variant={bottomBar.buttonStyle}
-                  disabled={bottomBar.buttonDisabled}
-                >
-                  {bottomBar.buttonLabel}
-                </Button>
-              </Flex> */}
-              </Stack>
-            </Grid.Col>
-            <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
-              <Center>
-                <Title order={3} ml={8} mt="sm" mb="md">
-                  Rubric
-                </Title>
-              </Center>
-
-              <div
-                style={{
-                  display: "flex",
-                  flexDirection: "column",
-                }}
-              >
-                <div style={{ flex: 2, overflowY: "auto" }}>
-                  {criteriaForDisplay.map((e) => (
-                    <CriteriaCard
-                      criterion={e}
-                      key={e.uid}
-                      onChange={(newCrit) =>
-                        handleChangeCriteria(newCrit, e.uid)
-                      }
-                      onDelete={() => handleDeleteCriteria(e.uid)}
-                      grade={
-                        shownResponse
-                          ? grades[shownResponse.uid][e.uid]
-                          : undefined
-                      }
-                      getGradeCount={(grade) => {
-                        return shownResponse
-                          ? getGradeCount(
-                              // shownResponse.uid,
-                              e.uid,
-                              grade,
-                            )
-                          : 0;
-                      }}
-                      onChangeGrade={(newGrade) => {
-                        if (shownResponse)
-                          setPerCriteriaGrade(
-                            shownResponse.uid,
-                            e.uid,
-                            newGrade,
-                          );
-                      }}
-                      initiallyOpen={true}
-                      getStateValue={(stateId) => getStateValue(stateId)}
-                    />
-                  ))}
-                  {isLoadingCriteria > 0 ? (
-                    Array.from(
-                      { length: isLoadingCriteria },
-                      (v: unknown, idx: number) => (
-                        <Skeleton key={idx} h={80} mb={4} />
-                      ),
-                    )
-                  ) : (
-                    <></>
-                  )}
-
-                  <div className="criteriaButtons">
-                    <Button
-                      leftIcon={<IconPencil size={14} />}
-                      variant="subtle"
-                      color="gray"
-                      // gradient={{ from: "blue", to: "green", deg: 90 }}
-                      onClick={() => {
-                        handleAddCriteria({
-                          shortname: "New Criteria",
-                          criteria: "",
-                          eval_method: "code",
-                          priority: 0,
-                          uid: uuid(),
-                        });
-                      }}
-                    >
-                      Add a new criteria
-                    </Button>
-                    <Button
-                      leftIcon={<IconSparkles size={14} />}
-                      variant="subtle"
-                      color="gray"
-                      // gradient={{ from: "blue", to: "green", deg: 90 }}
-                      onClick={() => {
-                        generateCriteria(responses);
-                      }}
-                    >
-                      Suggest Criteria
-                    </Button>
-                  </div>
-                </div>
-
-                <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
-                  <Divider mt="lg" />
-                  <Title mb="0px" order={4}>
-                    Suggest New Criteria Based on the Feedback
-                  </Title>
-                  <Textarea
-                    value={annotation}
-                    onChange={(e) => setAnnotation(e.target.value)}
-                    description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
-                    mb="sm"
-                  />
-                  <Radio.Group
-                    name="favoriteFramework"
-                    label="Rate the response holistically:"
-                    value={holisticGrade}
-                    onChange={(v) => setHolisticGrade(v as "good" | "bad")}
-                    withAsterisk
-                    mb="md"
-                  >
-                    <Group mt="xs">
-                      <Radio value="good" label="Good" />
-                      <Radio value="bad" label="Bad" />
-                      <span>
-                        &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
-                      </span>
-                      <Button
-                        color="green"
-                        variant="filled"
-                        disabled={
-                          !holisticGrade ||
-                          annotation === undefined ||
-                          annotation.length === 0
-                        }
-                        onClick={() => {
-                          synthNewCriteriaWithLLM(
-                            shownResponse?.responses[0].toString() ?? "",
-                            annotation ?? "",
-                            holisticGrade ?? "unknown",
-                          );
-
-                          nextResponse();
-                        }}
-                      >
-                        + Submit Feedback
-                      </Button>
-                    </Group>
-                  </Radio.Group>
-                </Stack>
-              </div>
-            </Grid.Col>
-          </Grid>
-        )}
-        {screen === "report" && (
-          <Grid>
-            <ReportCardView
-              report={{
-                criteria: criteria,
-                failureCoverage: 99.2,
-                falseFailureRate: 66.7,
-              }}
-              onFinish={(reports: EvalGenReport) => {
-                onFinish.setFinalRpts(reports);
-              }}
-              getGradeCount={(crit: EvalCriteria, grade: boolean) => {
-                return shownResponse
-                  ? getGradeCount(
-                      // shownResponse.uid,
-                      crit.uid,
-                      grade,
-                    )
-                  : 0;
-              }}
-              getStateValue={(stateId) => getStateValue(stateId)}
-            />
-          </Grid>
-        )}
-      </Modal>
-    );
-  },
-);
-
-const HeaderText = ({ children }: { children: ReactNode }) => {
-  return (
-    <Text size="xl" fw={500} pl="sm" mb="lg">
-      {children}
-    </Text>
-  );
-};
-
-interface GradingViewProps {
-  shownResponse: LLMResponse | undefined;
-  shownResponseIdx: number;
-  responseCount: number;
-  numGPT4Calls: number;
-  numGPT35Calls: number;
-  logs: { date: Date; message: string }[];
-  gotoPrevResponse: () => void;
-  gotoNextResponse: () => void;
-  estimateGPTCalls: () => string;
-  gotoNextScreen: (screenName: string) => void;
-}
-
-const GradingView: React.FC<GradingViewProps> = ({
-  shownResponse,
-  shownResponseIdx,
-  responseCount,
-  numGPT4Calls,
-  numGPT35Calls,
-  logs,
-  gotoPrevResponse,
-  gotoNextResponse,
-  estimateGPTCalls,
-  gotoNextScreen,
-}) => {
-  // Calculate inner values only when shownResponse changes
-  const responseText = useMemo(
-    () =>
-      shownResponse && shownResponse.responses?.length > 0
-        ? shownResponse.responses[0].toString()
-        : "",
-    [shownResponse],
-  );
-
-  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
-  const varsDivs = useMemo(() => {
-    const combined_vars_metavars = shownResponse
-      ? {
-          ...shownResponse.vars,
-          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
-        }
-      : {};
-
-    // console.log("**************shownResponse", shownResponse);
-    return Object.entries(combined_vars_metavars).map(([varname, val]) => (
-      <div key={varname} className="grade-resp-var-container">
-        <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
-        <span className="response-var-value linebreaks">{val}</span>
-      </div>
-    ));
-  }, [shownResponse]);
-
-  // const [shownResponseIdx, setShownResponseIdx] = useState(0);
-  // const [shownResponses, setShownResponses] = useState<LLMResponse[]>([]);
-  // React.useEffect(() => {
-  //   console.error("current response", shownResponse);
-  //   if (shownResponse && !shownResponses.includes(shownResponse)) {
-  //     shownResponses.push(shownResponse);
-  //     setShownResponses(shownResponses);
-  //     setShownResponseIdx(shownResponses.length - 1);
-  //     console.error("current response is saved.", shownResponses.length);
-  //   } else {
-  //     console.error("current response already saved.");
-  //     for (const [idx, resp] of shownResponses.entries()) {
-  //       if (shownResponse === resp) {
-  //         setShownResponseIdx(idx);
-  //         break;
-  //       }
-  //     }
-  //   }
-  // }, [shownResponse]);
-
-  return (
-    <Stack justify="space-between" mih={500}>
-      <Box>
-        {/* Top header */}
-        <Flex justify="center">
-          <HeaderText>
-            {/* What do you think of this response? */}
-            What do you think of response #{shownResponseIdx + 1} of{" "}
-            {responseCount}?
-          </HeaderText>
-        </Flex>
-        {/* Middle response box with chevron buttons < and > for going back and forward a response */}
-        <Flex justify="center" align="center" mb="sm">
-          {/* Go back to previous response */}
-          <Button variant="white" color="dark" onClick={gotoPrevResponse}>
-            <IconChevronLeft />
-          </Button>
-
-          {/* The response one is currently grading */}
-          <div
-            className="response-box"
-            style={{
-              backgroundColor: "#eee",
-              width: "80%",
-              maxHeight: "340px",
-              overflowY: "scroll",
-              borderColor: "black",
-              borderStyle: "solid",
-            }}
-          >
-            <div className="response-item-llm-name-wrapper">
-              <div
-                className="small-response"
-                style={{ fontSize: "11pt", padding: "12pt" }}
-              >
-                {responseText}
-              </div>
-            </div>
-          </div>
-
-          {/* Go forward to the next response */}
-          <Tooltip label={estimateGPTCalls()} withArrow>
-            <Button variant="white" color="dark" onClick={gotoNextResponse}>
-              <IconChevronRight />
-            </Button>
-          </Tooltip>
-        </Flex>
-        {/* Views for the vars (inputs) that generated this response, and the concrete prompt */}
-        <Flex justify="center" mb="xl" gap="lg">
-          <div
-            style={{
-              backgroundColor: "#fff",
-              padding: "12px",
-              width: "31%",
-              borderRadius: "12px",
-              borderWidth: "1px",
-              borderStyle: "solid",
-            }}
-          >
-            Vars
-            <hr />
-            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
-              {varsDivs}
-            </div>
-          </div>
-          <div
-            style={{
-              backgroundColor: "#fff",
-              padding: "12px",
-              width: "41%",
-              borderRadius: "2px",
-            }}
-          >
-            Prompt
-            <hr />
-            <div
-              className="monofont linebreaks"
-              style={{
-                maxHeight: "160px",
-                overflowY: "scroll",
-                fontSize: "10pt",
-                lineHeight: "1.2",
-              }}
-            >
-              {prompt}
-            </div>
-          </div>
-        </Flex>
-        <Flex direction="column">
-          <Flex justify="space-between" align="center">
-            <Text size="lg" weight={500} mb="sm">
-              LLM Activity
-            </Text>
-            {/* GPT Call Tally */}
-            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
-              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
-              GPT-3.5-Turbo-16k calls.
-            </Text>
-          </Flex>
-          <div
-            style={{
-              backgroundColor: "#f0f0f0",
-              color: "#333",
-              fontFamily: "monospace",
-              padding: "12px",
-              width: "calc(100% - 30px)",
-              height: "200px",
-              overflowY: "auto",
-              borderRadius: "8px",
-              border: "1px solid #ddd",
-              marginRight: "20px", // Space on the right
-            }}
-            ref={(el) => {
-              if (el) {
-                el.scrollTop = el.scrollHeight;
-              }
-            }}
-          >
-            {logs.map((log, index) => (
-              <div key={index}>
-                <span style={{ color: "#4A90E2" }}>
-                  {log.date.toLocaleString()} -{" "}
-                </span>
-                <span>{log.message}</span>
-              </div>
-            ))}
-          </div>
-        </Flex>
-      </Box>
-      <div>
-        <Center>
-          <Button
-            leftIcon={<IconSparkles size={14} />}
-            variant="gradient"
-            gradient={{ from: "blue", to: "green", deg: 45 }}
-            onClick={() => {
-              // console.log("(3) gotoNextScreen", gotoNextScreen);
-              gotoNextScreen("report");
-            }}
-          >
-            I&apos;m done. Access EvalGen Report!
-          </Button>
-        </Center>
-      </div>
-    </Stack>
-  );
-};
-
-interface ReportCardViewProps {
-  report: EvalGenReport;
-  // recomputeAlignment,
-  onFinish: (reports: EvalGenReport) => void;
-  getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
-  getStateValue: (stateId: number) => number;
-}
-
-// const ReportCardScreen = () => {
-const ReportCardView: React.FC<ReportCardViewProps> = ({
-  report,
-  // recomputeAlignment,
-  onFinish,
-  getGradeCount,
-  getStateValue,
-}) => {
-  // The criteria cards, now with report information
-
-  const [finalReport, setFinalReport] = useState(report);
-
-  const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
-    if (isSelected) {
-      finalReport.criteria.push(criterion);
-    } else {
-      finalReport.criteria = finalReport.criteria.filter(
-        (c) => c !== criterion,
-      );
-    }
-    setFinalReport(finalReport);
-  };
-  const cards = useMemo(() => {
-    const res = [];
-
-    // Iterate through selected eval functions and create cards
-    // for (const selectedFunc of report.selectedEvalFunctions) {
-    //   const crit = selectedFunc.evalCriteria;
-    //   // Find corresponding report in allEvalFunctionReports map from criteria to list
-    //   const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
-    //   const evalFuncReport = critEvalFuncReports.find(
-    //     (rep) => rep.evalFunction === selectedFunc,
-    //   );
-
-    //   // Get the functions that were not selected for this criteria
-    //   const otherFuncs = critEvalFuncReports.filter(
-    //     (rep) => rep.evalFunction !== selectedFunc,
-    //   );
-    for (const crit of report.criteria) {
-      res.push(
-        <ReportCriteriaCard
-          criterion={crit}
-          key={crit.uid}
-          // onCheck={(checked) => {
-          //   crit.selected = checked;
-          //   recomputeAlignment();
-          // }}
-          getGradeCount={getGradeCount}
-          getStateValue={getStateValue}
-          onSelect={onSelect}
-        />,
-      );
-    }
-
-    return res;
-  }, [report]);
-
-  return (
-    report && (
-      <div>
-        <Text align="center" size="lg" pl="sm" mb="lg">
-          Chosen Functions and Alignment
-        </Text>
-
-        {/* Show coverage and false failure rate numbers */}
-        <Flex justify="center" gap="md" mb="lg">
-          <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
-            <Card
-              shadow="sm"
-              padding="md"
-              radius="md"
-              style={{ backgroundColor: "#f0f0f0" }}
-            >
-              <Text weight={500} size="md">
-                Coverage of Bad Responses
-              </Text>
-              <Text color="blue" weight={700} size="md">
-                {report.failureCoverage.toFixed(2)}%
-              </Text>
-            </Card>
-            <Card
-              shadow="sm"
-              padding="md"
-              radius="md"
-              style={{ backgroundColor: "#f0f0f0" }}
-            >
-              <Text weight={500} size="md">
-                False Failure Rate
-              </Text>
-              <Text color="red" weight={700} size="md">
-                {report.falseFailureRate.toFixed(2)}%
-              </Text>
-            </Card>
-          </Group>
-        </Flex>
-
-        <ScrollArea mih={300} h={500} mah={500}>
-          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
-            {cards}
-          </SimpleGrid>
-        </ScrollArea>
-
-        <Flex justify="center" gap={12} mt="xs">
-          <Button
-            onClick={() => {
-              // console.log("finalReport", finalReport);
-              onFinish(finalReport);
-            }}
-          >
-            Finish with selected evaluators
-          </Button>
-        </Flex>
-      </div>
-    )
-  );
-};
-
-interface ReportCriteriaCardProps {
-  criterion: EvalCriteria;
-  // onChange: (changedCriteria: EvalCriteria) => void;
-  // onDelete: () => void;
-  // initiallyOpen?: boolean;
-  // grade: boolean | undefined;
-  // onChangeGrade: (newGrade: boolean | undefined) => void;
-  getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
-  getStateValue: (stateId: number) => number;
-  onSelect: (criterion: EvalCriteria, isChecked: boolean) => void;
-}
-
-const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
-  criterion,
-  // onChange,
-  // onDelete,
-  // initiallyOpen,
-  // grade,
-  getGradeCount,
-  // onChangeGrade,
-  getStateValue,
-  onSelect,
-}) => {
-  // const [opened, { toggle }] = useDisclosure(true);
-  // const [title, setTitle] = useState(criterion.shortname);
-  const [checked, setChecked] = useState(true);
-
-  // Simulates eval functions that are expected to be passed in later on (TODO)
-  const evalFuncs = [
-    { evalFunction: { code: "To be provided (1) ..." } },
-    { evalFunction: { code: "To be provided (2) ..." } },
-    { evalFunction: { code: "To be provided (3) ..." } },
-  ];
-  const unselectedImplementations = evalFuncs.map((item) => (
-    <div key={uuid()}>
-      <Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
-        {item.evalFunction.code}
-      </Code>
-      <Divider />
-    </div>
-  ));
-
-  return (
-    // <Card withBorder mb={4} radius="md" style={{ cursor: "default" }}>
-    <Card
-      shadow="sm"
-      padding="sm"
-      pl="md"
-      pb="xl"
-      radius="md"
-      withBorder
-      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
-    >
-      <div
-        // onClick={() => setChecked(!checked)}
-        onKeyUp={(e) => e.preventDefault()}
-        className="checkcard"
-      >
-        {/* <Card.Section withBorder pl="8px">
-          <Flex align="center">
-            <Group spacing="0px"> */}
-        {/* The arrow chevron user can click to collapse/expand */}
-        {/* <Button
-                color="gray"
-                p={0}
-                m={0}
-                variant="subtle"
-                mr="4px"
-                onClick={toggle}
-              >
-                {opened ? (
-                  <IconChevronDown size="14pt" />
-                ) : (
-                  <IconChevronRight size="14pt" />
-                )}
-              </Button> */}
-
-        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
-          <Checkbox
-            checked={checked}
-            onChange={() => {
-              setChecked(!checked);
-              if (onSelect) onSelect(criterion, !checked);
-            }}
-            tabIndex={-1}
-            size="xs"
-            mr="sm"
-            mt="xs"
-            styles={{ input: { cursor: "pointer" } }}
-            aria-hidden
-          />
-        </Tooltip>
-
-        {/* Thumbs up/down buttons - disable for now */}
-        {/* <ReadOnlyThumbUpDownButtons
-                  upCount={getGradeCount(criterion, true)}
-                  downCount={getGradeCount(criterion, false)}
-                /> */}
-
-        <div style={{ width: "100%" }}>
-          {/* Title of the criteria */}
-          <TextInput
-            value={criterion.shortname}
-            // placeholder="Criteria name"
-            readOnly
-            variant="unstyled"
-            size="sm"
-            ml="xs"
-            className="nodrag nowheel"
-            styles={{
-              input: {
-                border: "none",
-                borderWidth: "0px",
-                padding: "0px",
-                background: "transparent",
-                fontWeight: 500,
-                fontSize: "12pt",
-                margin: "0px",
-                height: "auto",
-                minHeight: "auto",
-              },
-            }}
-          />
-          {/* </Group> */}
-
-          {/* <Group spacing="4px" ml="auto"> */}
-
-          {/* <Button
-                  color={criterion.priority <= 0 ? "gray" : "red"}
-                  m={0}
-                  p={0}
-                  variant="subtle"
-                >
-                  <IconFlagFilled size="14pt" />
-                </Button> */}
-          {/* </Group>
-            </Flex>
-          </Card.Section> */}
-
-          {/* Description of the criteria */}
-          {/* <Card.Section p="0px"> */}
-          {/* <Collapse in={opened}> */}
-          <Textarea
-            value={criterion.criteria}
-            // placeholder="Describe here."
-            readOnly
-            // onClickCapture={(e) => e.stopPropagation()}
-            styles={{
-              input: {
-                border: "none",
-                borderWidth: "0px",
-                paddingTop: "0px !important",
-                paddingLeft: "0px",
-                margin: "0px",
-                color: "#444",
-                background: "transparent",
-                lineHeight: 1.1,
-              },
-            }}
-            autosize
-            minRows={2}
-            maxRows={5}
-            fz="sm"
-            mb="xs"
-            c="dimmed"
-          />
-
-          {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
-          <Text color="#999" size="sm" mr="6px">
-            {criterion.eval_method === "code" ? (
-              <Flex style={{ userSelect: "none" }}>
-                <IconTerminal2 size="14pt" />
-                &nbsp;Python
-              </Flex>
-            ) : (
-              <Flex style={{ userSelect: "none" }}>
-                <IconRobot size="14pt" />
-                &nbsp;LLM
-              </Flex>
-            )}
-          </Text>
-        </div>
-        <Stack spacing={0}>
-          <Contributor
-            getStateValue={getStateValue}
-            style={{ size: 90, thickness: 12 }}
-          />
-          <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
-            Alignment with your grades
-          </Text>
-        </Stack>
-      </div>
-      {/* </Collapse> */}
-      {/* </Card.Section> */}
-      <div>
-        <Accordion>
-          <Accordion.Item
-            key={"Show Bad Implementations"}
-            value={"Show Bad Implementations"}
-          >
-            <Accordion.Control>
-              <Text size="sm"> Show Bad Implementations </Text>
-            </Accordion.Control>
-            <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
-          </Accordion.Item>
-        </Accordion>
-      </div>
-    </Card>
-  );
-};
-
-const ReadOnlyThumbUpDownButtons = ({
-  upCount,
-  downCount,
-}: {
-  upCount: number;
-  downCount: number;
-  // grade: boolean | undefined;
-  // onChangeGrade: (newGrade: boolean | undefined) => void;
-  // getGradeCount: (grade: boolean | undefined) => number;
-}) => {
-  return (
-    <>
-      {/* Thumbs up/down buttons */}
-      <Button color={"green"} m={0} p={0} variant="subtle">
-        <div className="gradeContainer">
-          <IconThumbUp size="14pt" fill={"#aea"} />
-          <div className="gradeUpCount">{upCount}</div>
-        </div>
-      </Button>
-      <Button color={"red"} m={0} p={0} variant="subtle">
-        <div className="gradeContainer">
-          <IconThumbDown size="14pt" fill={"pink"} />
-          <div className="gradeDownCount">{downCount}</div>
-        </div>
-      </Button>
-    </>
-  );
-};
-
-// export default { EvalGenModal, ReportCardScreen };
-export default EvalGenModal;
+// /**
+//  * EvalGen 2.0
+//  *
+//  * Ian Arawjo, Shreya Shankar, J.D. Zamf., Helen Weixu Chen
+//  *
+//  * This file concerns the front-end to the evaluation generator, EvalGen.
+//  * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
+//  *
+//  * Specifically, the modal lets users:
+//  *  - make and refine criteria to grade on (on the left)
+//  *  - grade responses (on the right)
+//  *  - while in the backend, an LLM is generating candidate assertions and selecting the ones most aligned with user grades
+//  * As the user grades responses, they add/refine existing criteria.
+//  * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
+//  * This is because of **criteria drift,** a phenomenon identified while observing users in EvalGen 1.0 (unreleased).
+//  *
+//  * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
+//  * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
+//  */
+// import React, {
+//   ReactNode,
+//   forwardRef,
+//   useCallback,
+//   useEffect,
+//   useImperativeHandle,
+//   useMemo,
+//   useState,
+// } from "react";
+// import { v4 as uuid } from "uuid";
+// import {
+//   Accordion,
+//   ActionIcon,
+//   Box,
+//   Button,
+//   Card,
+//   Center,
+//   Checkbox,
+//   Code,
+//   Collapse,
+//   Divider,
+//   Flex,
+//   Grid,
+//   Group,
+//   Menu,
+//   Modal,
+//   Radio,
+//   RingProgress,
+//   ScrollArea,
+//   SimpleGrid,
+//   Skeleton,
+//   Stack,
+//   Text,
+//   TextInput,
+//   Textarea,
+//   Title,
+//   Tooltip,
+//   rem,
+// } from "@mantine/core";
+// import { useDisclosure } from "@mantine/hooks";
+// import {
+//   // CriteriaGradeCount,
+//   Dict,
+//   LLMResponse,
+//   PromptVarsDict,
+//   RatingDict,
+//   ResponseUID,
+// } from "./backend/typing";
+// import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
+// import {
+//   IconChevronDown,
+//   IconChevronLeft,
+//   IconChevronRight,
+//   IconDots,
+//   IconRobot,
+//   IconStarFilled,
+//   IconTerminal2,
+//   IconThumbDown,
+//   IconThumbUp,
+//   IconTrash,
+//   IconFlagFilled,
+//   IconPencil,
+//   IconSparkles,
+// } from "@tabler/icons-react";
+// import {
+//   cleanMetavarsFilterFunc,
+//   deepcopy,
+//   sampleRandomElements,
+//   transformDict,
+// } from "./backend/utils";
+// import {
+//   extractUIDFromRatingKey,
+//   getRatingKeyForResponse,
+// } from "./ResponseRatingToolbar";
+// import useStore from "./store";
+// import StorageCache from "./backend/cache";
+// import EvaluationFunctionExecutor from "./backend/evalgen/executor";
+// import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
+// import { escapeBraces } from "./backend/template";
+// import { update } from "lodash";
+// // import "./EvalGenModel.css";
+
+// const INIT_CRITERIA: EvalCriteria[] = [
+//   {
+//     shortname: "Grammatical",
+//     criteria: "The text is grammatically correct.",
+//     eval_method: "expert",
+//     uid: uuid(),
+//     priority: 0,
+//   },
+//   {
+//     shortname: "Tweet-length",
+//     criteria: "The text is less than 144 characters.",
+//     eval_method: "code",
+//     uid: uuid(),
+//     priority: 0,
+//   },
+//   {
+//     shortname: "Bombastic",
+//     criteria: "The message will drive views because it's controversial.",
+//     eval_method: "expert",
+//     uid: uuid(),
+//     priority: 0,
+//   },
+// ];
+
+// const Contributor = ({
+//   getStateValue,
+//   style = { size: 22, thickness: 4 },
+// }: {
+//   getStateValue: (id: number) => number;
+//   style: { size: number; thickness: number };
+// }) => {
+//   return (
+//     <RingProgress
+//       size={style.size}
+//       thickness={style.thickness}
+//       // label=""
+//       sections={[
+//         {
+//           value: getStateValue(1),
+//           color: "cyan",
+//           tooltip: "You have successfully contributed 7 responses.",
+//         },
+//         {
+//           value: getStateValue(2),
+//           color: "orange",
+//           tooltip: "You have successfully contributed 20 responses.",
+//         },
+//         {
+//           value: getStateValue(3),
+//           color: "green",
+//           tooltip: "You have gone to buffet 100 times.",
+//         },
+//         {
+//           value: getStateValue(4),
+//           color: "grape",
+//           tooltip: "You have made 21 nightmare",
+//         },
+//       ]}
+//     />
+//   );
+// };
+
+// const ThumbUpDownButtons = ({
+//   grade,
+//   onChangeGrade,
+//   getGradeCount,
+// }: {
+//   grade: boolean | undefined;
+//   onChangeGrade: (newGrade: boolean | undefined) => void;
+//   getGradeCount: (grade: boolean | undefined) => number;
+// }) => {
+//   // console.log(
+//   //   "getGradeCount",
+//   //   getGradeCount(true),
+//   //   getGradeCount(false),
+//   //   getGradeCount(undefined),
+//   // );
+//   return (
+//     <>
+//       {/* Thumbs up/down buttons */}
+//       <Button
+//         color={grade === true ? "green" : "gray"}
+//         m={0}
+//         p={0}
+//         variant="subtle"
+//         onClick={() => {
+//           // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
+//           if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
+//         }}
+//       >
+//         <div className="gradeContainer">
+//           <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
+//           <div className="gradeUpCount">{getGradeCount(true)}</div>
+//         </div>
+//       </Button>
+//       <Button
+//         color={grade === false ? "red" : "gray"}
+//         m={0}
+//         p={0}
+//         variant="subtle"
+//         onClick={() => {
+//           // Toggle grade: if already thumbs-down (false), turn 'off' (undefined, for neutral).
+//           if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
+//         }}
+//       >
+//         <div className="gradeContainer">
+//           <IconThumbDown
+//             size="14pt"
+//             fill={grade === false ? "pink" : "white"}
+//           />
+//           <div className="gradeDownCount">{getGradeCount(false)}</div>
+//         </div>
+//       </Button>
+//     </>
+//   );
+// };
+
+// export interface CriteriaCardProps {
+//   criterion: EvalCriteria;
+//   onChange: (changedCriteria: EvalCriteria) => void;
+//   onDelete: () => void;
+//   initiallyOpen?: boolean;
+//   grade: boolean | undefined;
+//   onChangeGrade: (newGrade: boolean | undefined) => void;
+//   getGradeCount: (grade: boolean | undefined) => number;
+//   getStateValue: (stateId: number) => number;
+// }
+
+// const CriteriaCard: React.FC<CriteriaCardProps> = ({
+//   criterion,
+//   onChange,
+//   onDelete,
+//   initiallyOpen,
+//   grade,
+//   getGradeCount,
+//   onChangeGrade,
+//   getStateValue,
+// }) => {
+//   const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
+//   const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
+
+//   return (
+//     <Stack spacing={0} ml={8}>
+//       <Flex align="center">
+//         <Group spacing="0px">
+//           {/* The arrow chevron user can click to collapse/expand
+//             <Button
+//               color="gray"
+//               p={0}
+//               m={0}
+//               variant="subtle"
+//               mr="4px"
+//               onClick={toggle}
+//             >
+//               {opened ? (
+//                 <IconChevronDown size="14pt" />
+//               ) : (
+//                 <IconChevronRight size="14pt" />
+//               )}
+//             </Button> */}
+
+//           {/* Thumbs up/down buttons */}
+//           <ThumbUpDownButtons
+//             grade={grade}
+//             onChangeGrade={onChangeGrade}
+//             getGradeCount={getGradeCount}
+//           />
+
+//           {/* Title of the criteria */}
+//           <TextInput
+//             value={title}
+//             onChange={(e) => setTitle(e.target.value)}
+//             onBlur={(e) => {
+//               criterion.shortname = e.target.value;
+//               if (onChange) onChange(criterion);
+//             }}
+//             placeholder="Criteria name"
+//             variant="unstyled"
+//             size="md"
+//             ml="xs"
+//             className="nodrag nowheel"
+//             styles={{
+//               input: {
+//                 padding: "0px",
+//                 height: "14pt",
+//                 minHeight: "0pt",
+//                 fontWeight: 500,
+//               },
+//             }}
+//           />
+//         </Group>
+
+//         <Group spacing="4px" ml="auto">
+//           {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+//           <Tooltip
+//             label={
+//               criterion.eval_method === "code"
+//                 ? "Change to an LLM evaluator"
+//                 : "Change to a code evaluator"
+//             }
+//             withinPortal
+//             withArrow
+//           >
+//             <Text
+//               color="#999"
+//               size="sm"
+//               mr="6px"
+//               onClick={() => {
+//                 criterion.eval_method =
+//                   criterion.eval_method === "code" ? "expert" : "code";
+//                 if (onChange) onChange(criterion);
+//               }}
+//             >
+//               {criterion.eval_method === "code" ? (
+//                 <Flex style={{ userSelect: "none" }}>
+//                   <IconTerminal2 size="14pt" />
+//                   &nbsp;Python
+//                 </Flex>
+//               ) : (
+//                 <Flex style={{ userSelect: "none" }}>
+//                   <IconRobot size="14pt" />
+//                   &nbsp;LLM
+//                 </Flex>
+//               )}
+//             </Text>
+//           </Tooltip>
+
+//           {/* <Contributor getStateValue={getStateValue} /> */}
+
+//           {/* Delete button (and any other criterion-specific changes in the future) */}
+//           <ActionIcon variant="subtle" color="red" onClick={onDelete}>
+//             <IconTrash style={{ width: rem(16), height: rem(16) }} />
+//           </ActionIcon>
+//         </Group>
+//       </Flex>
+
+//       <Textarea
+//         value={criterion.criteria}
+//         placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
+//         ml={38}
+//         onChange={(e) => {
+//           criterion.criteria = e.target.value;
+//           if (onChange) onChange(criterion);
+//         }}
+//         onClickCapture={(e) => e.stopPropagation()}
+//         styles={{
+//           input: {
+//             border: "none",
+//             borderWidth: "0px",
+//             margin: "0px",
+//             color: "#444",
+//             background: "transparent",
+//             lineHeight: 1.1,
+//           },
+//         }}
+//         autosize
+//         minRows={2}
+//         maxRows={5}
+//         fz="sm"
+//         mb="xs"
+//         c="dimmed"
+//       />
+//     </Stack>
+//   );
+// };
+
+// export interface EvalGenModalRef {
+//   trigger: (
+//     resps: LLMResponse[],
+//     setFinalReports: (reports: EvalGenReport) => void,
+//   ) => void;
+// }
+
+// const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
+//   function EvalGenModal(props, ref) {
+//     const [opened, { open, close }] = useDisclosure(false);
+//     const apiKeys = useStore((state) => state.apiKeys);
+//     const globalState = useStore((store) => store.state);
+//     const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+//     const [criteriaForDisplay, setCriteriaForDisplay] = useState<
+//       EvalCriteria[]
+//     >([]);
+
+//     const [responses, setResponses] = useState<LLMResponse[]>([]);
+//     const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
+//       undefined,
+//     );
+//     const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
+//       [],
+//     );
+//     const [shownResponseIdx, setShownResponseIdx] = useState(0);
+
+//     const [annotation, setAnnotation] = useState<string | undefined>(undefined);
+//     const [holisticGrade, setHolisticGrade] = useState<
+//       "good" | "bad" | undefined
+//     >(undefined);
+
+//     // Per-criteria grades (indexed by uid of response, then uid of criteria)
+//     const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
+//     const setPerCriteriaGrade = (
+//       responseUID: string,
+//       criteriaUID: string,
+//       newGrade: boolean | undefined,
+//     ) => {
+//       setGrades((grades) => {
+//         if (!grades[responseUID]) grades[responseUID] = {};
+//         grades[responseUID][criteriaUID] = newGrade;
+//         // grades[responseUID] = { ...grades[responseUID] };
+//         // console.error("grades-2", grades);
+//         return { ...grades };
+//       });
+//       updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
+//     };
+//     const getStateValue = (stateId: number) => {
+//       return Math.floor(Math.random() * 30 + 6);
+//     };
+//     const getGradeCount = (
+//       // responseUID: string,
+//       criteriaUID: string,
+//       grade: boolean | undefined,
+//     ) => {
+//       // console.log("getGradeCount", responseUID, criteriaUID, grade);
+//       // console.log("getGradeCount", grades);
+
+//       let count = 0;
+//       for (const respUid in grades) {
+//         count += grade === grades[respUid][criteriaUID] ? 1 : 0;
+//       }
+//       return count;
+//     };
+
+//     // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
+//     const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
+//       null,
+//     );
+
+//     const [execProgress, setExecProgress] = useState(0);
+
+//     // State variables to keep track of GPT call counts
+//     const [numGPT4Calls, setNumGPT4Calls] = useState(0);
+//     const [numGPT35Calls, setNumGPT35Calls] = useState(0);
+//     const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
+
+//     // For updating the global human ratings state
+//     const setState = useStore((store) => store.setState);
+//     const updateGlobalRating = useCallback(
+//       (uid: string, label: string, payload: RatingDict) => {
+//         const key = getRatingKeyForResponse(uid, label);
+//         const safe_payload = deepcopy(payload);
+//         setState(key, safe_payload);
+//         StorageCache.store(key, safe_payload);
+//       },
+//       [setState],
+//     );
+
+//     // console.error("criteria", criteria);
+
+//     // Create the executor once criteria exist, or update its criteria; this effect re-runs only when criteria change
+//     React.useEffect(() => {
+//       if (criteria.length > 0 && !executor) {
+//         const existingGrades = transformDict(
+//           globalState,
+//           (key) => key.startsWith("r.") && key.endsWith(".grade"),
+//           extractUIDFromRatingKey,
+//           (_, val) => {
+//             // The grades are in { idx: grade } format. Take only the first,
+//             // as we only take the first response in this iteration of EvalGen:
+//             if (typeof val !== "object") return undefined;
+//             const gs = Object.values(val);
+//             if (gs.length === 0) return undefined;
+//             return gs[0];
+//           },
+//         );
+
+//         const addLog = (message: string) => {
+//           setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
+//         };
+
+//         const ex = new EvaluationFunctionExecutor(
+//           getLikelyPromptTemplateAsContext(responses),
+//           responses,
+//           criteria,
+//           (gpt4Calls, gpt35Calls) => {
+//             // Callback to update GPT call counts
+//             setNumGPT4Calls((num) => num + gpt4Calls);
+//             setNumGPT35Calls((num) => num + gpt35Calls);
+//           },
+//           addLog,
+//           existingGrades,
+//           grades,
+//         );
+//         setExecutor(ex);
+
+//         setExecProgress(0);
+
+//         // ex.start((progress) => {
+//         //   setExecProgress(progress?.success ?? 0);
+//         // });
+//       } else if (executor) {
+//         // Update criteria in executor
+//         executor.updateCriteria(criteria);
+//       }
+
+//       updateCriteriaForDisplay();
+//     }, [criteria]);
+
+//     const generateCriteria = (resps) => {
+//       // Create criteria
+//       // setIsLoadingCriteria((num) => num + 3);
+//       genCriteriaFromContext(resps)
+//         .then((crits) => {
+//           console.log("crits #1", crits);
+//           crits = [...criteria, ...crits];
+//           console.log("crits #2", crits);
+//           setCriteria(crits.map((c) => ({ ...c, uid: uuid() })));
+//         })
+//         .catch((err) => {
+//           console.error(err);
+//         })
+//         .finally(() => {
+//           setIsLoadingCriteria((num) => num - 3);
+//           setNumGPT4Calls((num) => num + 1);
+//         });
+//     };
+
+//     // const defaultOnFinish = (reports: string) => {};
+//     const [onFinish, setOnFinish] = useState({
+//       setFinalRpts: (reports: EvalGenReport) => {
+//         // console.log("");
+//       },
+//     });
+
+//     // Open the EvalGen wizard
+//     const trigger = (
+//       resps: LLMResponse[],
+//       setFinalReports: (reports: EvalGenReport) => void,
+//     ) => {
+//       // We pass the responses here manually to ensure they remain the same
+//       // for the duration of one EvalGen operation.
+//       setResponses(resps);
+//       gotoNextScreen("response");
+//       // setFinalReports("A plenty response");
+//       setOnFinish({
+//         setFinalRpts: (reports: EvalGenReport) => {
+//           close();
+//           setFinalReports(reports);
+//         },
+//       });
+
+//       const firstGrades = resps.reduce(
+//         (acc: Dict<Dict<boolean | undefined>>, curr) => {
+//           if (!(curr.uid in acc)) acc[curr.uid] = {};
+//           return acc;
+//         },
+//         grades,
+//       );
+//       setGrades(firstGrades);
+
+//       console.log("*****************************resps", resps);
+//       if (criteria && criteria.length === 0) {
+//         generateCriteria(resps);
+//       }
+
+//       setShownResponseIdx(0);
+//       if (resps.length > 0) {
+//         const first_resp = sampleRandomElements(resps, 1)[0];
+//         // setShownResponse(first_resp);
+//         setPastShownResponses([first_resp]);
+//       } else {
+//         // setShownResponse(undefined);
+//         setPastShownResponses([]);
+//       }
+//       setShownResponse(resps[shownResponseIdx]);
+//       open();
+//     };
+//     useImperativeHandle(ref, () => ({
+//       trigger,
+//     }));
+
+//     const getLikelyPromptTemplateAsContext = (resps) => {
+//       // Attempt to infer the prompt template used to generate the responses:
+//       const prompts = new Set();
+//       for (const resp_obj of resps) {
+//         if (resp_obj?.metavars?.__pt !== undefined) {
+//           prompts.add(resp_obj.metavars.__pt);
+//         }
+//       }
+
+//       if (prompts.size === 0) return null;
+
+//       // Use the first prompt template collected (Set insertion order) to serve as context:
+//       return escapeBraces(prompts.values().next().value);
+//     };
+
+//     async function genCriteriaFromContext(responses) {
+//       // Get the context from the input responses
+//       const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
+
+//       if (inputPromptTemplate === null) {
+//         console.error("No context found. Cannot proceed.");
+//         return;
+//       }
+
+//       // Attempt to generate criteria using an LLM
+//       return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
+//     }
+
+//     // Add a criterion
+//     const handleAddCriteria = (newCrit: EvalCriteria) => {
+//       setCriteria((cs) => {
+//         if (!newCrit.uid) newCrit.uid = uuid();
+//         return [...cs, newCrit];
+//       });
+//     };
+
+//     // Modify an existing criterion
+//     const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
+//       setCriteria((cs) => {
+//         const idx = cs.findIndex((c) => c.uid === uid);
+//         if (idx === -1) {
+//           console.error("Could not find criteria with uid", uid);
+//           return cs;
+//         }
+//         cs[idx] = newCrit;
+//         return [...cs];
+//       });
+//     };
+
+//     // Delete a criterion
+//     const handleDeleteCriteria = (uid: string) => {
+//       setCriteria((cs) => {
+//         return cs.filter((c) => c.uid !== uid);
+//       });
+//     };
+
+//     // Synthesize a new criteria according to the feedback given for the shown response
+//     const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
+//     const synthNewCriteriaWithLLM = (
+//       response: string,
+//       feedback: string,
+//       grade: "good" | "bad" | "unknown",
+//     ) => {
+//       // Add a loading Skeleton
+//       setIsLoadingCriteria((num) => num + 1);
+//       // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
+//       const prettyCriteria = criteria
+//         .map((crit) => {
+//           return `${crit.shortname}: ${crit.criteria}`;
+//         })
+//         .join("\n");
+
+//       generateLLMEvaluationCriteria(
+//         "",
+//         apiKeys,
+//         `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below.
+
+// TEXT OUTPUT:
+// \`\`\`
+// ${response}
+// \`\`\`
+
+// EXISTING CRITERIA:
+// \`\`\`
+// ${prettyCriteria}
+// \`\`\`
+
+// GRADE (whether text was good or bad):
+// \`\`\`
+// ${grade}
+// \`\`\`
+
+// FEEDBACK:
+// \`\`\`
+// ${feedback}
+// \`\`\`
+
+// If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
+//         "gpt-4o", // llm
+//       )
+//         .then((evalCrits) => {
+//           // Take only the first if evalCrits has a nonempty list
+//           if (evalCrits[0]) {
+//             setCriteria((crit) =>
+//               crit.concat([
+//                 {
+//                   ...evalCrits[0],
+//                   uid: uuid(),
+//                 },
+//               ]),
+//             );
+//           }
+//           // Remove a loading Skeleton
+//           setIsLoadingCriteria((num) => num - 1);
+
+//           setNumGPT4Calls((num) => num + 1);
+//         })
+//         .catch((err) => {
+//           console.error(err);
+//           setIsLoadingCriteria((num) => num - 1);
+//         });
+//     };
+
+//     // Goto next response in the queue (skipping grading the current one)
+//     const nextResponse = () => {
+//       if (responses.length === 0) return;
+
+//       // Update annotation for current response (if any)
+//       // TODO: Fix this for generate case when num resp per prompt > 1
+
+//       if (
+//         grades[shownResponse.uid] ||
+//         holisticGrade ||
+//         (annotation && annotation.trim())
+//       ) {
+//         executor?.setGradeForExample(
+//           shownResponse.uid,
+//           grades[shownResponse.uid],
+//           holisticGrade,
+//           annotation ? annotation.trim() : null,
+//         );
+//       }
+
+//       if (
+//         shownResponse &&
+//         annotation &&
+//         typeof annotation === "string" &&
+//         annotation.trim().length > 0
+//       ) {
+//         console.log(
+//           "setting annotation for resp",
+//           shownResponse.uid,
+//           annotation,
+//         );
+//         updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
+//         setAnnotation("");
+//       }
+
+//       if (shownResponse && holisticGrade) {
+//         updateGlobalRating(shownResponse.uid, "grade", {
+//           0: holisticGrade === "good",
+//         });
+//       }
+
+//       if (shownResponse && grades[shownResponse.uid]) {
+//         updateGlobalRating(
+//           shownResponse.uid,
+//           "perCriteriaGrades",
+//           grades[shownResponse.uid],
+//         );
+//       }
+
+//       // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
+//       setHolisticGrade(null);
+
+//       if (shownResponseIdx < pastShownResponses.length - 1) {
+//         // If we are not at the end of the history of shown responses, then show the next response:
+//         setShownResponse(pastShownResponses[shownResponseIdx + 1]);
+//         setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
+//       } else {
+//         // We are at the end of the history; pick the next response off the stack:
+//         // TODO: Make this unique (maybe by removing picked responses from the list!)
+//         let num_tries = 3;
+//         let next_resp = executor?.getNextExampleToGrade();
+//         while (
+//           num_tries > 0 &&
+//           (!next_resp ||
+//             pastShownResponses.some((r) => r.uid === next_resp?.uid))
+//         ) {
+//           // We're presenting a response that's already been shown. Try again.
+//           // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
+//           if (next_resp && num_tries === 3)
+//             next_resp =
+//               executor?.getNextExampleToGrade() ??
+//               sampleRandomElements(responses, 1)[0];
+//           // Otherwise we just choose a response at random:
+//           else next_resp = sampleRandomElements(responses, 1)[0];
+//           num_tries -= 1;
+//         }
+//         // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
+//         // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
+//         setShownResponse(next_resp ?? undefined);
+//         if (next_resp)
+//           setPastShownResponses(pastShownResponses.concat(next_resp));
+//         setShownResponseIdx(pastShownResponses.length);
+//       }
+//       updateShownResponseUniqueIndex();
+//     };
+
+//     // Go back to previously shown response
+//     const prevResponse = () => {
+//       if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
+//       setShownResponse(pastShownResponses[shownResponseIdx - 1]);
+//       setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
+//       updateShownResponseUniqueIndex();
+//     };
+
+//     const updateShownResponseUniqueIndex = () => {
+//       let idx = 0;
+//       for (const resp of responses) {
+//         if (resp === shownResponse) {
+//           setShownResponseUniqueIdx(idx);
+//           break;
+//         }
+//         idx++;
+//       }
+//     };
+
+//     const nextResponse2 = () => {
+//       if (responses.length === 0) return;
+//       if (shownResponseIdx < responses.length - 1) {
+//         // setShownResponse(responses[shownResponseIdx + 1]);
+//         setShownResponseIdx(shownResponseIdx + 1);
+//       }
+//     };
+
+//     const prevResponse2 = () => {
+//       if (shownResponseIdx > 0) {
+//         // setShownResponse(responses[shownResponseIdx - 1]);
+//         setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
+//       }
+//     };
+
+//     React.useEffect(() => {
+//       setShownResponse(responses[shownResponseIdx]);
+//     }, [shownResponseIdx]);
+
+//     const estimateGPTCalls = () => {
+//       return executor
+//         ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
+//         : "# estimated GPT calls not available.";
+//     };
+
+//     const updateCriteriaForDisplay = () => {
+//       const highCriteria = criteria.filter((c) => c.priority === 1);
+//       const lowCriteria = criteria.filter((c) => c.priority === 0);
+//       setCriteriaForDisplay(highCriteria.concat(lowCriteria));
+//     };
+//     useEffect(() => {
+//       const highCriteria = criteria.filter((c) => c.priority === 1);
+//       const lowCriteria = criteria.filter((c) => c.priority === 0);
+//       setCriteriaForDisplay(highCriteria.concat(lowCriteria));
+//     }, [criteria]);
+
+//     const [screen, setScreen] = useState("");
+//     const gotoNextScreen = (screenName: string) => {
+//       setScreen(screenName);
+//     };
+
+//     // const [onFinish, setOnFinish] = useState(null);
+
+//     return (
+//       <Modal
+//         size="95%"
+//         keepMounted
+//         opened={opened}
+//         onClose={close}
+//         closeOnClickOutside={true}
+//         style={{ position: "relative", left: "-5%" }}
+//       >
+//         {screen === "response" && (
+//           <Grid h={window?.innerHeight * 0.8}>
+//             <Grid.Col span={8}>
+//               <Stack justify="space-between">
+//                 {/* View showing the response the user is currently grading */}
+//                 <GradingView
+//                   shownResponse={shownResponse}
+//                   shownResponseIdx={shownResponseIdx}
+//                   // shownResponseIdx={shownResponseUniqueIdx}
+//                   responseCount={responses.length}
+//                   numGPT4Calls={numGPT4Calls}
+//                   numGPT35Calls={numGPT35Calls}
+//                   logs={logs}
+//                   gotoNextResponse={nextResponse2}
+//                   gotoPrevResponse={prevResponse2}
+//                   estimateGPTCalls={estimateGPTCalls}
+//                   gotoNextScreen={gotoNextScreen}
+//                 />
+
+//                 {/* Progress bar */}
+//                 {/* <Flex justify="left" align="center" gap="md">
+//                 <Stack w="100%" spacing={4}>
+//                   <Text color="#aaa" size="sm">
+//                     {bottomBar.progressLabel}
+//                   </Text>
+//                   <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
+//                 </Stack>
+
+//                 <Button
+//                   onClick={handleDone}
+//                   variant={bottomBar.buttonStyle}
+//                   disabled={bottomBar.buttonDisabled}
+//                 >
+//                   {bottomBar.buttonLabel}
+//                 </Button>
+//               </Flex> */}
+//               </Stack>
+//             </Grid.Col>
+//             <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
+//               <Center>
+//                 <Title order={3} ml={8} mt="sm" mb="md">
+//                   Rubric
+//                 </Title>
+//               </Center>
+
+//               <div
+//                 style={{
+//                   display: "flex",
+//                   flexDirection: "column",
+//                 }}
+//               >
+//                 <div style={{ flex: 2, overflowY: "auto" }}>
+//                   {criteriaForDisplay.map((e) => (
+//                     <CriteriaCard
+//                       criterion={e}
+//                       key={e.uid}
+//                       onChange={(newCrit) =>
+//                         handleChangeCriteria(newCrit, e.uid)
+//                       }
+//                       onDelete={() => handleDeleteCriteria(e.uid)}
+//                       grade={
+//                         shownResponse
+//                           ? grades[shownResponse.uid][e.uid]
+//                           : undefined
+//                       }
+//                       getGradeCount={(grade) => {
+//                         return shownResponse
+//                           ? getGradeCount(
+//                               // shownResponse.uid,
+//                               e.uid,
+//                               grade,
+//                             )
+//                           : 0;
+//                       }}
+//                       onChangeGrade={(newGrade) => {
+//                         if (shownResponse)
+//                           setPerCriteriaGrade(
+//                             shownResponse.uid,
+//                             e.uid,
+//                             newGrade,
+//                           );
+//                       }}
+//                       initiallyOpen={true}
+//                       getStateValue={(stateId) => getStateValue(stateId)}
+//                     />
+//                   ))}
+//                   {isLoadingCriteria > 0 ? (
+//                     Array.from(
+//                       { length: isLoadingCriteria },
+//                       (v: unknown, idx: number) => (
+//                         <Skeleton key={idx} h={80} mb={4} />
+//                       ),
+//                     )
+//                   ) : (
+//                     <></>
+//                   )}
+
+//                   <div className="criteriaButtons">
+//                     <Button
+//                       leftIcon={<IconPencil size={14} />}
+//                       variant="subtle"
+//                       color="gray"
+//                       // gradient={{ from: "blue", to: "green", deg: 90 }}
+//                       onClick={() => {
+//                         handleAddCriteria({
+//                           shortname: "New Criteria",
+//                           criteria: "",
+//                           eval_method: "code",
+//                           priority: 0,
+//                           uid: uuid(),
+//                         });
+//                       }}
+//                     >
+//                       Add a new criteria
+//                     </Button>
+//                     <Button
+//                       leftIcon={<IconSparkles size={14} />}
+//                       variant="subtle"
+//                       color="gray"
+//                       // gradient={{ from: "blue", to: "green", deg: 90 }}
+//                       onClick={() => {
+//                         generateCriteria(responses);
+//                       }}
+//                     >
+//                       Suggest Criteria
+//                     </Button>
+//                   </div>
+//                 </div>
+
+//                 <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
+//                   <Divider mt="lg" />
+//                   <Title mb="0px" order={4}>
+//                     Suggest New Criteria Based on the Feedback
+//                   </Title>
+//                   <Textarea
+//                     value={annotation}
+//                     onChange={(e) => setAnnotation(e.target.value)}
+//                     description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
+//                     mb="sm"
+//                   />
+//                   <Radio.Group
+//                     name="favoriteFramework"
+//                     label="Rate the response holistically:"
+//                     value={holisticGrade}
+//                     onChange={(v) => setHolisticGrade(v as "good" | "bad")}
+//                     withAsterisk
+//                     mb="md"
+//                   >
+//                     <Group mt="xs">
+//                       <Radio value="good" label="Good" />
+//                       <Radio value="bad" label="Bad" />
+//                       <span>
+//                         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
+//                       </span>
+//                       <Button
+//                         color="green"
+//                         variant="filled"
+//                         disabled={
+//                           !holisticGrade ||
+//                           annotation === undefined ||
+//                           annotation.length === 0
+//                         }
+//                         onClick={() => {
+//                           synthNewCriteriaWithLLM(
+//                             shownResponse?.responses[0].toString() ?? "",
+//                             annotation ?? "",
+//                             holisticGrade ?? "unknown",
+//                           );
+
+//                           nextResponse();
+//                         }}
+//                       >
+//                         + Submit Feedback
+//                       </Button>
+//                     </Group>
+//                   </Radio.Group>
+//                 </Stack>
+//               </div>
+//             </Grid.Col>
+//           </Grid>
+//         )}
+//         {screen === "report" && (
+//           <Grid>
+//             <ReportCardView
+//               report={{
+//                 criteria: criteria,
+//                 failureCoverage: 99.2,
+//                 falseFailureRate: 66.7,
+//               }}
+//               onFinish={(reports: EvalGenReport) => {
+//                 onFinish.setFinalRpts(reports);
+//               }}
+//               getGradeCount={(crit: EvalCriteria, grade: boolean) => {
+//                 return shownResponse
+//                   ? getGradeCount(
+//                       // shownResponse.uid,
+//                       crit.uid,
+//                       grade,
+//                     )
+//                   : 0;
+//               }}
+//               getStateValue={(stateId) => getStateValue(stateId)}
+//             />
+//           </Grid>
+//         )}
+//       </Modal>
+//     );
+//   },
+// );
+
+// const HeaderText = ({ children }: { children: ReactNode }) => {
+//   return (
+//     <Text size="xl" fw={500} pl="sm" mb="lg">
+//       {children}
+//     </Text>
+//   );
+// };
+
+// interface GradingViewProps {
+//   shownResponse: LLMResponse | undefined;
+//   shownResponseIdx: number;
+//   responseCount: number;
+//   numGPT4Calls: number;
+//   numGPT35Calls: number;
+//   logs: { date: Date; message: string }[];
+//   gotoPrevResponse: () => void;
+//   gotoNextResponse: () => void;
+//   estimateGPTCalls: () => string;
+//   gotoNextScreen: (screenName: string) => void;
+// }
+
+// const GradingView: React.FC<GradingViewProps> = ({
+//   shownResponse,
+//   shownResponseIdx,
+//   responseCount,
+//   numGPT4Calls,
+//   numGPT35Calls,
+//   logs,
+//   gotoPrevResponse,
+//   gotoNextResponse,
+//   estimateGPTCalls,
+//   gotoNextScreen,
+// }) => {
+//   // Calculate inner values only when shownResponse changes
+//   const responseText = useMemo(
+//     () =>
+//       shownResponse && shownResponse.responses?.length > 0
+//         ? shownResponse.responses[0].toString()
+//         : "",
+//     [shownResponse],
+//   );
+
+//   const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
+//   const varsDivs = useMemo(() => {
+//     const combined_vars_metavars = shownResponse
+//       ? {
+//           ...shownResponse.vars,
+//           ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
+//         }
+//       : {};
+
+//     // console.log("**************shownResponse", shownResponse);
+//     return Object.entries(combined_vars_metavars).map(([varname, val]) => (
+//       <div key={varname} className="grade-resp-var-container">
+//         <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
+//         <span className="response-var-value linebreaks">{val}</span>
+//       </div>
+//     ));
+//   }, [shownResponse]);
+
+//   // const [shownResponseIdx, setShownResponseIdx] = useState(0);
+//   // const [shownResponses, setShownResponses] = useState<LLMResponse[]>([]);
+//   // React.useEffect(() => {
+//   //   console.error("current response", shownResponse);
+//   //   if (shownResponse && !shownResponses.includes(shownResponse)) {
+//   //     shownResponses.push(shownResponse);
+//   //     setShownResponses(shownResponses);
+//   //     setShownResponseIdx(shownResponses.length - 1);
+//   //     console.error("current response is saved.", shownResponses.length);
+//   //   } else {
+//   //     console.error("current response already saved.");
+//   //     for (const [idx, resp] of shownResponses.entries()) {
+//   //       if (shownResponse === resp) {
+//   //         setShownResponseIdx(idx);
+//   //         break;
+//   //       }
+//   //     }
+//   //   }
+//   // }, [shownResponse]);
+
+//   return (
+//     <Stack justify="space-between" mih={500}>
+//       <Box>
+//         {/* Top header */}
+//         <Flex justify="center">
+//           <HeaderText>
+//             {/* What do you think of this response? */}
+//             What do you think of response #{shownResponseIdx + 1} of{" "}
+//             {responseCount}?
+//           </HeaderText>
+//         </Flex>
+//         {/* Middle response box with chevron buttons < and > for going back and forward a response */}
+//         <Flex justify="center" align="center" mb="sm">
+//           {/* Go back to previous response */}
+//           <Button variant="white" color="dark" onClick={gotoPrevResponse}>
+//             <IconChevronLeft />
+//           </Button>
+
+//           {/* The response one is currently grading */}
+//           <div
+//             className="response-box"
+//             style={{
+//               backgroundColor: "#eee",
+//               width: "80%",
+//               maxHeight: "340px",
+//               overflowY: "scroll",
+//               borderColor: "black",
+//               borderStyle: "solid",
+//             }}
+//           >
+//             <div className="response-item-llm-name-wrapper">
+//               <div
+//                 className="small-response"
+//                 style={{ fontSize: "11pt", padding: "12pt" }}
+//               >
+//                 {responseText}
+//               </div>
+//             </div>
+//           </div>
+
+//           {/* Go forward to the next response */}
+//           <Tooltip label={estimateGPTCalls()} withArrow>
+//             <Button variant="white" color="dark" onClick={gotoNextResponse}>
+//               <IconChevronRight />
+//             </Button>
+//           </Tooltip>
+//         </Flex>
+//         {/* Views for the vars (inputs) that generated this response, and the concrete prompt */}
+//         <Flex justify="center" mb="xl" gap="lg">
+//           <div
+//             style={{
+//               backgroundColor: "#fff",
+//               padding: "12px",
+//               width: "31%",
+//               borderRadius: "12px",
+//               borderWidth: "1px",
+//               borderStyle: "solid",
+//             }}
+//           >
+//             Vars
+//             <hr />
+//             <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
+//               {varsDivs}
+//             </div>
+//           </div>
+//           <div
+//             style={{
+//               backgroundColor: "#fff",
+//               padding: "12px",
+//               width: "41%",
+//               borderRadius: "2px",
+//             }}
+//           >
+//             Prompt
+//             <hr />
+//             <div
+//               className="monofont linebreaks"
+//               style={{
+//                 maxHeight: "160px",
+//                 overflowY: "scroll",
+//                 fontSize: "10pt",
+//                 lineHeight: "1.2",
+//               }}
+//             >
+//               {prompt}
+//             </div>
+//           </div>
+//         </Flex>
+//         <Flex direction="column">
+//           <Flex justify="space-between" align="center">
+//             <Text size="lg" weight={500} mb="sm">
+//               LLM Activity
+//             </Text>
+//             {/* GPT Call Tally */}
+//             <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+//               Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
+//               GPT-3.5-Turbo-16k calls.
+//             </Text>
+//           </Flex>
+//           <div
+//             style={{
+//               backgroundColor: "#f0f0f0",
+//               color: "#333",
+//               fontFamily: "monospace",
+//               padding: "12px",
+//               width: "calc(100% - 30px)",
+//               height: "200px",
+//               overflowY: "auto",
+//               borderRadius: "8px",
+//               border: "1px solid #ddd",
+//               marginRight: "20px", // Space on the right
+//             }}
+//             ref={(el) => {
+//               if (el) {
+//                 el.scrollTop = el.scrollHeight;
+//               }
+//             }}
+//           >
+//             {logs.map((log, index) => (
+//               <div key={index}>
+//                 <span style={{ color: "#4A90E2" }}>
+//                   {log.date.toLocaleString()} -{" "}
+//                 </span>
+//                 <span>{log.message}</span>
+//               </div>
+//             ))}
+//           </div>
+//         </Flex>
+//       </Box>
+//       <div>
+//         <Center>
+//           <Button
+//             leftIcon={<IconSparkles size={14} />}
+//             variant="gradient"
+//             gradient={{ from: "blue", to: "green", deg: 45 }}
+//             onClick={() => {
+//               // console.log("(3) gotoNextScreen", gotoNextScreen);
+//               gotoNextScreen("report");
+//             }}
+//           >
+//             I&apos;m done. Access EvalGen Report!
+//           </Button>
+//         </Center>
+//       </div>
+//     </Stack>
+//   );
+// };
+
+// interface ReportCardViewProps {
+//   report: EvalGenReport;
+//   // recomputeAlignment,
+//   onFinish: (reports: EvalGenReport) => void;
+//   getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
+//   getStateValue: (stateId: number) => number;
+// }
+
+// // const ReportCardScreen = () => {
+// const ReportCardView: React.FC<ReportCardViewProps> = ({
+//   report,
+//   // recomputeAlignment,
+//   onFinish,
+//   getGradeCount,
+//   getStateValue,
+// }) => {
+//   // The criteria cards, now with report information
+
+//   const [finalReport, setFinalReport] = useState(report);
+
+//   const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
+//     if (isSelected) {
+//       finalReport.criteria.push(criterion);
+//     } else {
+//       finalReport.criteria = finalReport.criteria.filter(
+//         (c) => c !== criterion,
+//       );
+//     }
+//     setFinalReport(finalReport);
+//   };
+//   const cards = useMemo(() => {
+//     const res = [];
+
+//     // Iterate through selected eval functions and create cards
+//     // for (const selectedFunc of report.selectedEvalFunctions) {
+//     //   const crit = selectedFunc.evalCriteria;
+//     //   // Find corresponding report in allEvalFunctionReports map from criteria to list
+//     //   const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
+//     //   const evalFuncReport = critEvalFuncReports.find(
+//     //     (rep) => rep.evalFunction === selectedFunc,
+//     //   );
+
+//     //   // Get the functions that were not selected for this criteria
+//     //   const otherFuncs = critEvalFuncReports.filter(
+//     //     (rep) => rep.evalFunction !== selectedFunc,
+//     //   );
+//     for (const crit of report.criteria) {
+//       res.push(
+//         <ReportCriteriaCard
+//           criterion={crit}
+//           key={crit.uid}
+//           // onCheck={(checked) => {
+//           //   crit.selected = checked;
+//           //   recomputeAlignment();
+//           // }}
+//           getGradeCount={getGradeCount}
+//           getStateValue={getStateValue}
+//           onSelect={onSelect}
+//         />,
+//       );
+//     }
+
+//     return res;
+//   }, [report]);
+
+//   return (
+//     report && (
+//       <div>
+//         <Text align="center" size="lg" pl="sm" mb="lg">
+//           Chosen Functions and Alignment
+//         </Text>
+
+//         {/* Show coverage and false failure rate numbers */}
+//         <Flex justify="center" gap="md" mb="lg">
+//           <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
+//             <Card
+//               shadow="sm"
+//               padding="md"
+//               radius="md"
+//               style={{ backgroundColor: "#f0f0f0" }}
+//             >
+//               <Text weight={500} size="md">
+//                 Coverage of Bad Responses
+//               </Text>
+//               <Text color="blue" weight={700} size="md">
+//                 {report.failureCoverage.toFixed(2)}%
+//               </Text>
+//             </Card>
+//             <Card
+//               shadow="sm"
+//               padding="md"
+//               radius="md"
+//               style={{ backgroundColor: "#f0f0f0" }}
+//             >
+//               <Text weight={500} size="md">
+//                 False Failure Rate
+//               </Text>
+//               <Text color="red" weight={700} size="md">
+//                 {report.falseFailureRate.toFixed(2)}%
+//               </Text>
+//             </Card>
+//           </Group>
+//         </Flex>
+
+//         <ScrollArea mih={300} h={500} mah={500}>
+//           <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
+//             {cards}
+//           </SimpleGrid>
+//         </ScrollArea>
+
+//         <Flex justify="center" gap={12} mt="xs">
+//           <Button
+//             onClick={() => {
+//               // console.log("finalReport", finalReport);
+//               onFinish(finalReport);
+//             }}
+//           >
+//             Finish with selected evaluators
+//           </Button>
+//         </Flex>
+//       </div>
+//     )
+//   );
+// };
+
+// interface ReportCriteriaCardProps {
+//   criterion: EvalCriteria;
+//   // onChange: (changedCriteria: EvalCriteria) => void;
+//   // onDelete: () => void;
+//   // initiallyOpen?: boolean;
+//   // grade: boolean | undefined;
+//   // onChangeGrade: (newGrade: boolean | undefined) => void;
+//   getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
+//   getStateValue: (stateId: number) => number;
+//   onSelect: (criterion: EvalCriteria, isChecked: boolean) => void;
+// }
+
+// const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
+//   criterion,
+//   // onChange,
+//   // onDelete,
+//   // initiallyOpen,
+//   // grade,
+//   getGradeCount,
+//   // onChangeGrade,
+//   getStateValue,
+//   onSelect,
+// }) => {
+//   // const [opened, { toggle }] = useDisclosure(true);
+//   // const [title, setTitle] = useState(criterion.shortname);
+//   const [checked, setChecked] = useState(true);
+
+//   // Simulates eval functions that are expected to be passed in later on (TODO)
+//   const evalFuncs = [
+//     { evalFunction: { code: "To be provided (1) ..." } },
+//     { evalFunction: { code: "To be provided (2) ..." } },
+//     { evalFunction: { code: "To be provided (3) ..." } },
+//   ];
+//   const unselectedImplementations = evalFuncs.map((item) => (
+//     <div key={uuid()}>
+//       <Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
+//         {item.evalFunction.code}
+//       </Code>
+//       <Divider />
+//     </div>
+//   ));
+
+//   return (
+//     // <Card withBorder mb={4} radius="md" style={{ cursor: "default" }}>
+//     <Card
+//       shadow="sm"
+//       padding="sm"
+//       pl="md"
+//       pb="xl"
+//       radius="md"
+//       withBorder
+//       style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
+//     >
+//       <div
+//         // onClick={() => setChecked(!checked)}
+//         onKeyUp={(e) => e.preventDefault()}
+//         className="checkcard"
+//       >
+//         {/* <Card.Section withBorder pl="8px">
+//           <Flex align="center">
+//             <Group spacing="0px"> */}
+//         {/* The arrow chevron user can click to collapse/expand */}
+//         {/* <Button
+//                 color="gray"
+//                 p={0}
+//                 m={0}
+//                 variant="subtle"
+//                 mr="4px"
+//                 onClick={toggle}
+//               >
+//                 {opened ? (
+//                   <IconChevronDown size="14pt" />
+//                 ) : (
+//                   <IconChevronRight size="14pt" />
+//                 )}
+//               </Button> */}
+
+//         <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
+//           <Checkbox
+//             checked={checked}
+//             onChange={() => {
+//               setChecked(!checked);
+//               if (onSelect) onSelect(criterion, !checked);
+//             }}
+//             tabIndex={-1}
+//             size="xs"
+//             mr="sm"
+//             mt="xs"
+//             styles={{ input: { cursor: "pointer" } }}
+//             aria-hidden
+//           />
+//         </Tooltip>
+
+//         {/* Thumbs up/down buttons - disable for now */}
+//         {/* <ReadOnlyThumbUpDownButtons
+//                   upCount={getGradeCount(criterion, true)}
+//                   downCount={getGradeCount(criterion, false)}
+//                 /> */}
+
+//         <div style={{ width: "100%" }}>
+//           {/* Title of the criteria */}
+//           <TextInput
+//             value={criterion.shortname}
+//             // placeholder="Criteria name"
+//             readOnly
+//             variant="unstyled"
+//             size="sm"
+//             ml="xs"
+//             className="nodrag nowheel"
+//             styles={{
+//               input: {
+//                 border: "none",
+//                 borderWidth: "0px",
+//                 padding: "0px",
+//                 background: "transparent",
+//                 fontWeight: 500,
+//                 fontSize: "12pt",
+//                 margin: "0px",
+//                 height: "auto",
+//                 minHeight: "auto",
+//               },
+//             }}
+//           />
+//           {/* </Group> */}
+
+//           {/* <Group spacing="4px" ml="auto"> */}
+
+//           {/* <Button
+//                   color={criterion.priority <= 0 ? "gray" : "red"}
+//                   m={0}
+//                   p={0}
+//                   variant="subtle"
+//                 >
+//                   <IconFlagFilled size="14pt" />
+//                 </Button> */}
+//           {/* </Group>
+//             </Flex>
+//           </Card.Section> */}
+
+//           {/* Description of the criteria */}
+//           {/* <Card.Section p="0px"> */}
+//           {/* <Collapse in={opened}> */}
+//           <Textarea
+//             value={criterion.criteria}
+//             // placeholder="Describe here."
+//             readOnly
+//             // onClickCapture={(e) => e.stopPropagation()}
+//             styles={{
+//               input: {
+//                 border: "none",
+//                 borderWidth: "0px",
+//                 paddingTop: "0px !important",
+//                 paddingLeft: "0px",
+//                 margin: "0px",
+//                 color: "#444",
+//                 background: "transparent",
+//                 lineHeight: 1.1,
+//               },
+//             }}
+//             autosize
+//             minRows={2}
+//             maxRows={5}
+//             fz="sm"
+//             mb="xs"
+//             c="dimmed"
+//           />
+
+//           {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
+//           <Text color="#999" size="sm" mr="6px">
+//             {criterion.eval_method === "code" ? (
+//               <Flex style={{ userSelect: "none" }}>
+//                 <IconTerminal2 size="14pt" />
+//                 &nbsp;Python
+//               </Flex>
+//             ) : (
+//               <Flex style={{ userSelect: "none" }}>
+//                 <IconRobot size="14pt" />
+//                 &nbsp;LLM
+//               </Flex>
+//             )}
+//           </Text>
+//         </div>
+//         <Stack spacing={0}>
+//           <Contributor
+//             getStateValue={getStateValue}
+//             style={{ size: 90, thickness: 12 }}
+//           />
+//           <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
+//             Alignment with your grades
+//           </Text>
+//         </Stack>
+//       </div>
+//       {/* </Collapse> */}
+//       {/* </Card.Section> */}
+//       <div>
+//         <Accordion>
+//           <Accordion.Item
+//             key={"Show Bad Implementations"}
+//             value={"Show Bad Implementations"}
+//           >
+//             <Accordion.Control>
+//               <Text size="sm"> Show Bad Implementations </Text>
+//             </Accordion.Control>
+//             <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
+//           </Accordion.Item>
+//         </Accordion>
+//       </div>
+//     </Card>
+//   );
+// };
+
+// const ReadOnlyThumbUpDownButtons = ({
+//   upCount,
+//   downCount,
+// }: {
+//   upCount: number;
+//   downCount: number;
+//   // grade: boolean | undefined;
+//   // onChangeGrade: (newGrade: boolean | undefined) => void;
+//   // getGradeCount: (grade: boolean | undefined) => number;
+// }) => {
+//   return (
+//     <>
+//       {/* Thumbs up/down buttons */}
+//       <Button color={"green"} m={0} p={0} variant="subtle">
+//         <div className="gradeContainer">
+//           <IconThumbUp size="14pt" fill={"#aea"} />
+//           <div className="gradeUpCount">{upCount}</div>
+//         </div>
+//       </Button>
+//       <Button color={"red"} m={0} p={0} variant="subtle">
+//         <div className="gradeContainer">
+//           <IconThumbDown size="14pt" fill={"pink"} />
+//           <div className="gradeDownCount">{downCount}</div>
+//         </div>
+//       </Button>
+//     </>
+//   );
+// };
+
+// // export default { EvalGenModal, ReportCardScreen };
+// export default EvalGenModal;
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 068745e76..b6961828b 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -444,7 +444,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
           crit.shortname,
           "python",
           {
-            code: "def evaluate(r):\n\treturn len(r.text)", // to be populated once python code is implemented for the criteria
+            code: crit.criteria,
             sandbox: true,
           },
           false,
@@ -455,8 +455,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
           crit.shortname,
           "llm",
           {
-            // to be populated once LLM code is implemented for the criteria
-            prompt: "",
+            prompt: crit.criteria,
             format: "bin",
           },
           false,
@@ -467,7 +466,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
           crit.shortname,
           "javascript",
           {
-            code: "function evaluate(r) {\n\treturn r.text.length;\n}", // to be populated once javascript code is implemented for the criteria
+            code: crit.criteria,
           },
           false,
         );
@@ -726,7 +725,8 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   }, []);
   const handleEvalGenComplete = (evaluationData: EvalGenReport) => {
     console.log("Evaluation wizard completed with data:", evaluationData);
-    // Do something with the evaluation implementations
+    onFinalReportsReady(evaluationData);
+    setEvalGenOpened(false);
   };
 
   return (
@@ -940,8 +940,8 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             ) : (
               <></>
             )} */}
-            <Menu.Divider />
-            {EVALUATOR_PRESETS.map((category, idx) => (
+            {/* <Menu.Divider /> */}
+            {/* {EVALUATOR_PRESETS.map((category, idx) => (
               <React.Fragment key={category.label}>
                 {idx > 0 && <Menu.Divider />}
                 <Menu.Label>{category.label}</Menu.Label>
@@ -968,7 +968,7 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
                   </Menu.Item>
                 ))}
               </React.Fragment>
-            ))}
+            ))} */}
           </Menu.Dropdown>
         </Menu>
       </div>
@@ -981,7 +981,12 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             position="bottom"
             withArrow
           >
-            <Button onClick={openEvalGen} variant="outline" size="xs">
+            <Button
+              onClick={openEvalGen}
+              variant="filled"
+              color="violet"
+              size="xs"
+            >
               <IconSparkles size="11pt" />
               &nbsp;Generate evals with EvalGen
             </Button>
diff --git a/chainforge/react-server/src/ResponseBoxes.tsx b/chainforge/react-server/src/ResponseBoxes.tsx
index 74962da39..6549993c5 100644
--- a/chainforge/react-server/src/ResponseBoxes.tsx
+++ b/chainforge/react-server/src/ResponseBoxes.tsx
@@ -73,7 +73,7 @@ export const getEvalResultStr = (
       return [
         <Stack key={1} spacing={0}>
           {strs.map((s, i) => (
-            <span key={i}>s</span>
+            <div key={i}>{s[0]}</div>
           ))}
         </Stack>,
         joined_strs,
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 7ffc30639..3bf5669ca 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -282,7 +282,7 @@ export default class EvaluationFunctionExecutor {
 
     const badExample = this.examples.find(
       (example) =>
-        this.perCriteriaGrades[criteria.uid]?.[example.uid] === false,
+        this.perCriteriaGrades[example.uid]?.[criteria.uid] === false,
     );
 
     await generateFunctionsForCriteria(
@@ -650,21 +650,12 @@ export default class EvaluationFunctionExecutor {
       return undefined;
     }
 
-    // Get a reference to the perCriteria grades for this eval function
+    console.log(this.perCriteriaGrades, evalFunc.evalCriteria.uid);
+
+    // Get the criteria ID for this eval function
     const criteriaId = evalFunc.evalCriteria.uid;
-    if (!(criteriaId in this.perCriteriaGrades)) {
-      console.warn(
-        "No user grades found for this eval criteria. You must first grade some examples against this criteria (thumbs up/down) before we can compute alignment.",
-      );
-      return undefined;
-    }
-    // The perCriteriaGrades is a map of ResponseUID to boolean (user grade true/false)
-    // or undefined (no user grade for that example).
-    const userGradedExamples = this.perCriteriaGrades[criteriaId];
 
-    // Now `evalFuncResults` is a Map<ResponseUID, EvalFunctionResult>.
-    // We can compute the alignment stats across all examples.
-    // First, create a report for this function
+    // Create a report for this function
     const report: EvalFunctionReport = {
       evalFunction: evalFunc,
       true_pass: 0,
@@ -674,33 +665,53 @@ export default class EvaluationFunctionExecutor {
       skipped: 0,
     };
 
+    // Check if we have any examples graded for this criteria
+    let hasGradedExamples = false;
+    for (const exampleId in this.perCriteriaGrades) {
+      if (this.perCriteriaGrades[exampleId]?.[criteriaId] !== undefined) {
+        hasGradedExamples = true;
+        break;
+      }
+    }
+
+    if (!hasGradedExamples) {
+      console.warn(
+        "No user grades found for this eval criteria. You must first grade some examples against this criteria (thumbs up/down) before we can compute alignment.",
+      );
+      return undefined;
+    }
+
     // Calculate alignment for this function based on the graded examples
-    Object.entries(userGradedExamples).forEach(([exampleId, grade]) => {
-      if (grade === undefined) return; // Skip if user provides no grade for this example
-      const result = results.get(exampleId);
-      const userGrade = grade
-        ? EvalFunctionResult.PASS
-        : EvalFunctionResult.FAIL;
-
-      if (result !== undefined) {
-        // Handle true positives and true negatives
-        if (result === userGrade) {
-          if (result === EvalFunctionResult.PASS) {
-            report.true_pass++;
-          } else if (result === EvalFunctionResult.FAIL) {
-            report.true_fail++;
-          }
-        } else {
-          if (result === EvalFunctionResult.PASS) {
-            report.false_pass++;
-          } else if (result === EvalFunctionResult.FAIL) {
-            report.false_fail++;
+    Object.entries(this.perCriteriaGrades).forEach(
+      ([exampleId, criteriaGrades]) => {
+        const grade = criteriaGrades[criteriaId];
+        if (grade === undefined) return; // Skip if user provides no grade for this criteria
+        if (grade === undefined) return; // Skip if user provides no grade for this example
+        const result = results.get(exampleId);
+        const userGrade = grade
+          ? EvalFunctionResult.PASS
+          : EvalFunctionResult.FAIL;
+
+        if (result !== undefined) {
+          // Handle true positives and true negatives
+          if (result === userGrade) {
+            if (result === EvalFunctionResult.PASS) {
+              report.true_pass++;
+            } else if (result === EvalFunctionResult.FAIL) {
+              report.true_fail++;
+            }
           } else {
-            report.skipped++;
+            if (result === EvalFunctionResult.PASS) {
+              report.false_pass++;
+            } else if (result === EvalFunctionResult.FAIL) {
+              report.false_fail++;
+            } else {
+              report.skipped++;
+            }
           }
         }
-      }
-    });
+      },
+    );
 
     // Calculate alignment in different ways
     // NOTE: If a denominator during the calculate is 0, this will set the score to undefined.
diff --git a/chainforge/react-server/src/styles.css b/chainforge/react-server/src/styles.css
index b93e78e74..9be9ce14a 100644
--- a/chainforge/react-server/src/styles.css
+++ b/chainforge/react-server/src/styles.css
@@ -346,7 +346,7 @@ html[data-mantine-color-scheme="dark"] .multi-eval-node {
 
 .eval-vote-icons {
   display: none;
-  position: absolute;
+  position: relative;
   /* opacity: 0.3; */
   margin-left: 6px;
   margin-top: -6px;
@@ -354,7 +354,7 @@ html[data-mantine-color-scheme="dark"] .multi-eval-node {
 
 .eval-vote-chosen {
   display: inline-flex;
-  position: absolute;
+  position: relative;
   margin-left: 6px;
   margin-top: -6px;
 }
@@ -1524,7 +1524,6 @@ html[data-mantine-color-scheme="dark"] .chat-bubble textarea {
   column-gap: normal;
   -moz-column-gap: 100px; */
 }
-
 html[data-mantine-color-scheme="dark"] .react-flow__controls-button {
   background-color: #777 !important;
   color: #ddd;

From 79111f08ee63dd57fe795c120283a6ffa7d0df28 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 12:23:13 -0400
Subject: [PATCH 29/35] Add ability to change x-axis var in plot

---
 .../src/EvalGen/EvalGenWizard.tsx             |  12 +-
 chainforge/react-server/src/MultiEvalNode.tsx | 143 +++---
 chainforge/react-server/src/VisNode.tsx       | 477 ++++++++++--------
 .../react-server/src/backend/evalgen/utils.ts |  11 +-
 4 files changed, 361 insertions(+), 282 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index d2782986a..b010b256d 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -48,7 +48,7 @@ import { getAIFeaturesModels } from "../backend/ai";
 interface EvalGenWizardProps {
   opened: boolean;
   onClose: () => void;
-  onComplete: (result: EvalGenReport) => void;
+  onComplete: (result: EvalFunctionSetReport) => void;
   responses: LLMResponse[];
 }
 
@@ -231,15 +231,9 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     setActive((current) => Math.max(0, current - 1));
   }, []);
 
-  const handleComplete = () => {
+  const handleComplete = (evalFuncReport: EvalFunctionSetReport) => {
     // Return final data to the caller
-    onComplete({
-      criteria: criteria,
-      failureCoverage: 0,
-      falseFailureRate: 0,
-      // grades: gradingData,
-      // alignmentScores: {} // TODO: Include actual alignment scores
-    });
+    onComplete(evalFuncReport);
     onClose();
   };
 
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index b6961828b..44c76c7db 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -59,7 +59,7 @@ import { GatheringResponsesRingProgress } from "./LLMItemButtonGroup";
 import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
-import { EvalGenReport } from "./backend/evalgen/typing";
+import { EvalFunctionSetReport, EvalGenReport } from "./backend/evalgen/typing";
 import EvalGenWizard from "./EvalGen/EvalGenWizard";
 import StorageCache from "./backend/cache";
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
@@ -370,12 +370,13 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
       type: EvaluatorContainerDesc["type"],
       state: Dict,
       initiallyOpen = true,
+      uid?: string,
     ) => {
       setEvaluators(
         // evaluators.concat({ name, uid: uuid(), type, state, justAdded: true }),
         (e) => [
           ...e,
-          { name, uid: uuid(), type, state, justAdded: initiallyOpen },
+          { name, uid: uid ?? uuid(), type, state, justAdded: initiallyOpen },
         ],
       );
     },
@@ -432,49 +433,74 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
   //   evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
   // };
 
-  const onFinalReportsReady = (reports: EvalGenReport) => {
-    // Placeholder for process the final reports returned from EvalGenModel
-    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final reports", reports);
-    for (const crit of reports.criteria) {
-      // setTimeout(() => {
-      // console.log("crit", crit);
-      if (crit.eval_method === "code") {
-        // Python
-        addEvaluator(
-          crit.shortname,
-          "python",
-          {
-            code: crit.criteria,
-            sandbox: true,
-          },
-          false,
-        );
-      } else if (crit.eval_method === "expert") {
-        // LLM
-        addEvaluator(
-          crit.shortname,
-          "llm",
-          {
-            prompt: crit.criteria,
-            format: "bin",
-          },
-          false,
-        );
-      } else {
-        // JavaScript
-        addEvaluator(
-          crit.shortname,
-          "javascript",
-          {
-            code: crit.criteria,
-          },
-          false,
+  const onFinalReportsReady = useCallback(
+    (report: EvalFunctionSetReport) => {
+      // Turn the criteria in the final report into evaluators
+
+      for (const selectedFunc of report.selectedEvalFunctions) {
+        const crit = selectedFunc.evalCriteria;
+
+        // Find corresponding report in allEvalFunctionReports map from criteria to list
+        const evalFuncReports = report.allEvalFunctionReports.get(crit);
+        const evalFuncReport = evalFuncReports?.find(
+          (rep) => rep.evalFunction === selectedFunc,
         );
+
+        if (!evalFuncReport) {
+          console.error(
+            "EvalGen: That's strange. No report found for selected function. Skipping...",
+            selectedFunc,
+          );
+          continue;
+        }
+
+        // Extract the code from the selected function
+        const code = evalFuncReport?.evalFunction.code;
+        // Get the functions that were not selected for this criteria
+        // const otherFuncs = evalFuncReports?.filter(
+        //   (rep) => rep.evalFunction !== selectedFunc,
+        // );
+
+        if (crit.eval_method === "code") {
+          // Python
+          addEvaluator(
+            crit.shortname,
+            "python",
+            {
+              code: code.trim(),
+              sandbox: true,
+            },
+            false,
+            crit.uid,
+          );
+        } else if (crit.eval_method === "expert") {
+          // LLM
+          addEvaluator(
+            crit.shortname,
+            "llm",
+            {
+              prompt: code,
+              format: "bin",
+            },
+            false,
+            crit.uid,
+          );
+        } else {
+          // JavaScript
+          addEvaluator(
+            crit.shortname,
+            "javascript",
+            {
+              code: code.trim(),
+            },
+            false,
+            crit.uid,
+          );
+        }
       }
-      // }, kkk * 5000);
-      // kkk++;
-    }
-  };
+    },
+    [addEvaluator],
+  );
 
   const handleError = useCallback(
     (err: Error | string) => {
@@ -723,11 +749,14 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     setPulledInputs(handlePullInputs());
     setEvalGenOpened(true);
   }, []);
-  const handleEvalGenComplete = (evaluationData: EvalGenReport) => {
-    console.log("Evaluation wizard completed with data:", evaluationData);
-    onFinalReportsReady(evaluationData);
-    setEvalGenOpened(false);
-  };
+  const handleEvalGenComplete = useCallback(
+    (evaluationData: EvalFunctionSetReport) => {
+      console.log("Evaluation wizard completed with data:", evaluationData);
+      onFinalReportsReady(evaluationData);
+      setEvalGenOpened(false);
+    },
+    [onFinalReportsReady],
+  );
 
   return (
     <BaseNode classNames="evaluator-node multi-eval-node" nodeId={id}>
@@ -929,17 +958,13 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
             >
               LLM
             </Menu.Item>
-            {/* {AI_SUPPORT_ENABLED ? <Menu.Divider /> : <></>} */}
-            {/* {AI_SUPPORT_ENABLED ? (
-              <Menu.Item
-                icon={<IconSparkles size="14px" />}
-                onClick={onClickPickCriteria}
-              >
-                Let an AI decide!
-              </Menu.Item>
-            ) : (
-              <></>
-            )} */}
+            <Menu.Divider />
+            <Menu.Item
+              icon={<IconSparkles size="11pt" />}
+              onClick={openEvalGen}
+            >
+              Generate with EvalGen
+            </Menu.Item>
             {/* <Menu.Divider /> */}
             {/* {EVALUATOR_PRESETS.map((category, idx) => (
               <React.Fragment key={category.label}>
diff --git a/chainforge/react-server/src/VisNode.tsx b/chainforge/react-server/src/VisNode.tsx
index 4990dc102..f6aef73c0 100644
--- a/chainforge/react-server/src/VisNode.tsx
+++ b/chainforge/react-server/src/VisNode.tsx
@@ -127,6 +127,23 @@ const castEvalScoreToNum = (score: EvaluationScore): number => {
   else return 0; // unknown, soft fail
 };
 
+const findEvalResKeys = (resps: LLMResponse[]): Set<string> => {
+  const eval_res_keys = new Set<string>();
+  resps.forEach((resp_obj) => {
+    if (resp_obj.eval_res && resp_obj.eval_res.items) {
+      resp_obj.eval_res.items.forEach((item) => {
+        if (typeof item === "object") {
+          Object.keys(item).forEach((k) => eval_res_keys.add(k));
+        } else {
+          // If the item is not an object, we can assume it's a single value
+          eval_res_keys.add("score");
+        }
+      });
+    }
+  });
+  return eval_res_keys;
+};
+
 /**
  *  UTIL FUNCTIONS FOR VIS PLOTS
  */
@@ -250,6 +267,8 @@ interface VisNodeData {
   selected_vars: string[] | string;
   llm_groups?: { value: string; label: string }[];
   selected_llm_group?: string;
+  eval_res_vars?: string[];
+  selected_eval_res_var?: string;
   input: string;
   refresh: boolean;
   title: string;
@@ -324,6 +343,24 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
         : "LLM (default)",
     );
 
+    // The x-axis, which are the names of eval results (if a dictionary)
+    const [evalResVars, setEvalResVars] = useState<string[]>(
+      data?.eval_res_vars ?? ["score"],
+    );
+    const [selectedEvalResVar, setSelectedEvalResVar] = useState(
+      data?.selected_eval_res_var ?? "score",
+    );
+    const handleChangeSelectedEvalResVar = useCallback(
+      (new_val: React.ChangeEvent<HTMLSelectElement>) => {
+        setSelectedEvalResVar(new_val.target.value);
+        if (id)
+          setDataPropsForNode(id, {
+            selected_eval_res_var: new_val.target.value,
+          });
+      },
+      [id, setDataPropsForNode],
+    );
+
     // Typically, a user will only need the default LLM 'group' --all LLMs in responses.
     // However, when prompts are chained together, the original LLM info is stored in metavars as a key.
     // LLM groups allow you to plot against the original LLMs, even though a 'scorer' LLM might come after.
@@ -333,13 +370,14 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
     const [selectedLLMGroup, setSelectedLLMGroup] = useState(
       data?.selected_llm_group ?? "LLM",
     );
-    const handleChangeLLMGroup = (
-      new_val: React.ChangeEvent<HTMLSelectElement>,
-    ) => {
-      setSelectedLLMGroup(new_val.target.value);
-      if (id)
-        setDataPropsForNode(id, { selected_llm_group: new_val.target.value });
-    };
+    const handleChangeLLMGroup = useCallback(
+      (new_val: React.ChangeEvent<HTMLSelectElement>) => {
+        setSelectedLLMGroup(new_val.target.value);
+        if (id)
+          setDataPropsForNode(id, { selected_llm_group: new_val.target.value });
+      },
+      [id, setDataPropsForNode],
+    );
 
     // When the user clicks an item in the drop-down,
     // we want to autoclose the multiselect drop-down:
@@ -375,6 +413,15 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
       varnames = Array.from(varnames);
       metavars = Array.from(metavars);
 
+      // Find all keys in eval results
+      const eval_res_keys = findEvalResKeys(resps);
+      if (eval_res_keys.size === 0) {
+        eval_res_keys.add("score"); // default to 'score' if no keys found
+      } else if (selectedEvalResVar === "score") {
+        // We need to set the default eval res var to the first one in the list
+        setSelectedEvalResVar(eval_res_keys.values().next().value as string);
+      }
+
       // Get all vars for the y-axis dropdown, merging metavars and vars into one list,
       // and excluding any special 'LLM group' metavars:
       const msvars = [{ value: "LLM (default)", label: "LLM (default)" }]
@@ -401,18 +448,22 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
       if (
         !multiSelectVars ||
         !multiSelectValue ||
+        !evalResVars ||
         !areSetsEqual(
           new Set(msvars.map((o) => o.value)),
           new Set(multiSelectVars.map((o) => o.value)),
-        )
+        ) ||
+        !areSetsEqual(new Set(evalResVars), eval_res_keys)
       ) {
         setMultiSelectValue("LLM (default)");
         setMultiSelectVars(msvars);
+        setEvalResVars(Array.from(eval_res_keys));
         if (id)
           setDataPropsForNode(id, {
             vars: msvars,
             selected_vars: [],
             llm_groups: available_llm_groups,
+            eval_res_vars: Array.from(eval_res_keys),
           });
       }
     };
@@ -552,14 +603,14 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             max_num_results_per_prompt = res_obj.eval_res.items.length;
         });
 
-        let plot_legend: React.ReactNode | null = null;
+        const plot_legend: React.ReactNode | null = null;
         let metric_axes_labels: string[] = [];
         let num_metrics = 1;
         if (
           typeof_eval_res.includes("KeyValue") &&
-          responses[0].eval_res !== undefined
+          responses.some((r) => r.eval_res !== undefined)
         ) {
-          metric_axes_labels = Object.keys(responses[0].eval_res.items[0]);
+          metric_axes_labels = Array.from(findEvalResKeys(responses));
           num_metrics = metric_axes_labels.length;
         }
 
@@ -590,9 +641,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           if (typeof_eval_res.includes("KeyValue"))
             return eval_res_obj.items.map(
               (item) =>
-                (item as Dict<boolean | number | string>)[
-                  metric_axes_labels[0]
-                ],
+                (item as Dict<boolean | number | string>)[selectedEvalResVar],
             );
           return eval_res_obj.items;
         };
@@ -680,7 +729,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
           else
@@ -792,8 +841,12 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                   d.histfunc = "sum";
                   d.y = new Array(x_items.length).fill(shortnames[name]);
                   d.textposition = "none"; // hide the text which appears within each bar
+                  const xaxis_title =
+                    metric_axes_labels.length > 0
+                      ? "Sum of '" + selectedEvalResVar + "'"
+                      : "Sum of scores";
                   layout.xaxis = {
-                    title: { font: { size: 12 }, text: "Sum of scores" },
+                    title: { font: { size: 12 }, text: xaxis_title },
                     ...layout.xaxis,
                   };
 
@@ -829,7 +882,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
         };
@@ -899,7 +952,10 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
               if (graphType.key === "bar") {
                 d.type = "bar";
                 d.textposition = "none"; // hide the text which appears within each bar
-                xaxis_title = "Sum of scores";
+                xaxis_title =
+                  metric_axes_labels.length > 0
+                    ? "Sum of '" + selectedEvalResVar + "'"
+                    : "Sum of scores";
 
                 if (typeof_eval_res === "Numeric") {
                   // To make error bars work, we need to sum the numbers, instead of relying
@@ -945,10 +1001,6 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           });
           layout.boxmode = "group";
           layout.bargap = 0.5;
-          // layout.yaxis = {
-          //   tickfont: { size: 10 },
-          //   ...layout.yaxis,
-          // };
 
           // Set the left margin to fit the yticks labels
           layout.margin.l = calcLeftPaddingForYLabels(
@@ -957,177 +1009,205 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
 
           if (metric_axes_labels.length > 0)
             layout.xaxis = {
-              title: { font: { size: 12 }, text: metric_axes_labels[0] },
+              title: { font: { size: 12 }, text: selectedEvalResVar },
               ...layout.xaxis,
             };
         };
 
-        if (num_metrics > 1) {
-          // For 2 or more metrics, display a parallel coordinates plot.
-          // :: For instance, if evaluator produces { height: 32, weight: 120 } plot responses with 2 metrics, 'height' and 'weight'
-          if (varnames.length === 1) {
-            const unique_vals = getUniqueKeysInResponses(
-              responses,
-              (resp_obj) => get_var(resp_obj, varnames[0]),
-            );
-            // const response_txts = responses.map(res_obj => res_obj.responses).flat();
-
-            const group_colors = varcolors;
-            const unselected_line_color = "#ddd";
-            const spec_colors = responses
-              .map((resp_obj) => {
-                const idx = unique_vals.indexOf(get_var(resp_obj, varnames[0]));
-                return resp_obj.eval_res
-                  ? Array(resp_obj.eval_res.items.length).fill(idx)
-                  : [];
-              })
-              .flat();
-
-            const colorscale: [number, string][] = [];
-            for (let i = 0; i < unique_vals.length; i++) {
-              if (
-                !selectedLegendItems ||
-                selectedLegendItems.indexOf(unique_vals[i]) > -1
-              )
-                colorscale.push([
-                  i / (unique_vals.length - 1),
-                  group_colors[i % group_colors.length],
-                ]);
-              else
-                colorscale.push([
-                  i / (unique_vals.length - 1),
-                  unselected_line_color,
-                ]);
+        // PARALLEL COORDINATES PLOT -- Disabled for now.
+        // May be re-enabled in the future.
+        // if (num_metrics > 1) {
+        //   // For 2 or more metrics, display a parallel coordinates plot.
+        //   // :: For instance, if evaluator produces { height: 32, weight: 120 } plot responses with 2 metrics, 'height' and 'weight'
+        //   if (varnames.length === 1) {
+        //     const unique_vals = getUniqueKeysInResponses(
+        //       responses,
+        //       (resp_obj) => get_var(resp_obj, varnames[0]),
+        //     );
+        //     // const response_txts = responses.map(res_obj => res_obj.responses).flat();
+
+        //     const group_colors = varcolors;
+        //     const unselected_line_color = "#ddd";
+        //     const spec_colors = responses
+        //       .map((resp_obj) => {
+        //         const idx = unique_vals.indexOf(get_var(resp_obj, varnames[0]));
+        //         return resp_obj.eval_res
+        //           ? Array(resp_obj.eval_res.items.length).fill(idx)
+        //           : [];
+        //       })
+        //       .flat();
+
+        //     const colorscale: [number, string][] = [];
+        //     for (let i = 0; i < unique_vals.length; i++) {
+        //       if (
+        //         !selectedLegendItems ||
+        //         selectedLegendItems.indexOf(unique_vals[i]) > -1
+        //       )
+        //         colorscale.push([
+        //           i / (unique_vals.length - 1),
+        //           group_colors[i % group_colors.length],
+        //         ]);
+        //       else
+        //         colorscale.push([
+        //           i / (unique_vals.length - 1),
+        //           unselected_line_color,
+        //         ]);
+        //     }
+
+        //     const dimensions: Dict = [];
+        //     metric_axes_labels.forEach((metric) => {
+        //       const evals = extractEvalResultsForMetric(metric, responses);
+        //       dimensions.push({
+        //         range: evals.every((e) => typeof e === "number")
+        //           ? [
+        //               Math.min(...(evals as number[])),
+        //               Math.max(...(evals as number[])),
+        //             ]
+        //           : undefined,
+        //         label: metric,
+        //         values: evals,
+        //       });
+        //     });
+
+        //     spec.push({
+        //       type: "parcoords",
+        //       pad: [10, 10, 10, 10],
+        //       line: {
+        //         color: spec_colors,
+        //         colorscale,
+        //       },
+        //       dimensions,
+        //     });
+        //     layout.margin = { l: 40, r: 40, b: 40, t: 50, pad: 0 };
+        //     layout.paper_bgcolor = "white";
+        //     layout.font = { color: "black" };
+        //     layout.selectedpoints = [];
+
+        //     // There's no built-in legend for parallel coords, unfortunately, so we need to construct our own:
+        //     const legend_labels: Dict<string> = {};
+        //     unique_vals.forEach((v, idx) => {
+        //       if (!selectedLegendItems || selectedLegendItems.indexOf(v) > -1)
+        //         legend_labels[v] = group_colors[idx % group_colors.length];
+        //       else legend_labels[v] = unselected_line_color;
+        //     });
+        //     const onClickLegendItem = (label: string) => {
+        //       if (
+        //         selectedLegendItems &&
+        //         selectedLegendItems.length === 1 &&
+        //         selectedLegendItems[0] === label
+        //       )
+        //         setSelectedLegendItems(null); // Clicking twice on a legend item deselects it and displays all
+        //       else setSelectedLegendItems([label]);
+        //     };
+        //     plot_legend = (
+        //       <PlotLegend
+        //         labels={legend_labels}
+        //         onClickLabel={onClickLegendItem}
+        //       />
+        //     );
+
+        //     // Tried to support Plotly hover events here, but looks like
+        //     // currently there are unsupported for parcoords: https://github.com/plotly/plotly.js/issues/3012
+        //     // onHover = (e) => {
+        //     //     console.log(e.curveNumber);
+        //     //     // const curveIdx = e.curveNumber;
+        //     //     // if (curveIdx < response_txts.length) {
+        //     //     //     if (!selectedLegendItems || selectedLegendItems.indexOf(unique_vals[spec_colors[curveIdx]]) > -1)
+        //     //     //         console.log(response_txts[curveIdx]);
+        //     //     // }
+        //     // };
+        //   } else {
+        //     setSelectedLegendItems(null);
+        //     const error_text =
+        //       "Plotting evaluations with more than one metric and more than one prompt parameter is currently unsupported.";
+        //     setPlaceholderText(
+        //       <p
+        //         style={{
+        //           maxWidth: "220px",
+        //           backgroundColor: "#f0aaaa",
+        //           padding: "10px",
+        //           fontSize: "10pt",
+        //         }}
+        //       >
+        //         {error_text}
+        //       </p>,
+        //     );
+        //     console.error(error_text);
+        //   }
+        // } else {
+
+        // A single metric --use plots like grouped box-and-whiskers, 3d scatterplot
+        if (varnames.length === 0) {
+          // No variables means they used a single prompt (no template) to generate responses
+          // (Users are likely evaluating differences in responses between LLMs)
+          if (typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
+          else plot_simple_boxplot(get_llm, "llm");
+        } else if (varnames.length === 1) {
+          // 1 var; numeric eval
+          if (llm_names.length === 1) {
+            if (typeof_eval_res === "Boolean")
+              // Accuracy plot per value of the selected variable:
+              plot_accuracy((r) => get_var_and_trim(r, varnames[0]), "var");
+            else {
+              // Simple box plot, as there is only a single LLM in the response
+              plot_simple_boxplot(
+                (r) => get_var_and_trim(r, varnames[0]),
+                "var",
+              );
             }
+          } else {
+            // There are multiple LLMs in the response; do a grouped box plot by LLM.
+            // Note that 'name' is now the LLM, and 'x' stores the value of the var:
+            plot_grouped_boxplot((r) => get_var_and_trim(r, varnames[0]));
+          }
+        } else if (varnames.length === 2) {
+          // Input is 2 vars; numeric eval
+          // Display a 3D scatterplot with 2 dimensions:
 
-            const dimensions: Dict = [];
-            metric_axes_labels.forEach((metric) => {
-              const evals = extractEvalResultsForMetric(metric, responses);
-              dimensions.push({
-                range: evals.every((e) => typeof e === "number")
-                  ? [
-                      Math.min(...(evals as number[])),
-                      Math.max(...(evals as number[])),
-                    ]
-                  : undefined,
-                label: metric,
-                values: evals,
-              });
-            });
-
-            spec.push({
-              type: "parcoords",
-              pad: [10, 10, 10, 10],
-              line: {
-                color: spec_colors,
-                colorscale,
+          const names_0 = new Set(
+            responses.map((r) => get_var_and_trim(r, varnames[0])),
+          );
+          const shortnames_0 = genUniqueShortnames(names_0);
+          const names_1 = new Set(
+            responses.map((r) => get_var_and_trim(r, varnames[1])),
+          );
+          const shortnames_1 = genUniqueShortnames(names_1);
+
+          if (llm_names.length === 1) {
+            spec = {
+              type: "scatter3d",
+              x: responses
+                .map((r) => get_var(r, varnames[0], true))
+                .map((s) => shortnames_0[s]),
+              y: responses
+                .map((r) => get_var(r, varnames[1], true))
+                .map((s) => shortnames_1[s]),
+              z: responses.map(
+                (r) =>
+                  get_items(r.eval_res).reduce(
+                    (acc: number, val) =>
+                      acc + (typeof val === "number" ? val : 0),
+                    0,
+                  ) / (r.eval_res?.items.length ?? 1),
+              ), // calculates mean
+              mode: "markers",
+              marker: {
+                color: getColorForLLMAndSetIfNotFound(llm_names[0]),
               },
-              dimensions,
-            });
-            layout.margin = { l: 40, r: 40, b: 40, t: 50, pad: 0 };
-            layout.paper_bgcolor = "white";
-            layout.font = { color: "black" };
-            layout.selectedpoints = [];
-
-            // There's no built-in legend for parallel coords, unfortunately, so we need to construct our own:
-            const legend_labels: Dict<string> = {};
-            unique_vals.forEach((v, idx) => {
-              if (!selectedLegendItems || selectedLegendItems.indexOf(v) > -1)
-                legend_labels[v] = group_colors[idx % group_colors.length];
-              else legend_labels[v] = unselected_line_color;
-            });
-            const onClickLegendItem = (label: string) => {
-              if (
-                selectedLegendItems &&
-                selectedLegendItems.length === 1 &&
-                selectedLegendItems[0] === label
-              )
-                setSelectedLegendItems(null); // Clicking twice on a legend item deselects it and displays all
-              else setSelectedLegendItems([label]);
             };
-            plot_legend = (
-              <PlotLegend
-                labels={legend_labels}
-                onClickLabel={onClickLegendItem}
-              />
-            );
-
-            // Tried to support Plotly hover events here, but looks like
-            // currently there are unsupported for parcoords: https://github.com/plotly/plotly.js/issues/3012
-            // onHover = (e) => {
-            //     console.log(e.curveNumber);
-            //     // const curveIdx = e.curveNumber;
-            //     // if (curveIdx < response_txts.length) {
-            //     //     if (!selectedLegendItems || selectedLegendItems.indexOf(unique_vals[spec_colors[curveIdx]]) > -1)
-            //     //         console.log(response_txts[curveIdx]);
-            //     // }
-            // };
           } else {
-            setSelectedLegendItems(null);
-            const error_text =
-              "Plotting evaluations with more than one metric and more than one prompt parameter is currently unsupported.";
-            setPlaceholderText(
-              <p
-                style={{
-                  maxWidth: "220px",
-                  backgroundColor: "#f0aaaa",
-                  padding: "10px",
-                  fontSize: "10pt",
-                }}
-              >
-                {error_text}
-              </p>,
-            );
-            console.error(error_text);
-          }
-        } else {
-          // A single metric --use plots like grouped box-and-whiskers, 3d scatterplot
-          if (varnames.length === 0) {
-            // No variables means they used a single prompt (no template) to generate responses
-            // (Users are likely evaluating differences in responses between LLMs)
-            if (typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
-            else plot_simple_boxplot(get_llm, "llm");
-          } else if (varnames.length === 1) {
-            // 1 var; numeric eval
-            if (llm_names.length === 1) {
-              if (typeof_eval_res === "Boolean")
-                // Accuracy plot per value of the selected variable:
-                plot_accuracy((r) => get_var_and_trim(r, varnames[0]), "var");
-              else {
-                // Simple box plot, as there is only a single LLM in the response
-                plot_simple_boxplot(
-                  (r) => get_var_and_trim(r, varnames[0]),
-                  "var",
-                );
-              }
-            } else {
-              // There are multiple LLMs in the response; do a grouped box plot by LLM.
-              // Note that 'name' is now the LLM, and 'x' stores the value of the var:
-              plot_grouped_boxplot((r) => get_var_and_trim(r, varnames[0]));
-            }
-          } else if (varnames.length === 2) {
-            // Input is 2 vars; numeric eval
-            // Display a 3D scatterplot with 2 dimensions:
-
-            const names_0 = new Set(
-              responses.map((r) => get_var_and_trim(r, varnames[0])),
-            );
-            const shortnames_0 = genUniqueShortnames(names_0);
-            const names_1 = new Set(
-              responses.map((r) => get_var_and_trim(r, varnames[1])),
-            );
-            const shortnames_1 = genUniqueShortnames(names_1);
-
-            if (llm_names.length === 1) {
-              spec = {
+            spec = [];
+            llm_names.forEach((llm) => {
+              const resps = responses.filter((r) => get_llm(r) === llm);
+              spec.push({
                 type: "scatter3d",
-                x: responses
+                x: resps
                   .map((r) => get_var(r, varnames[0], true))
                   .map((s) => shortnames_0[s]),
-                y: responses
+                y: resps
                   .map((r) => get_var(r, varnames[1], true))
                   .map((s) => shortnames_1[s]),
-                z: responses.map(
+                z: resps.map(
                   (r) =>
                     get_items(r.eval_res).reduce(
                       (acc: number, val) =>
@@ -1137,37 +1217,11 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                 ), // calculates mean
                 mode: "markers",
                 marker: {
-                  color: getColorForLLMAndSetIfNotFound(llm_names[0]),
+                  color: getColorForLLMAndSetIfNotFound(llm),
                 },
-              };
-            } else {
-              spec = [];
-              llm_names.forEach((llm) => {
-                const resps = responses.filter((r) => get_llm(r) === llm);
-                spec.push({
-                  type: "scatter3d",
-                  x: resps
-                    .map((r) => get_var(r, varnames[0], true))
-                    .map((s) => shortnames_0[s]),
-                  y: resps
-                    .map((r) => get_var(r, varnames[1], true))
-                    .map((s) => shortnames_1[s]),
-                  z: resps.map(
-                    (r) =>
-                      get_items(r.eval_res).reduce(
-                        (acc: number, val) =>
-                          acc + (typeof val === "number" ? val : 0),
-                        0,
-                      ) / (r.eval_res?.items.length ?? 1),
-                  ), // calculates mean
-                  mode: "markers",
-                  marker: {
-                    color: getColorForLLMAndSetIfNotFound(llm),
-                  },
-                  name: llm,
-                });
+                name: llm,
               });
-            }
+            });
           }
         }
 
@@ -1184,6 +1238,8 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
     }, [
       multiSelectVars,
       multiSelectValue,
+      evalResVars,
+      selectedEvalResVar,
       selectedLLMGroup,
       responses,
       selectedLegendItems,
@@ -1254,9 +1310,10 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             <span style={smallTextStyle}>x-axis:</span>
             <NativeSelect
               className="nodrag nowheel"
-              data={["score"]}
+              data={evalResVars}
               size="xs"
-              value={"score"}
+              value={selectedEvalResVar}
+              onChange={handleChangeSelectedEvalResVar}
               miw="80px"
             />
           </div>
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 054a50b2c..ff8bda7c3 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -40,11 +40,14 @@ export function extractMdBlocks(
   blockName: string,
 ): string[] | undefined {
   const regex = new RegExp(`\`\`\`${blockName}(.*?)\`\`\``, "gs");
-  const matches = mdText.match(regex);
-  if (matches)
-    return matches.map((s) => s.replace("```json", "").replace("```", ""));
+  const matches = [];
+  let match: RegExpExecArray | null;
 
-  console.error("No JSON found in output.");
+  while ((match = regex.exec(mdText)) !== null) matches.push(match[1]);
+
+  if (matches.length > 0) return matches;
+
+  console.error("No md blocks found for name:", blockName);
   return undefined;
 }
 

From 9b6ad335669bf58976901288d840b3589fd5b319 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 12:43:18 -0400
Subject: [PATCH 30/35] Bug fix plotting acc

---
 chainforge/react-server/src/VisNode.tsx | 29 +++++++++++++++++++------
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/chainforge/react-server/src/VisNode.tsx b/chainforge/react-server/src/VisNode.tsx
index f6aef73c0..6c29a12dd 100644
--- a/chainforge/react-server/src/VisNode.tsx
+++ b/chainforge/react-server/src/VisNode.tsx
@@ -571,6 +571,21 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
             ? responses[0].eval_res.dtype
             : "Numeric";
 
+        let sel_typeof_eval_res = typeof_eval_res;
+        if (typeof_eval_res.includes("KeyValue")) {
+          const first_item = responses[0].eval_res?.items?.[0];
+          if (typeof first_item === "object") {
+            const val = first_item[selectedEvalResVar];
+            if (typeof val === "boolean") {
+              sel_typeof_eval_res = "Boolean";
+            } else if (typeof val === "number") {
+              sel_typeof_eval_res = "Numeric";
+            } else if (typeof val === "string") {
+              sel_typeof_eval_res = "Categorical";
+            }
+          }
+        }
+
         // If categorical type, check if all binary:
         if (typeof_eval_res === "Categorical") {
           const is_all_bools = responses.reduce(
@@ -745,7 +760,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
         ) => {
           let names = new Set<string>();
           const plotting_categorical_vars =
-            group_type === "var" && typeof_eval_res === "Categorical";
+            group_type === "var" && sel_typeof_eval_res === "Categorical";
 
           // When we're plotting vars, we want the stacked bar colors to be the *categories*,
           // and the x_items to be the names of vars, so that the left axis is a vertical list of varnames.
@@ -793,8 +808,8 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                   getColorForLLMAndSetIfNotFound(get_llm(responses[0]));
 
             if (
-              typeof_eval_res === "Boolean" ||
-              typeof_eval_res === "Categorical"
+              sel_typeof_eval_res === "Boolean" ||
+              sel_typeof_eval_res === "Categorical"
             ) {
               // Plot a histogram for categorical or boolean data.
               spec.push({
@@ -916,7 +931,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
               });
             }
 
-            if (typeof_eval_res === "Boolean") {
+            if (sel_typeof_eval_res === "Boolean") {
               // Plot a histogram for boolean (true/false) categorical data.
               spec.push({
                 type: "histogram",
@@ -957,7 +972,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
                     ? "Sum of '" + selectedEvalResVar + "'"
                     : "Sum of scores";
 
-                if (typeof_eval_res === "Numeric") {
+                if (sel_typeof_eval_res === "Numeric") {
                   // To make error bars work, we need to sum the numbers, instead of relying
                   // upon the stacked bar chart:
                   let sum_x_items: number[] = [];
@@ -1140,12 +1155,12 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
         if (varnames.length === 0) {
           // No variables means they used a single prompt (no template) to generate responses
           // (Users are likely evaluating differences in responses between LLMs)
-          if (typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
+          if (sel_typeof_eval_res === "Boolean") plot_accuracy(get_llm, "llm");
           else plot_simple_boxplot(get_llm, "llm");
         } else if (varnames.length === 1) {
           // 1 var; numeric eval
           if (llm_names.length === 1) {
-            if (typeof_eval_res === "Boolean")
+            if (sel_typeof_eval_res === "Boolean")
               // Accuracy plot per value of the selected variable:
               plot_accuracy((r) => get_var_and_trim(r, varnames[0]), "var");
             else {

From e470a723b97cc65e2c8eea8e4b9281c390070ecd Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 15:40:59 -0400
Subject: [PATCH 31/35] Bug fixes and inject feedback into eval criteria gen
 context

---
 .../src/EvalGen/EvalGenWizard.tsx             | 10 ++++
 .../react-server/src/EvalGen/FeedbackStep.tsx |  2 +-
 .../react-server/src/EvalGen/GradingView.tsx  |  5 +-
 .../src/EvalGen/PickCriteriaStep.tsx          |  2 +-
 .../react-server/src/EvalGen/WelcomeStep.tsx  | 14 ++---
 .../src/backend/evalgen/executor.ts           | 43 ++++++++------
 .../src/backend/evalgen/oai_utils.ts          |  7 ++-
 .../react-server/src/backend/evalgen/utils.ts | 57 +++++++++++++++++--
 chainforge/react-server/src/store.tsx         | 25 +++++++-
 chainforge/react-server/src/styles.css        |  6 +-
 10 files changed, 134 insertions(+), 37 deletions(-)

diff --git a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
index b010b256d..aef7237ce 100644
--- a/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
+++ b/chainforge/react-server/src/EvalGen/EvalGenWizard.tsx
@@ -253,6 +253,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
     return escapeBraces(prompts.values().next().value ?? "");
   };
 
+  const exportGradesAndNotes = useStore((store) => store.exportGradesAndNotes);
   async function genCriteriaFromContext(responses: LLMResponse[]) {
     // Get the context from the input responses
     const inputPromptTemplate =
@@ -263,11 +264,17 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
       return;
     }
 
+    // Get the user feedback on the responses, if any, from the global state
+    const feedback = exportGradesAndNotes(responses);
+
     // Attempt to generate criteria using an LLM
     return await generateLLMEvaluationCriteria(
       inputPromptTemplate,
       genAIModelNames.large,
       apiKeys,
+      undefined,
+      undefined,
+      feedback,
     );
   }
 
@@ -360,6 +367,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
           bottom: 106,
           padding: "10px",
           width: "95%",
+          pointerEvents: "none",
         }}
       >
         <Flex justify="space-between">
@@ -367,6 +375,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
             variant="default"
             onClick={handlePrevious}
             disabled={active === 0}
+            style={{ pointerEvents: "all" }}
           >
             &lt; Back
           </Button>
@@ -378,6 +387,7 @@ const EvalGenWizard: React.FC<EvalGenWizardProps> = ({
               active === 4 ||
               (active === 3 && numResponsesGraded < minNumToGrade)
             }
+            style={{ pointerEvents: "all" }}
           >
             {active === 3
               ? numResponsesGraded >= minNumToGrade
diff --git a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
index 2ba0b24c7..719e50c86 100644
--- a/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
+++ b/chainforge/react-server/src/EvalGen/FeedbackStep.tsx
@@ -96,7 +96,7 @@ const FeedbackStep: React.FC<FeedbackStepProps> = ({
   }, [shownResponseIdx, responses]);
 
   return (
-    <Stack spacing="sm">
+    <Stack spacing="sm" mb={200}>
       <Title order={3}>Provide Feedback on Some Model Outputs</Title>
 
       <GradingView
diff --git a/chainforge/react-server/src/EvalGen/GradingView.tsx b/chainforge/react-server/src/EvalGen/GradingView.tsx
index a6bed5753..9f7aca3b0 100644
--- a/chainforge/react-server/src/EvalGen/GradingView.tsx
+++ b/chainforge/react-server/src/EvalGen/GradingView.tsx
@@ -12,6 +12,7 @@ import {
   IconSparkles,
 } from "@tabler/icons-react";
 import { StringLookup } from "../backend/cache";
+import { cleanEscapedBraces } from "../backend/template";
 
 const HeaderText = ({ children }: { children: ReactNode }) => {
   return (
@@ -40,7 +41,9 @@ const GradingView: React.FC<GradingViewProps> = ({
   const responseText = useMemo(
     () =>
       shownResponse && shownResponse.responses?.length > 0
-        ? llmResponseDataToString(shownResponse.responses[0])
+        ? cleanEscapedBraces(
+            llmResponseDataToString(shownResponse.responses[0]),
+          )
         : "",
     [shownResponse],
   );
diff --git a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
index b9e57c6c4..cd4e908fe 100644
--- a/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
+++ b/chainforge/react-server/src/EvalGen/PickCriteriaStep.tsx
@@ -512,7 +512,7 @@ const PickCriteriaStep: React.FC<PickCriteriaStepProps> = ({
           >
             <IconRepeat />
             <IconSparkles />
-            &nbsp;Suggest more
+            &nbsp;Suggest criteria
           </Button>
         </Flex>
 
diff --git a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
index 18fcfc965..fe50ae4ef 100644
--- a/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
+++ b/chainforge/react-server/src/EvalGen/WelcomeStep.tsx
@@ -27,7 +27,7 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
       , and is inspired by inductive processes in UX research (heuristic
       evaluation and grounded theory).
     </Text>
-    <Text>Currently, Evalgen:</Text>
+    <Text>Currently, Evalgen is in a public beta. It:</Text>
     <List>
       <List.Item>
         Only generates <b>assertions (pass/fail tests)</b>. Numeric and
@@ -38,9 +38,9 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
         screen. This is the chief difference from our paper.
       </List.Item>
       <List.Item>
-        Requires access to the GenAI features of ChainForge. Set up the Provider
-        you wish to use for this in your Global Settings view. The Provider must
-        be powerful enough to generate code. (By default, it is OpenAI.)
+        Requires access to the GenAI features of ChainForge, which (currently)
+        requires an OpenAI API key. (If you&apos;d like to use other models,
+        more general access to GenAI features is coming soon.)
       </List.Item>
       <List.Item>
         Should be run on the outputs of <b>already-run</b> Prompt Nodes
@@ -62,8 +62,8 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
         Python code is run sandboxed in the browser with pyodide. Pyodide does
         not have access to many libraries out-of-the-box. (If your eval criteria
         implementation must use a third-party library, we suggest you use
-        ChainForge’s genAI features on the specific eval node, outside this
-        wizard.)
+        ChainForge&apos;s genAI features on an individual code eval node,
+        outside this wizard.)
       </List.Item>
     </List>
     {/* <Text>We have captured the following about your context:</Text>
@@ -76,7 +76,7 @@ const WelcomeStep: React.FC<WelcomeStepProps> = ({ setOnNextCallback }) => (
       node.
     </Text>
     <Text>
-      EvalGen is in Beta. To improve it, provide feedback on our Github Issues
+      EvalGen is in beta. To improve it, provide feedback on our Github Issues
       or Discussion pages, or raise a Pull Request with the changes.
     </Text>
   </Stack>
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 3bf5669ca..24c7802b1 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -285,27 +285,36 @@ export default class EvaluationFunctionExecutor {
         this.perCriteriaGrades[example.uid]?.[criteria.uid] === false,
     );
 
-    await generateFunctionsForCriteria(
-      criteria,
-      this.llms.large,
-      this.promptTemplate,
-      this.examples[Math.floor(Math.random() * this.examples.length)],
-      emitter,
-      badExample,
-      this.apiKeys,
-    );
+    try {
+      await generateFunctionsForCriteria(
+        criteria,
+        this.llms.large,
+        this.promptTemplate,
+        this.examples[Math.floor(Math.random() * this.examples.length)],
+        emitter,
+        badExample,
+        this.apiKeys,
+      );
+
+      console.log(`Generated functions for criteria: ${criteria.shortname}`);
+      console.log(
+        `Number of functions generated: ${functionExecutionPromises.length}`,
+      );
+      this.logFunction(
+        `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
+      );
+    } catch (error) {
+      console.error(
+        `Error generating functions for criteria ${criteria.shortname}: ${error}`,
+      );
+      this.logFunction(
+        `Error generating functions for criteria ${criteria.shortname}: ${error}`,
+      );
+    }
 
     // Update LLM call count by 1
     this.updateNumLLMCalls(1, 0);
 
-    console.log(`Generated functions for criteria: ${criteria.shortname}`);
-    console.log(
-      `Number of functions generated: ${functionExecutionPromises.length}`,
-    );
-    this.logFunction(
-      `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
-    );
-
     await Promise.all(functionExecutionPromises);
   }
 
diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
index 840789119..958c1d70e 100644
--- a/chainforge/react-server/src/backend/evalgen/oai_utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts
@@ -1,7 +1,7 @@
 // import { env as process_env } from "process";
 import { EventEmitter } from "events";
 // import { AzureKeyCredential, OpenAIClient } from "@azure/openai";
-import { llmResponseDataToString } from "../utils";
+import { hashtagTemplateVars, llmResponseDataToString } from "../utils";
 import { simpleQueryLLM } from "../backend";
 import { Dict, LLMSpec } from "../typing";
 import { extractMdBlocks } from "./utils";
@@ -52,8 +52,11 @@ export class EvalGenAssertionEmitter extends EventEmitter {
       // Verify format:
       if (prompts.every((p) => typeof p === "string")) {
         // If these are all strings, we are good to go--
+        // We must be careful to first hashtag all template variables in the prompt
+        // before emitting them, so that they are not interpreted as template variables.
+        const hashtagged_prompts = prompts.map((p) => hashtagTemplateVars(p));
         // Emit all the LLM eval prompt candidates in one burst
-        prompts.forEach(emit_prompt);
+        hashtagged_prompts.forEach(emit_prompt);
       } else {
         console.error(
           "Unexpected output type after JSON parsing: At least generated LLM eval prompt is not a string.",
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index ff8bda7c3..76c4194e7 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -29,6 +29,7 @@ import {
   buildContextPromptForVarsMetavars,
   buildGenEvalCodePrompt,
 } from "../../AiPopover";
+import { escapeBraces } from "../template";
 
 /**
  * Extracts substrings within "```" and "```" ticks. Excludes the ticks from return.
@@ -65,15 +66,34 @@ export async function generateLLMEvaluationCriteria(
   apiKeys?: Dict,
   promptTemplate?: string, // overrides prompt template used
   systemMsg?: string | null, // overrides default system message, if present. Use null to specify empty.
+  userFeedback?: { grade: boolean; note?: string; response: string }[], // user feedback to include in the prompt
 ): Promise<EvalCriteria[]> {
+  // Compose the user-feedback section of the prompt; stays "" when there is no feedback to report
+  let userFeedbackPrompt = "";
+  if (userFeedback && userFeedback.length > 0) {
+    userFeedbackPrompt = `\n\n-----------------\nHere is some feedback on the LLM's responses to this prompt:`;
+    for (const feedback of userFeedback) {
+      userFeedbackPrompt += `\n\nFor the response: "${feedback.response}", the user gave the following feedback:`;
+      if (feedback.grade !== undefined) {
+        userFeedbackPrompt += `\nGrade: ${feedback.grade === true ? "Good" : "Bad"}`;
+      }
+      if (feedback.note !== undefined) {
+        userFeedbackPrompt += `\nExplanation for grade: "${feedback.note}"`;
+      }
+    }
+    userFeedbackPrompt += "\n-----------------\n";
+  }
+
   // Construct the detailed prompt for the LLM
   const detailedPrompt =
     promptTemplate ??
     `Here is my LLM prompt template:
   
   \`${prompt}\`
+
+    ${userFeedbackPrompt}
     
-    Based on the instructions in the prompt that need to be followed, I want to write assertions for my LLM pipeline to run on all pipeline responses. Give me a list of 2 or 3 distinct criteria to check for in LLM responses. Each item in the list should contain a string description of a criteria to check for, and whether it should be evaluated with code or by an expert if the criteria is difficult to evaluate. Your answer should be a JSON list of objects within \`\`\`json \`\`\` markers, where each object has the following three fields: "criteria", "shortname", and "eval_method" (code or expert). At most 3 criteria should have eval_method as expert. The "criteria" should be short, and the "shortname" should be a very brief title for the criteria. Each evaluation criteria should test a concept that should evaluate to "true" in the ideal case.`;
+    Based on the instructions in the prompt that need to be followed, I want to write a list of assertions for my LLM pipeline to run on all pipeline responses. Give me a list of 3 distinct criteria to check for in LLM responses. Each item in the list should contain a string description of a criteria to check for, and whether it should be evaluated with code or by an expert if the criteria is difficult to evaluate. Your answer should be a JSON list of objects within \`\`\`json \`\`\` markers, where each object has the following three fields: "criteria", "shortname", and "eval_method" (code or expert). At most 3 criteria should have eval_method as expert. The "criteria" should be short, and the "shortname" should be a very brief title for the criteria. Each evaluation criteria should test a concept that should evaluate to "true" in the ideal case.`;
 
   // Query the LLM (below, we will try this up to 3 times)
   async function _query() {
@@ -381,10 +401,37 @@ function buildFunctionGenPrompt(
   }
 
   if (criteria.eval_method === "expert") {
-    return `Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\n, your task is to devise a prompt for an expert to evaluate the pipeline's responses based on the following criteria: ${criteria.criteria}
+    const varsAndMetavars = getVarsAndMetavars([example]);
+    // Format names as ` - "name": value` lines. NOTE(review): metavars are also read from example.vars; confirm.
+    const _composeVarsContext = (vars: string[]): string => {
+      if (vars.length === 0) return "";
+      return vars.map((v) => ` - "${v}": ${example.vars[v]}`).join("\n");
+    };
+    const varsAndMetavarsContext =
+      _composeVarsContext(varsAndMetavars.vars) +
+      "\n" +
+      _composeVarsContext(varsAndMetavars.metavars);
+    const varsAndMetavarsContextPrompt =
+      varsAndMetavarsContext.length > 2
+        ? `\n\nIn your prompts, it may be useful to refer to metadata associated with the LLM output, such as when you are comparing to a ground truth. For instance, consider a situation where the user has a prompt template with a variable {writing_style} —'poem', 'text message', or 'formal letter' —and they want to validate that the LLM's output was really in that style. You would produce a prompt template like:
+
+"Respond with 'yes' if the text below is in the style of a {writing_style}, 'no' if not. Only reply with the classification, nothing else."
+
+The template indicates that the same {writing_style} variable used upstream in the LLM pipeline, should be used in your evaluation prompt.
+
+If you want to refer to the value of an input variable, you **must** use template braces like {variable}.
+
+Here are the variables you have access to (keys), and example values for one output: 
+${varsAndMetavarsContext}`
+        : "";
+
+    return escapeBraces(`Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\nYour task is to devise a prompt for an expert to evaluate the pipeline's responses based on the following criteria: "${criteria.criteria}"
     ${badExampleSection}
-    You will devise 3 prompts for the evaluation criterion to see which has the best accuracy. Each prompt you generate should be a short question that an expert can answer with a "yes" or "no" to evaluate entire criteria (don't miss anything in the criteria). Try different variations/wordings in the prompts. Return your prompts in a JSON list of strings within \`\`\`json \`\`\` markers. Each string should be a question for the expert to answer, and each question should be contained on its own line.
-    `;
+    You will devise 3 prompts for the evaluation criterion to see which has the best accuracy. Each prompt you generate should be a short question that an expert can answer with a "yes" or "no" to evaluate entire criteria (don't miss anything in the criteria). Try different variations/wordings in the prompts. ${varsAndMetavarsContextPrompt}
+    
+    Return your prompts in a JSON list of strings within \`\`\`json \`\`\` markers. Each string should be a question for the expert to answer, and each question should be contained on its own line.
+    ---
+    `);
   } else {
     const prompt = `Given the following prompt template for an LLM pipeline:\n\n ${promptTemplate}\n\n, your task is to devise multiple Python assertions to evaluate LLM responses based on the criteria "${criteria.shortname}". 
     ${badExampleSection}
@@ -392,7 +439,7 @@ function buildFunctionGenPrompt(
     ${buildGenEvalCodePrompt("python", buildContextPromptForVarsMetavars(getVarsAndMetavars([example])), criteria.criteria, true)}
     Be creative in your implementations. Our goal is to explore diverse approaches to evaluate LLM responses effectively. Try to avoid using third-party libraries for code-based evaluation methods. Include the full implementation of each function in separate "\`\`\`python" blocks. Each function should return only True or False.`;
 
-    return prompt;
+    return escapeBraces(prompt); // Escape braces in the prompt
   }
 }
 
diff --git a/chainforge/react-server/src/store.tsx b/chainforge/react-server/src/store.tsx
index f16f6b281..bb381416e 100644
--- a/chainforge/react-server/src/store.tsx
+++ b/chainforge/react-server/src/store.tsx
@@ -16,6 +16,7 @@ import {
   deepcopy,
   transformDict,
   APP_IS_RUNNING_LOCALLY,
+  llmResponseDataToString,
 } from "./backend/utils";
 import { DuplicateVariableNameError } from "./backend/errors";
 import {
@@ -27,12 +28,12 @@ import {
   TabularDataColType,
   TabularDataRowType,
   JSONCompatible,
+  LLMResponse,
 } from "./backend/typing";
 import { TogetherChatSettings } from "./ModelSettingSchemas";
 import { NativeLLM } from "./backend/models";
 import { StringLookup } from "./backend/cache";
 import { saveGlobalConfig } from "./backend/backend";
-import { remove } from "jszip";
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
 // Initial project settings
@@ -462,6 +463,9 @@ export interface StoreHandles {
   state: Dict;
   setState: (key: string, val: any) => void;
   importState: (state: Dict) => void;
+  exportGradesAndNotes: (
+    responses: LLMResponse[],
+  ) => { grade: boolean; note?: string; response: string }[];
 
   // The color to represent a specific LLM, to be globally consistent
   llmColors: Dict<string>;
@@ -664,6 +668,25 @@ const useStore = create<StoreHandles>((set, get) => ({
       state,
     }));
   },
+  // Export human grades/notes per response; ungraded/empty ones are skipped.
+  exportGradesAndNotes: (responses: LLMResponse[]) => {
+    const state = get().state;
+    const res: { grade: boolean; note?: string; response: string }[] = [];
+    responses.forEach((r) => {
+      // Skip a missing `responses` array too; otherwise r.responses[0] throws:
+      if (r.uid === undefined || !r.responses?.length) return;
+      // Ratings are read from global state under `r.<uid>.<label>` keys:
+      const grade = state[`r.${r.uid}.grade`];
+      const note = state[`r.${r.uid}.note`];
+      if (grade === undefined) return;
+      res.push({
+        grade: grade?.[0],
+        note: note?.[0],
+        response: llmResponseDataToString(r.responses[0]),
+      }); // TODO: support multiple responses when n>1
+    });
+    return res;
+  },
 
   // Keep track of LLM colors, to ensure color consistency across various plots and displays
   llmColors: initialLLMColors,
diff --git a/chainforge/react-server/src/styles.css b/chainforge/react-server/src/styles.css
index 9be9ce14a..b4a478f9e 100644
--- a/chainforge/react-server/src/styles.css
+++ b/chainforge/react-server/src/styles.css
@@ -353,14 +353,16 @@ html[data-mantine-color-scheme="dark"] .multi-eval-node {
 }
 
 .eval-vote-chosen {
-  display: inline-flex;
+  display: none; /* Disable eval voting for the initial EvalGen release */
+  /* display: inline-flex; */
   position: relative;
   margin-left: 6px;
   margin-top: -6px;
 }
 
 .eval-score:hover .eval-vote-icons {
-  display: inline-flex;
+  display: none; /* Disable eval voting for the initial EvalGen release */
+  /* display: inline-flex; */
   /* opacity: 1.0; */
 }
 

From c81ac65677d097719b6017348160da27b233df0e Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 15:45:35 -0400
Subject: [PATCH 32/35] cleanup

---
 chainforge/react-server/src/MultiEvalNode.tsx | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 44c76c7db..c19343ac2 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -706,15 +706,13 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
           }
         });
       });
+
       const finalResponses = Object.values(merged_res_objs_by_uid);
-      console.log("Output length:", finalResponses.length);
-      console.log("MultiEval Output:", finalResponses[0]?.eval_res?.items[0]);
       // We now have a dict of the form { uid: LLMResponse }
       // We need return only the values of this dict:
       setLastResponses(finalResponses);
       setLastRunSuccess(true);
       setDataPropsForNode(id, { output: finalResponses });
-      console.log("Setting output");
       StorageCache.store(`${id}.json`, finalResponses);
       pingOutputNodes(id);
       setStatus(Status.READY);

From 140366d4fdb44096f8d5dc7c449810809b790b26 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 15:52:04 -0400
Subject: [PATCH 33/35] Plot acc bug fix

---
 chainforge/react-server/src/VisNode.tsx | 1 +
 setup.py                                | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/chainforge/react-server/src/VisNode.tsx b/chainforge/react-server/src/VisNode.tsx
index 6c29a12dd..6542f0098 100644
--- a/chainforge/react-server/src/VisNode.tsx
+++ b/chainforge/react-server/src/VisNode.tsx
@@ -601,6 +601,7 @@ export const VisView = forwardRef<VisViewRef, VisViewProps>(
           );
           if (is_all_bools) {
             typeof_eval_res = "Boolean";
+            sel_typeof_eval_res = "Boolean";
             setDisableGraphTypeOption(true);
           }
         } else {
diff --git a/setup.py b/setup.py
index 43ce61067..32dfbec16 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,7 @@ def readme():
 
 setup(
     name="chainforge",
-    version="0.3.6.0",
+    version="0.3.6.1",
     packages=find_packages(),
     author="Ian Arawjo",
     description="A Visual Programming Environment for Prompt Engineering",

From 8c2e80841ddcfd96bb06150a6eb9552606188b36 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 15:58:44 -0400
Subject: [PATCH 34/35] Remove old evalgen files

---
 chainforge/react-server/src/EvalGenModal.tsx  | 1665 -----------------
 .../react-server/src/OldEvalGenModal.js       | 1494 ---------------
 2 files changed, 3159 deletions(-)
 delete mode 100644 chainforge/react-server/src/EvalGenModal.tsx
 delete mode 100644 chainforge/react-server/src/OldEvalGenModal.js

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
deleted file mode 100644
index 097bb1bb9..000000000
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ /dev/null
@@ -1,1665 +0,0 @@
-// /**
-//  * EvalGen 2.0
-//  *
-//  * Ian Arawjo, Shreya Shankar, J.D. Zamf., Helen Weixu Chen
-//  *
-//  * This file concerns the front-end to evaluation generator, EvalGen.
-//  * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences.
-//  *
-//  * Specifically, the modal lets users:
-//  *  - make and refine criteria to grade on (on the left)
-//  *  - grade responses (on the right)
-//  *  - while in the backend, an LLM is generating candidate assertions and selected the ones most aligned with user grades
-//  * As the user grades responses, they add/refine existing criteria.
-//  * This modal presents a shared interface where criteria can be iterated on *alongside* grading.
-//  * This is because of **criteria drift,** a phenomenon identified observing users in EvalGen 1.0 (unreleased).
-//  *
-//  * An AI (LLM call) can also suggest criteria based on the implicit context (inputs, such as the prompt)
-//  * and user feedback during grading (written feedback about failing outputs whose failure couldn't be classified under the immediate criteria set.)
-//  */
-// import React, {
-//   ReactNode,
-//   forwardRef,
-//   useCallback,
-//   useEffect,
-//   useImperativeHandle,
-//   useMemo,
-//   useState,
-// } from "react";
-// import { v4 as uuid } from "uuid";
-// import {
-//   Accordion,
-//   ActionIcon,
-//   Box,
-//   Button,
-//   Card,
-//   Center,
-//   Checkbox,
-//   Code,
-//   Collapse,
-//   Divider,
-//   Flex,
-//   Grid,
-//   Group,
-//   Menu,
-//   Modal,
-//   Radio,
-//   RingProgress,
-//   ScrollArea,
-//   SimpleGrid,
-//   Skeleton,
-//   Stack,
-//   Text,
-//   TextInput,
-//   Textarea,
-//   Title,
-//   Tooltip,
-//   rem,
-// } from "@mantine/core";
-// import { useDisclosure } from "@mantine/hooks";
-// import {
-//   // CriteriaGradeCount,
-//   Dict,
-//   LLMResponse,
-//   PromptVarsDict,
-//   RatingDict,
-//   ResponseUID,
-// } from "./backend/typing";
-// import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
-// import {
-//   IconChevronDown,
-//   IconChevronLeft,
-//   IconChevronRight,
-//   IconDots,
-//   IconRobot,
-//   IconStarFilled,
-//   IconTerminal2,
-//   IconThumbDown,
-//   IconThumbUp,
-//   IconTrash,
-//   IconFlagFilled,
-//   IconPencil,
-//   IconSparkles,
-// } from "@tabler/icons-react";
-// import {
-//   cleanMetavarsFilterFunc,
-//   deepcopy,
-//   sampleRandomElements,
-//   transformDict,
-// } from "./backend/utils";
-// import {
-//   extractUIDFromRatingKey,
-//   getRatingKeyForResponse,
-// } from "./ResponseRatingToolbar";
-// import useStore from "./store";
-// import StorageCache from "./backend/cache";
-// import EvaluationFunctionExecutor from "./backend/evalgen/executor";
-// import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
-// import { escapeBraces } from "./backend/template";
-// import { update } from "lodash";
-// // import "./EvalGenModel.css";
-
-// const INIT_CRITERIA: EvalCriteria[] = [
-//   {
-//     shortname: "Grammatical",
-//     criteria: "The text is grammatically correct.",
-//     eval_method: "expert",
-//     uid: uuid(),
-//     priority: 0,
-//   },
-//   {
-//     shortname: "Tweet-length",
-//     criteria: "The text is less than 144 characters.",
-//     eval_method: "code",
-//     uid: uuid(),
-//     priority: 0,
-//   },
-//   {
-//     shortname: "Bombastic",
-//     criteria: "The message will drive views because it's controversial.",
-//     eval_method: "expert",
-//     uid: uuid(),
-//     priority: 0,
-//   },
-// ];
-
-// const Contributor = ({
-//   getStateValue,
-//   style = { size: 22, thickness: 4 },
-// }: {
-//   getStateValue: (id: number) => number;
-//   style: { size: number; thickness: number };
-// }) => {
-//   return (
-//     <RingProgress
-//       size={style.size}
-//       thickness={style.thickness}
-//       // label=""
-//       sections={[
-//         {
-//           value: getStateValue(1),
-//           color: "cyan",
-//           tooltip: "You have successfully contributed 7 responses.",
-//         },
-//         {
-//           value: getStateValue(2),
-//           color: "orange",
-//           tooltip: "You have successfully contributed 20 responses.",
-//         },
-//         {
-//           value: getStateValue(3),
-//           color: "green",
-//           tooltip: "You have gone to buffet 100 times.",
-//         },
-//         {
-//           value: getStateValue(4),
-//           color: "grape",
-//           tooltip: "You have made 21 nightmare",
-//         },
-//       ]}
-//     />
-//   );
-// };
-
-// const ThumbUpDownButtons = ({
-//   grade,
-//   onChangeGrade,
-//   getGradeCount,
-// }: {
-//   grade: boolean | undefined;
-//   onChangeGrade: (newGrade: boolean | undefined) => void;
-//   getGradeCount: (grade: boolean | undefined) => number;
-// }) => {
-//   // console.log(
-//   //   "getGradeCount",
-//   //   getGradeCount(true),
-//   //   getGradeCount(false),
-//   //   getGradeCount(undefined),
-//   // );
-//   return (
-//     <>
-//       {/* Thumbs up/down buttons */}
-//       <Button
-//         color={grade === true ? "green" : "gray"}
-//         m={0}
-//         p={0}
-//         variant="subtle"
-//         onClick={() => {
-//           // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
-//           if (onChangeGrade) onChangeGrade(grade === true ? undefined : true);
-//         }}
-//       >
-//         <div className="gradeContainer">
-//           <IconThumbUp size="14pt" fill={grade === true ? "#aea" : "white"} />
-//           <div className="gradeUpCount">{getGradeCount(true)}</div>
-//         </div>
-//       </Button>
-//       <Button
-//         color={grade === false ? "red" : "gray"}
-//         m={0}
-//         p={0}
-//         variant="subtle"
-//         onClick={() => {
-//           // Toggle grade: if on (true), turn 'off' (undefined, for neutral).
-//           if (onChangeGrade) onChangeGrade(grade === false ? undefined : false);
-//         }}
-//       >
-//         <div className="gradeContainer">
-//           <IconThumbDown
-//             size="14pt"
-//             fill={grade === false ? "pink" : "white"}
-//           />
-//           <div className="gradeDownCount">{getGradeCount(false)}</div>
-//         </div>
-//       </Button>
-//     </>
-//   );
-// };
-
-// export interface CriteriaCardProps {
-//   criterion: EvalCriteria;
-//   onChange: (changedCriteria: EvalCriteria) => void;
-//   onDelete: () => void;
-//   initiallyOpen?: boolean;
-//   grade: boolean | undefined;
-//   onChangeGrade: (newGrade: boolean | undefined) => void;
-//   getGradeCount: (grade: boolean | undefined) => number;
-//   getStateValue: (stateId: number) => number;
-// }
-
-// const CriteriaCard: React.FC<CriteriaCardProps> = ({
-//   criterion,
-//   onChange,
-//   onDelete,
-//   initiallyOpen,
-//   grade,
-//   getGradeCount,
-//   onChangeGrade,
-//   getStateValue,
-// }) => {
-//   const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false);
-//   const [title, setTitle] = useState(criterion.shortname ?? "New Criteria");
-
-//   return (
-//     <Stack spacing={0} ml={8}>
-//       <Flex align="center">
-//         <Group spacing="0px">
-//           {/* The arrow chevron user can click to collapse/expand
-//             <Button
-//               color="gray"
-//               p={0}
-//               m={0}
-//               variant="subtle"
-//               mr="4px"
-//               onClick={toggle}
-//             >
-//               {opened ? (
-//                 <IconChevronDown size="14pt" />
-//               ) : (
-//                 <IconChevronRight size="14pt" />
-//               )}
-//             </Button> */}
-
-//           {/* Thumbs up/down buttons */}
-//           <ThumbUpDownButtons
-//             grade={grade}
-//             onChangeGrade={onChangeGrade}
-//             getGradeCount={getGradeCount}
-//           />
-
-//           {/* Title of the criteria */}
-//           <TextInput
-//             value={title}
-//             onChange={(e) => setTitle(e.target.value)}
-//             onBlur={(e) => {
-//               criterion.shortname = e.target.value;
-//               if (onChange) onChange(criterion);
-//             }}
-//             placeholder="Criteria name"
-//             variant="unstyled"
-//             size="md"
-//             ml="xs"
-//             className="nodrag nowheel"
-//             styles={{
-//               input: {
-//                 padding: "0px",
-//                 height: "14pt",
-//                 minHeight: "0pt",
-//                 fontWeight: 500,
-//               },
-//             }}
-//           />
-//         </Group>
-
-//         <Group spacing="4px" ml="auto">
-//           {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
-//           <Tooltip
-//             label={
-//               criterion.eval_method === "code"
-//                 ? "Change to an LLM evaluator"
-//                 : "Change to a code evaluator"
-//             }
-//             withinPortal
-//             withArrow
-//           >
-//             <Text
-//               color="#999"
-//               size="sm"
-//               mr="6px"
-//               onClick={() => {
-//                 criterion.eval_method =
-//                   criterion.eval_method === "code" ? "expert" : "code";
-//                 if (onChange) onChange(criterion);
-//               }}
-//             >
-//               {criterion.eval_method === "code" ? (
-//                 <Flex style={{ userSelect: "none" }}>
-//                   <IconTerminal2 size="14pt" />
-//                   &nbsp;Python
-//                 </Flex>
-//               ) : (
-//                 <Flex style={{ userSelect: "none" }}>
-//                   <IconRobot size="14pt" />
-//                   &nbsp;LLM
-//                 </Flex>
-//               )}
-//             </Text>
-//           </Tooltip>
-
-//           {/* <Contributor getStateValue={getStateValue} /> */}
-
-//           {/* Delete button (and any other criterion-specific changes in the future) */}
-//           <ActionIcon variant="subtle" color="red" onClick={onDelete}>
-//             <IconTrash style={{ width: rem(16), height: rem(16) }} />
-//           </ActionIcon>
-//         </Group>
-//       </Flex>
-
-//       <Textarea
-//         value={criterion.criteria}
-//         placeholder="Describe here. You must describe what the criteria means before EvalGen can implement it."
-//         ml={38}
-//         onChange={(e) => {
-//           criterion.criteria = e.target.value;
-//           if (onChange) onChange(criterion);
-//         }}
-//         onClickCapture={(e) => e.stopPropagation()}
-//         styles={{
-//           input: {
-//             border: "none",
-//             borderWidth: "0px",
-//             margin: "0px",
-//             color: "#444",
-//             background: "transparent",
-//             lineHeight: 1.1,
-//           },
-//         }}
-//         autosize
-//         minRows={2}
-//         maxRows={5}
-//         fz="sm"
-//         mb="xs"
-//         c="dimmed"
-//       />
-//     </Stack>
-//   );
-// };
-
-// export interface EvalGenModalRef {
-//   trigger: (
-//     resps: LLMResponse[],
-//     setFinalReports: (reports: EvalGenReport) => void,
-//   ) => void;
-// }
-
-// const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
-//   function EvalGenModal(props, ref) {
-//     const [opened, { open, close }] = useDisclosure(false);
-//     const apiKeys = useStore((state) => state.apiKeys);
-//     const globalState = useStore((store) => store.state);
-//     const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
-//     const [criteriaForDisplay, setCriteriaForDisplay] = useState<
-//       EvalCriteria[]
-//     >([]);
-
-//     const [responses, setResponses] = useState<LLMResponse[]>([]);
-//     const [shownResponse, setShownResponse] = useState<LLMResponse | undefined>(
-//       undefined,
-//     );
-//     const [pastShownResponses, setPastShownResponses] = useState<LLMResponse[]>(
-//       [],
-//     );
-//     const [shownResponseIdx, setShownResponseIdx] = useState(0);
-
-//     const [annotation, setAnnotation] = useState<string | undefined>(undefined);
-//     const [holisticGrade, setHolisticGrade] = useState<
-//       "good" | "bad" | undefined
-//     >(undefined);
-
-//     // Per-criteria grades (indexed by uid of response, then uid of criteria)
-//     const [grades, setGrades] = useState<Dict<Dict<boolean | undefined>>>({});
-//     const setPerCriteriaGrade = (
-//       responseUID: string,
-//       criteriaUID: string,
-//       newGrade: boolean | undefined,
-//     ) => {
-//       setGrades((grades) => {
-//         if (!grades[responseUID]) grades[responseUID] = {};
-//         grades[responseUID][criteriaUID] = newGrade;
-//         // grades[responseUID] = { ...grades[responseUID] };
-//         // console.error("grades-2", grades);
-//         return { ...grades };
-//       });
-//       updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]);
-//     };
-//     const getStateValue = (stateId: number) => {
-//       return Math.floor(Math.random() * 30 + 6);
-//     };
-//     const getGradeCount = (
-//       // responseUID: string,
-//       criteriaUID: string,
-//       grade: boolean | undefined,
-//     ) => {
-//       // console.log("getGradeCount", responseUID, criteriaUID, grade);
-//       // console.log("getGradeCount", grades);
-
-//       let count = 0;
-//       for (const respUid in grades) {
-//         count += grade === grades[respUid][criteriaUID] ? 1 : 0;
-//       }
-//       return count;
-//     };
-
-//     // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
-//     const [executor, setExecutor] = useState<EvaluationFunctionExecutor | null>(
-//       null,
-//     );
-
-//     const [execProgress, setExecProgress] = useState(0);
-
-//     // State variables to keep track of GPT call counts
-//     const [numGPT4Calls, setNumGPT4Calls] = useState(0);
-//     const [numGPT35Calls, setNumGPT35Calls] = useState(0);
-//     const [logs, setLogs] = useState<{ date: Date; message: string }[]>([]);
-
-//     // For updating the global human ratings state
-//     const setState = useStore((store) => store.setState);
-//     const updateGlobalRating = useCallback(
-//       (uid: string, label: string, payload: RatingDict) => {
-//         const key = getRatingKeyForResponse(uid, label);
-//         const safe_payload = deepcopy(payload);
-//         setState(key, safe_payload);
-//         StorageCache.store(key, safe_payload);
-//       },
-//       [setState],
-//     );
-
-//     // console.error("criteria", criteria);
-
-//     // Update executor whenever resps, grades, or criteria change
-//     React.useEffect(() => {
-//       if (criteria.length > 0 && !executor) {
-//         const existingGrades = transformDict(
-//           globalState,
-//           (key) => key.startsWith("r.") && key.endsWith(".grade"),
-//           extractUIDFromRatingKey,
-//           (_, val) => {
-//             // The grades are in { idx: grade } format. Take only the first,
-//             // as we only take the first response in this iteration of EvalGen:
-//             if (typeof val !== "object") return undefined;
-//             const gs = Object.values(val);
-//             if (gs.length === 0) return undefined;
-//             return gs[0];
-//           },
-//         );
-
-//         const addLog = (message: string) => {
-//           setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
-//         };
-
-//         const ex = new EvaluationFunctionExecutor(
-//           getLikelyPromptTemplateAsContext(responses),
-//           responses,
-//           criteria,
-//           (gpt4Calls, gpt35Calls) => {
-//             // Callback to update GPT call counts
-//             setNumGPT4Calls((num) => num + gpt4Calls);
-//             setNumGPT35Calls((num) => num + gpt35Calls);
-//           },
-//           addLog,
-//           existingGrades,
-//           grades,
-//         );
-//         setExecutor(ex);
-
-//         setExecProgress(0);
-
-//         // ex.start((progress) => {
-//         //   setExecProgress(progress?.success ?? 0);
-//         // });
-//       } else if (executor) {
-//         // Update criteria in executor
-//         executor.updateCriteria(criteria);
-//       }
-
-//       updateCriteriaForDisplay();
-//     }, [criteria]);
-
-//     const generateCriteria = (resps) => {
-//       // Create criteria
-//       // setIsLoadingCriteria((num) => num + 3);
-//       genCriteriaFromContext(resps)
-//         .then((crits) => {
-//           console.log("crits #1", crits);
-//           crits = [...criteria, ...crits];
-//           console.log("crits #2", crits);
-//           setCriteria(crits.map((c) => ({ ...c, uid: uuid() })));
-//         })
-//         .catch((err) => {
-//           console.error(err);
-//         })
-//         .finally(() => {
-//           setIsLoadingCriteria((num) => num - 3);
-//           setNumGPT4Calls((num) => num + 1);
-//         });
-//     };
-
-//     // const defaultOnFinish = (reports: string) => {};
-//     const [onFinish, setOnFinish] = useState({
-//       setFinalRpts: (reports: EvalGenReport) => {
-//         // console.log("");
-//       },
-//     });
-
-//     // Open the EvalGen wizard
-//     const trigger = (
-//       resps: LLMResponse[],
-//       setFinalReports: (reports: EvalGenReport) => void,
-//     ) => {
-//       // We pass the responses here manually to ensure they remain the same
-//       // for the duration of one EvalGen operation.
-//       setResponses(resps);
-//       gotoNextScreen("response");
-//       // setFinalReports("A plenty response");
-//       setOnFinish({
-//         setFinalRpts: (reports: EvalGenReport) => {
-//           close();
-//           setFinalReports(reports);
-//         },
-//       });
-
-//       const firstGrades = resps.reduce(
-//         (acc: Dict<Dict<boolean | undefined>>, curr) => {
-//           if (!(curr.uid in acc)) acc[curr.uid] = {};
-//           return acc;
-//         },
-//         grades,
-//       );
-//       setGrades(firstGrades);
-
-//       console.log("*****************************resps", resps);
-//       if (criteria && criteria.length === 0) {
-//         generateCriteria(resps);
-//       }
-
-//       setShownResponseIdx(0);
-//       if (resps.length > 0) {
-//         const first_resp = sampleRandomElements(resps, 1)[0];
-//         // setShownResponse(first_resp);
-//         setPastShownResponses([first_resp]);
-//       } else {
-//         // setShownResponse(undefined);
-//         setPastShownResponses([]);
-//       }
-//       setShownResponse(resps[shownResponseIdx]);
-//       open();
-//     };
-//     useImperativeHandle(ref, () => ({
-//       trigger,
-//     }));
-
-//     const getLikelyPromptTemplateAsContext = (resps) => {
-//       // Attempt to infer the prompt template used to generate the responses:
-//       const prompts = new Set();
-//       for (const resp_obj of resps) {
-//         if (resp_obj?.metavars?.__pt !== undefined) {
-//           prompts.add(resp_obj.metavars.__pt);
-//         }
-//       }
-
-//       if (prompts.size === 0) return null;
-
-//       // Pick a prompt template at random to serve as context....
-//       return escapeBraces(prompts.values().next().value);
-//     };
-
-//     async function genCriteriaFromContext(responses) {
-//       // Get the context from the input responses
-//       const inputPromptTemplate = getLikelyPromptTemplateAsContext(responses);
-
-//       if (inputPromptTemplate === null) {
-//         console.error("No context found. Cannot proceed.");
-//         return;
-//       }
-
-//       // Attempt to generate criteria using an LLM
-//       return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
-//     }
-
-//     // Add a criterion
-//     const handleAddCriteria = (newCrit: EvalCriteria) => {
-//       setCriteria((cs) => {
-//         if (!newCrit.uid) newCrit.uid = uuid();
-//         return [...cs, newCrit];
-//       });
-//     };
-
-//     // Modify an existing criterion
-//     const handleChangeCriteria = (newCrit: EvalCriteria, uid: string) => {
-//       setCriteria((cs) => {
-//         const idx = cs.findIndex((c) => c.uid === uid);
-//         if (idx === -1) {
-//           console.error("Could not find criteria with uid", uid);
-//           return cs;
-//         }
-//         cs[idx] = newCrit;
-//         return [...cs];
-//       });
-//     };
-
-//     // Delete a criterion
-//     const handleDeleteCriteria = (uid: string) => {
-//       setCriteria((cs) => {
-//         return cs.filter((c) => c.uid !== uid);
-//       });
-//     };
-
-//     // Synthesize a new criteria according to the feedback given for the shown response
-//     const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
-//     const synthNewCriteriaWithLLM = (
-//       response: string,
-//       feedback: string,
-//       grade: "good" | "bad" | "unknown",
-//     ) => {
-//       // Add a loading Skeleton
-//       setIsLoadingCriteria((num) => num + 1);
-//       // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria
-//       const prettyCriteria = criteria
-//         .map((crit) => {
-//           return `${crit.shortname}: ${crit.criteria}`;
-//         })
-//         .join("\n");
-
-//       generateLLMEvaluationCriteria(
-//         "",
-//         apiKeys,
-//         `I've given some feedback on some text output. Use this feedback to decide on a single new evaluation criteria with a yes/no answer, only if the feedback isn't encompassed by existing criteria. I want you to take the criteria and output a JSON object in the format below.
-
-// TEXT OUTPUT:
-// \`\`\`
-// ${response}
-// \`\`\`
-
-// EXISTING CRITERIA:
-// \`\`\`
-// ${prettyCriteria}
-// \`\`\`
-
-// GRADE (whether text was good or bad):
-// \`\`\`
-// ${grade}
-// \`\`\`
-
-// FEEDBACK:
-// \`\`\`
-// ${feedback}
-// \`\`\`
-
-// If you determine the feedback corresponds to a new criteria, your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else. Output an empty list if there is no new evaluation criteria`, // prompt
-//         "gpt-4o", // llm
-//       )
-//         .then((evalCrits) => {
-//           // Take only the first if evalCrits has a nonempty list
-//           if (evalCrits[0]) {
-//             setCriteria((crit) =>
-//               crit.concat([
-//                 {
-//                   ...evalCrits[0],
-//                   uid: uuid(),
-//                 },
-//               ]),
-//             );
-//           }
-//           // Remove a loading Skeleton
-//           setIsLoadingCriteria((num) => num - 1);
-
-//           setNumGPT4Calls((num) => num + 1);
-//         })
-//         .catch((err) => {
-//           console.error(err);
-//           setIsLoadingCriteria((num) => num - 1);
-//         });
-//     };
-
-//     // Goto next response in the queue (skipping grading the current one)
-//     const nextResponse = () => {
-//       if (responses.length === 0) return;
-
-//       // Update annotation for current response (if any)
-//       // TODO: Fix this for generate case when num resp per prompt > 1
-
-//       if (
-//         grades[shownResponse.uid] ||
-//         holisticGrade ||
-//         (annotation && annotation.trim())
-//       ) {
-//         executor?.setGradeForExample(
-//           shownResponse.uid,
-//           grades[shownResponse.uid],
-//           holisticGrade,
-//           annotation ? annotation.trim() : null,
-//         );
-//       }
-
-//       if (
-//         shownResponse &&
-//         annotation &&
-//         typeof annotation === "string" &&
-//         annotation.trim().length > 0
-//       ) {
-//         console.log(
-//           "setting annotation for resp",
-//           shownResponse.uid,
-//           annotation,
-//         );
-//         updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
-//         setAnnotation("");
-//       }
-
-//       if (shownResponse && holisticGrade) {
-//         updateGlobalRating(shownResponse.uid, "grade", {
-//           0: holisticGrade === "good",
-//         });
-//       }
-
-//       if (shownResponse && grades[shownResponse.uid]) {
-//         updateGlobalRating(
-//           shownResponse.uid,
-//           "perCriteriaGrades",
-//           grades[shownResponse.uid],
-//         );
-//       }
-
-//       // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work.
-//       setHolisticGrade(null);
-
-//       if (shownResponseIdx < pastShownResponses.length - 1) {
-//         // If we are not at the end of the history of shown responses, then show the next response:
-//         setShownResponse(pastShownResponses[shownResponseIdx + 1]);
-//         setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
-//       } else {
-//         // We are at the end of the history; pick the next response off the stack:
-//         // TODO: Make this unique (maybe by removing picked responses from the list!)
-//         let num_tries = 3;
-//         let next_resp = executor?.getNextExampleToGrade();
-//         while (
-//           num_tries > 0 &&
-//           (!next_resp ||
-//             pastShownResponses.some((r) => r.uid === next_resp?.uid))
-//         ) {
-//           // We're presenting a response that's already been shown. Try again.
-//           // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
-//           if (next_resp && num_tries === 3)
-//             next_resp =
-//               executor?.getNextExampleToGrade() ??
-//               sampleRandomElements(responses, 1)[0];
-//           // Otherwise we just choose a response at random:
-//           else next_resp = sampleRandomElements(responses, 1)[0];
-//           num_tries -= 1;
-//         }
-//         // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
-//         // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
-//         setShownResponse(next_resp ?? undefined);
-//         if (next_resp)
-//           setPastShownResponses(pastShownResponses.concat(next_resp));
-//         setShownResponseIdx(pastShownResponses.length);
-//       }
-//       updateShownResponseUniqueIndex();
-//     };
-
-//     // Go back to previously shown response
-//     const prevResponse = () => {
-//       if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
-//       setShownResponse(pastShownResponses[shownResponseIdx - 1]);
-//       setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
-//       updateShownResponseUniqueIndex();
-//     };
-
-//     const updateShownResponseUniqueIndex = () => {
-//       let idx = 0;
-//       for (const resp of responses) {
-//         if (resp === shownResponse) {
-//           setShownResponseUniqueIdx(idx);
-//           break;
-//         }
-//         idx++;
-//       }
-//     };
-
-//     const nextResponse2 = () => {
-//       if (responses.length === 0) return;
-//       if (shownResponseIdx < responses.length - 1) {
-//         // setShownResponse(responses[shownResponseIdx + 1]);
-//         setShownResponseIdx(shownResponseIdx + 1);
-//       }
-//     };
-
-//     const prevResponse2 = () => {
-//       if (shownResponseIdx > 0) {
-//         // setShownResponse(responses[shownResponseIdx - 1]);
-//         setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
-//       }
-//     };
-
-//     React.useEffect(() => {
-//       setShownResponse(responses[shownResponseIdx]);
-//     }, [shownResponseIdx]);
-
-//     const estimateGPTCalls = () => {
-//       return executor
-//         ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
-//         : "# estimated GPT calls not available.";
-//     };
-
-//     const updateCriteriaForDisplay = () => {
-//       const highCriteria = criteria.filter((c) => c.priority === 1);
-//       const lowCriteria = criteria.filter((c) => c.priority === 0);
-//       setCriteriaForDisplay(highCriteria.concat(lowCriteria));
-//     };
-//     useEffect(() => {
-//       const highCriteria = criteria.filter((c) => c.priority === 1);
-//       const lowCriteria = criteria.filter((c) => c.priority === 0);
-//       setCriteriaForDisplay(highCriteria.concat(lowCriteria));
-//     }, [criteria]);
-
-//     const [screen, setScreen] = useState("");
-//     const gotoNextScreen = (screenName: string) => {
-//       setScreen(screenName);
-//     };
-
-//     // const [onFinish, setOnFinish] = useState(null);
-
-//     return (
-//       <Modal
-//         size="95%"
-//         keepMounted
-//         opened={opened}
-//         onClose={close}
-//         closeOnClickOutside={true}
-//         style={{ position: "relative", left: "-5%" }}
-//       >
-//         {screen === "response" && (
-//           <Grid h={window?.innerHeight * 0.8}>
-//             <Grid.Col span={8}>
-//               <Stack justify="space-between">
-//                 {/* View showing the response the user is currently grading */}
-//                 <GradingView
-//                   shownResponse={shownResponse}
-//                   shownResponseIdx={shownResponseIdx}
-//                   // shownResponseIdx={shownResponseUniqueIdx}
-//                   responseCount={responses.length}
-//                   numGPT4Calls={numGPT4Calls}
-//                   numGPT35Calls={numGPT35Calls}
-//                   logs={logs}
-//                   gotoNextResponse={nextResponse2}
-//                   gotoPrevResponse={prevResponse2}
-//                   estimateGPTCalls={estimateGPTCalls}
-//                   gotoNextScreen={gotoNextScreen}
-//                 />
-
-//                 {/* Progress bar */}
-//                 {/* <Flex justify="left" align="center" gap="md">
-//                 <Stack w="100%" spacing={4}>
-//                   <Text color="#aaa" size="sm">
-//                     {bottomBar.progressLabel}
-//                   </Text>
-//                   <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
-//                 </Stack>
-
-//                 <Button
-//                   onClick={handleDone}
-//                   variant={bottomBar.buttonStyle}
-//                   disabled={bottomBar.buttonDisabled}
-//                 >
-//                   {bottomBar.buttonLabel}
-//                 </Button>
-//               </Flex> */}
-//               </Stack>
-//             </Grid.Col>
-//             <Grid.Col span={4} bg="#eee" pt="16px" h="100%">
-//               <Center>
-//                 <Title order={3} ml={8} mt="sm" mb="md">
-//                   Rubric
-//                 </Title>
-//               </Center>
-
-//               <div
-//                 style={{
-//                   display: "flex",
-//                   flexDirection: "column",
-//                 }}
-//               >
-//                 <div style={{ flex: 2, overflowY: "auto" }}>
-//                   {criteriaForDisplay.map((e) => (
-//                     <CriteriaCard
-//                       criterion={e}
-//                       key={e.uid}
-//                       onChange={(newCrit) =>
-//                         handleChangeCriteria(newCrit, e.uid)
-//                       }
-//                       onDelete={() => handleDeleteCriteria(e.uid)}
-//                       grade={
-//                         shownResponse
-//                           ? grades[shownResponse.uid][e.uid]
-//                           : undefined
-//                       }
-//                       getGradeCount={(grade) => {
-//                         return shownResponse
-//                           ? getGradeCount(
-//                               // shownResponse.uid,
-//                               e.uid,
-//                               grade,
-//                             )
-//                           : 0;
-//                       }}
-//                       onChangeGrade={(newGrade) => {
-//                         if (shownResponse)
-//                           setPerCriteriaGrade(
-//                             shownResponse.uid,
-//                             e.uid,
-//                             newGrade,
-//                           );
-//                       }}
-//                       initiallyOpen={true}
-//                       getStateValue={(stateId) => getStateValue(stateId)}
-//                     />
-//                   ))}
-//                   {isLoadingCriteria > 0 ? (
-//                     Array.from(
-//                       { length: isLoadingCriteria },
-//                       (v: unknown, idx: number) => (
-//                         <Skeleton key={idx} h={80} mb={4} />
-//                       ),
-//                     )
-//                   ) : (
-//                     <></>
-//                   )}
-
-//                   <div className="criteriaButtons">
-//                     <Button
-//                       leftIcon={<IconPencil size={14} />}
-//                       variant="subtle"
-//                       color="gray"
-//                       // gradient={{ from: "blue", to: "green", deg: 90 }}
-//                       onClick={() => {
-//                         handleAddCriteria({
-//                           shortname: "New Criteria",
-//                           criteria: "",
-//                           eval_method: "code",
-//                           priority: 0,
-//                           uid: uuid(),
-//                         });
-//                       }}
-//                     >
-//                       Add a new criteria
-//                     </Button>
-//                     <Button
-//                       leftIcon={<IconSparkles size={14} />}
-//                       variant="subtle"
-//                       color="gray"
-//                       // gradient={{ from: "blue", to: "green", deg: 90 }}
-//                       onClick={() => {
-//                         generateCriteria(responses);
-//                       }}
-//                     >
-//                       Suggest Criteria
-//                     </Button>
-//                   </div>
-//                 </div>
-
-//                 <Stack spacing="0px" pl="xs" pr="lg" style={{ flex: 1 }}>
-//                   <Divider mt="lg" />
-//                   <Title mb="0px" order={4}>
-//                     Suggest New Criteria Based on the Feedback
-//                   </Title>
-//                   <Textarea
-//                     value={annotation}
-//                     onChange={(e) => setAnnotation(e.target.value)}
-//                     description="How good is this response? Explain anything not captured under your existing criteria. Your feedback will be used to generate new criteria."
-//                     mb="sm"
-//                   />
-//                   <Radio.Group
-//                     name="favoriteFramework"
-//                     label="Rate the response holistically:"
-//                     value={holisticGrade}
-//                     onChange={(v) => setHolisticGrade(v as "good" | "bad")}
-//                     withAsterisk
-//                     mb="md"
-//                   >
-//                     <Group mt="xs">
-//                       <Radio value="good" label="Good" />
-//                       <Radio value="bad" label="Bad" />
-//                       <span>
-//                         &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
-//                       </span>
-//                       <Button
-//                         color="green"
-//                         variant="filled"
-//                         disabled={
-//                           !holisticGrade ||
-//                           annotation === undefined ||
-//                           annotation.length === 0
-//                         }
-//                         onClick={() => {
-//                           synthNewCriteriaWithLLM(
-//                             shownResponse?.responses[0].toString() ?? "",
-//                             annotation ?? "",
-//                             holisticGrade ?? "unknown",
-//                           );
-
-//                           nextResponse();
-//                         }}
-//                       >
-//                         + Submit Feedback
-//                       </Button>
-//                     </Group>
-//                   </Radio.Group>
-//                 </Stack>
-//               </div>
-//             </Grid.Col>
-//           </Grid>
-//         )}
-//         {screen === "report" && (
-//           <Grid>
-//             <ReportCardView
-//               report={{
-//                 criteria: criteria,
-//                 failureCoverage: 99.2,
-//                 falseFailureRate: 66.7,
-//               }}
-//               onFinish={(reports: EvalGenReport) => {
-//                 onFinish.setFinalRpts(reports);
-//               }}
-//               getGradeCount={(crit: EvalCriteria, grade: boolean) => {
-//                 return shownResponse
-//                   ? getGradeCount(
-//                       // shownResponse.uid,
-//                       crit.uid,
-//                       grade,
-//                     )
-//                   : 0;
-//               }}
-//               getStateValue={(stateId) => getStateValue(stateId)}
-//             />
-//           </Grid>
-//         )}
-//       </Modal>
-//     );
-//   },
-// );
-
-// const HeaderText = ({ children }: { children: ReactNode }) => {
-//   return (
-//     <Text size="xl" fw={500} pl="sm" mb="lg">
-//       {children}
-//     </Text>
-//   );
-// };
-
-// interface GradingViewProps {
-//   shownResponse: LLMResponse | undefined;
-//   shownResponseIdx: number;
-//   responseCount: number;
-//   numGPT4Calls: number;
-//   numGPT35Calls: number;
-//   logs: { date: Date; message: string }[];
-//   gotoPrevResponse: () => void;
-//   gotoNextResponse: () => void;
-//   estimateGPTCalls: () => string;
-//   gotoNextScreen: (screenName: string) => void;
-// }
-
-// const GradingView: React.FC<GradingViewProps> = ({
-//   shownResponse,
-//   shownResponseIdx,
-//   responseCount,
-//   numGPT4Calls,
-//   numGPT35Calls,
-//   logs,
-//   gotoPrevResponse,
-//   gotoNextResponse,
-//   estimateGPTCalls,
-//   gotoNextScreen,
-// }) => {
-//   // Calculate inner values only when shownResponse changes
-//   const responseText = useMemo(
-//     () =>
-//       shownResponse && shownResponse.responses?.length > 0
-//         ? shownResponse.responses[0].toString()
-//         : "",
-//     [shownResponse],
-//   );
-
-//   const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
-//   const varsDivs = useMemo(() => {
-//     const combined_vars_metavars = shownResponse
-//       ? {
-//           ...shownResponse.vars,
-//           ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
-//         }
-//       : {};
-
-//     // console.log("**************shownResponse", shownResponse);
-//     return Object.entries(combined_vars_metavars).map(([varname, val]) => (
-//       <div key={varname} className="grade-resp-var-container">
-//         <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
-//         <span className="response-var-value linebreaks">{val}</span>
-//       </div>
-//     ));
-//   }, [shownResponse]);
-
-//   // const [shownResponseIdx, setShownResponseIdx] = useState(0);
-//   // const [shownResponses, setShownResponses] = useState<LLMResponse[]>([]);
-//   // React.useEffect(() => {
-//   //   console.error("current response", shownResponse);
-//   //   if (shownResponse && !shownResponses.includes(shownResponse)) {
-//   //     shownResponses.push(shownResponse);
-//   //     setShownResponses(shownResponses);
-//   //     setShownResponseIdx(shownResponses.length - 1);
-//   //     console.error("current response is saved.", shownResponses.length);
-//   //   } else {
-//   //     console.error("current response already saved.");
-//   //     for (const [idx, resp] of shownResponses.entries()) {
-//   //       if (shownResponse === resp) {
-//   //         setShownResponseIdx(idx);
-//   //         break;
-//   //       }
-//   //     }
-//   //   }
-//   // }, [shownResponse]);
-
-//   return (
-//     <Stack justify="space-between" mih={500}>
-//       <Box>
-//         {/* Top header */}
-//         <Flex justify="center">
-//           <HeaderText>
-//             {/* What do you think of this response? */}
-//             What do you think of response #{shownResponseIdx + 1} of{" "}
-//             {responseCount}?
-//           </HeaderText>
-//         </Flex>
-//         {/* Middle response box with chevron buttons < and > for going back and forward a response */}
-//         <Flex justify="center" align="center" mb="sm">
-//           {/* Go back to previous response */}
-//           <Button variant="white" color="dark" onClick={gotoPrevResponse}>
-//             <IconChevronLeft />
-//           </Button>
-
-//           {/* The response one is currently grading */}
-//           <div
-//             className="response-box"
-//             style={{
-//               backgroundColor: "#eee",
-//               width: "80%",
-//               maxHeight: "340px",
-//               overflowY: "scroll",
-//               borderColor: "black",
-//               borderStyle: "solid",
-//             }}
-//           >
-//             <div className="response-item-llm-name-wrapper">
-//               <div
-//                 className="small-response"
-//                 style={{ fontSize: "11pt", padding: "12pt" }}
-//               >
-//                 {responseText}
-//               </div>
-//             </div>
-//           </div>
-
-//           {/* Go forward to the next response */}
-//           <Tooltip label={estimateGPTCalls()} withArrow>
-//             <Button variant="white" color="dark" onClick={gotoNextResponse}>
-//               <IconChevronRight />
-//             </Button>
-//           </Tooltip>
-//         </Flex>
-//         {/* Views for the vars (inputs) that generated this response, and the concrete prompt */}
-//         <Flex justify="center" mb="xl" gap="lg">
-//           <div
-//             style={{
-//               backgroundColor: "#fff",
-//               padding: "12px",
-//               width: "31%",
-//               borderRadius: "12px",
-//               borderWidth: "1px",
-//               borderStyle: "solid",
-//             }}
-//           >
-//             Vars
-//             <hr />
-//             <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
-//               {varsDivs}
-//             </div>
-//           </div>
-//           <div
-//             style={{
-//               backgroundColor: "#fff",
-//               padding: "12px",
-//               width: "41%",
-//               borderRadius: "2px",
-//             }}
-//           >
-//             Prompt
-//             <hr />
-//             <div
-//               className="monofont linebreaks"
-//               style={{
-//                 maxHeight: "160px",
-//                 overflowY: "scroll",
-//                 fontSize: "10pt",
-//                 lineHeight: "1.2",
-//               }}
-//             >
-//               {prompt}
-//             </div>
-//           </div>
-//         </Flex>
-//         <Flex direction="column">
-//           <Flex justify="space-between" align="center">
-//             <Text size="lg" weight={500} mb="sm">
-//               LLM Activity
-//             </Text>
-//             {/* GPT Call Tally */}
-//             <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
-//               Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
-//               GPT-3.5-Turbo-16k calls.
-//             </Text>
-//           </Flex>
-//           <div
-//             style={{
-//               backgroundColor: "#f0f0f0",
-//               color: "#333",
-//               fontFamily: "monospace",
-//               padding: "12px",
-//               width: "calc(100% - 30px)",
-//               height: "200px",
-//               overflowY: "auto",
-//               borderRadius: "8px",
-//               border: "1px solid #ddd",
-//               marginRight: "20px", // Space on the right
-//             }}
-//             ref={(el) => {
-//               if (el) {
-//                 el.scrollTop = el.scrollHeight;
-//               }
-//             }}
-//           >
-//             {logs.map((log, index) => (
-//               <div key={index}>
-//                 <span style={{ color: "#4A90E2" }}>
-//                   {log.date.toLocaleString()} -{" "}
-//                 </span>
-//                 <span>{log.message}</span>
-//               </div>
-//             ))}
-//           </div>
-//         </Flex>
-//       </Box>
-//       <div>
-//         <Center>
-//           <Button
-//             leftIcon={<IconSparkles size={14} />}
-//             variant="gradient"
-//             gradient={{ from: "blue", to: "green", deg: 45 }}
-//             onClick={() => {
-//               // console.log("(3) gotoNextScreen", gotoNextScreen);
-//               gotoNextScreen("report");
-//             }}
-//           >
-//             I&apos;m done. Access EvalGen Report!
-//           </Button>
-//         </Center>
-//       </div>
-//     </Stack>
-//   );
-// };
-
-// interface ReportCardViewProps {
-//   report: EvalGenReport;
-//   // recomputeAlignment,
-//   onFinish: (reports: EvalGenReport) => void;
-//   getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
-//   getStateValue: (stateId: number) => number;
-// }
-
-// // const ReportCardScreen = () => {
-// const ReportCardView: React.FC<ReportCardViewProps> = ({
-//   report,
-//   // recomputeAlignment,
-//   onFinish,
-//   getGradeCount,
-//   getStateValue,
-// }) => {
-//   // The criteria cards, now with report information
-
-//   const [finalReport, setFinalReport] = useState(report);
-
-//   const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
-//     if (isSelected) {
-//       finalReport.criteria.push(criterion);
-//     } else {
-//       finalReport.criteria = finalReport.criteria.filter(
-//         (c) => c !== criterion,
-//       );
-//     }
-//     setFinalReport(finalReport);
-//   };
-//   const cards = useMemo(() => {
-//     const res = [];
-
-//     // Iterate through selected eval functions and create cards
-//     // for (const selectedFunc of report.selectedEvalFunctions) {
-//     //   const crit = selectedFunc.evalCriteria;
-//     //   // Find corresponding report in allEvalFunctionReports map from criteria to list
-//     //   const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
-//     //   const evalFuncReport = critEvalFuncReports.find(
-//     //     (rep) => rep.evalFunction === selectedFunc,
-//     //   );
-
-//     //   // Get the functions that were not selected for this criteria
-//     //   const otherFuncs = critEvalFuncReports.filter(
-//     //     (rep) => rep.evalFunction !== selectedFunc,
-//     //   );
-//     for (const crit of report.criteria) {
-//       res.push(
-//         <ReportCriteriaCard
-//           criterion={crit}
-//           key={crit.uid}
-//           // onCheck={(checked) => {
-//           //   crit.selected = checked;
-//           //   recomputeAlignment();
-//           // }}
-//           getGradeCount={getGradeCount}
-//           getStateValue={getStateValue}
-//           onSelect={onSelect}
-//         />,
-//       );
-//     }
-
-//     return res;
-//   }, [report]);
-
-//   return (
-//     report && (
-//       <div>
-//         <Text align="center" size="lg" pl="sm" mb="lg">
-//           Chosen Functions and Alignment
-//         </Text>
-
-//         {/* Show coverage and false failure rate numbers */}
-//         <Flex justify="center" gap="md" mb="lg">
-//           <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
-//             <Card
-//               shadow="sm"
-//               padding="md"
-//               radius="md"
-//               style={{ backgroundColor: "#f0f0f0" }}
-//             >
-//               <Text weight={500} size="md">
-//                 Coverage of Bad Responses
-//               </Text>
-//               <Text color="blue" weight={700} size="md">
-//                 {report.failureCoverage.toFixed(2)}%
-//               </Text>
-//             </Card>
-//             <Card
-//               shadow="sm"
-//               padding="md"
-//               radius="md"
-//               style={{ backgroundColor: "#f0f0f0" }}
-//             >
-//               <Text weight={500} size="md">
-//                 False Failure Rate
-//               </Text>
-//               <Text color="red" weight={700} size="md">
-//                 {report.falseFailureRate.toFixed(2)}%
-//               </Text>
-//             </Card>
-//           </Group>
-//         </Flex>
-
-//         <ScrollArea mih={300} h={500} mah={500}>
-//           <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
-//             {cards}
-//           </SimpleGrid>
-//         </ScrollArea>
-
-//         <Flex justify="center" gap={12} mt="xs">
-//           <Button
-//             onClick={() => {
-//               // console.log("finalReport", finalReport);
-//               onFinish(finalReport);
-//             }}
-//           >
-//             Finish with selected evaluators
-//           </Button>
-//         </Flex>
-//       </div>
-//     )
-//   );
-// };
-
-// interface ReportCriteriaCardProps {
-//   criterion: EvalCriteria;
-//   // onChange: (changedCriteria: EvalCriteria) => void;
-//   // onDelete: () => void;
-//   // initiallyOpen?: boolean;
-//   // grade: boolean | undefined;
-//   // onChangeGrade: (newGrade: boolean | undefined) => void;
-//   getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
-//   getStateValue: (stateId: number) => number;
-//   onSelect: (criterion: EvalCriteria, isChecked: boolean) => void;
-// }
-
-// const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
-//   criterion,
-//   // onChange,
-//   // onDelete,
-//   // initiallyOpen,
-//   // grade,
-//   getGradeCount,
-//   // onChangeGrade,
-//   getStateValue,
-//   onSelect,
-// }) => {
-//   // const [opened, { toggle }] = useDisclosure(true);
-//   // const [title, setTitle] = useState(criterion.shortname);
-//   const [checked, setChecked] = useState(true);
-
-//   // Simulates eval functions that are expected to be passed in later on (TODO)
-//   const evalFuncs = [
-//     { evalFunction: { code: "To be provided (1) ..." } },
-//     { evalFunction: { code: "To be provided (2) ..." } },
-//     { evalFunction: { code: "To be provided (3) ..." } },
-//   ];
-//   const unselectedImplementations = evalFuncs.map((item) => (
-//     <div key={uuid()}>
-//       <Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
-//         {item.evalFunction.code}
-//       </Code>
-//       <Divider />
-//     </div>
-//   ));
-
-//   return (
-//     // <Card withBorder mb={4} radius="md" style={{ cursor: "default" }}>
-//     <Card
-//       shadow="sm"
-//       padding="sm"
-//       pl="md"
-//       pb="xl"
-//       radius="md"
-//       withBorder
-//       style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
-//     >
-//       <div
-//         // onClick={() => setChecked(!checked)}
-//         onKeyUp={(e) => e.preventDefault()}
-//         className="checkcard"
-//       >
-//         {/* <Card.Section withBorder pl="8px">
-//           <Flex align="center">
-//             <Group spacing="0px"> */}
-//         {/* The arrow chevron user can click to collapse/expand */}
-//         {/* <Button
-//                 color="gray"
-//                 p={0}
-//                 m={0}
-//                 variant="subtle"
-//                 mr="4px"
-//                 onClick={toggle}
-//               >
-//                 {opened ? (
-//                   <IconChevronDown size="14pt" />
-//                 ) : (
-//                   <IconChevronRight size="14pt" />
-//                 )}
-//               </Button> */}
-
-//         <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
-//           <Checkbox
-//             checked={checked}
-//             onChange={() => {
-//               setChecked(!checked);
-//               if (onSelect) onSelect(criterion, !checked);
-//             }}
-//             tabIndex={-1}
-//             size="xs"
-//             mr="sm"
-//             mt="xs"
-//             styles={{ input: { cursor: "pointer" } }}
-//             aria-hidden
-//           />
-//         </Tooltip>
-
-//         {/* Thumbs up/down buttons - disable for now */}
-//         {/* <ReadOnlyThumbUpDownButtons
-//                   upCount={getGradeCount(criterion, true)}
-//                   downCount={getGradeCount(criterion, false)}
-//                 /> */}
-
-//         <div style={{ width: "100%" }}>
-//           {/* Title of the criteria */}
-//           <TextInput
-//             value={criterion.shortname}
-//             // placeholder="Criteria name"
-//             readOnly
-//             variant="unstyled"
-//             size="sm"
-//             ml="xs"
-//             className="nodrag nowheel"
-//             styles={{
-//               input: {
-//                 border: "none",
-//                 borderWidth: "0px",
-//                 padding: "0px",
-//                 background: "transparent",
-//                 fontWeight: 500,
-//                 fontSize: "12pt",
-//                 margin: "0px",
-//                 height: "auto",
-//                 minHeight: "auto",
-//               },
-//             }}
-//           />
-//           {/* </Group> */}
-
-//           {/* <Group spacing="4px" ml="auto"> */}
-
-//           {/* <Button
-//                   color={criterion.priority <= 0 ? "gray" : "red"}
-//                   m={0}
-//                   p={0}
-//                   variant="subtle"
-//                 >
-//                   <IconFlagFilled size="14pt" />
-//                 </Button> */}
-//           {/* </Group>
-//             </Flex>
-//           </Card.Section> */}
-
-//           {/* Description of the criteria */}
-//           {/* <Card.Section p="0px"> */}
-//           {/* <Collapse in={opened}> */}
-//           <Textarea
-//             value={criterion.criteria}
-//             // placeholder="Describe here."
-//             readOnly
-//             // onClickCapture={(e) => e.stopPropagation()}
-//             styles={{
-//               input: {
-//                 border: "none",
-//                 borderWidth: "0px",
-//                 paddingTop: "0px !important",
-//                 paddingLeft: "0px",
-//                 margin: "0px",
-//                 color: "#444",
-//                 background: "transparent",
-//                 lineHeight: 1.1,
-//               },
-//             }}
-//             autosize
-//             minRows={2}
-//             maxRows={5}
-//             fz="sm"
-//             mb="xs"
-//             c="dimmed"
-//           />
-
-//           {/* Whether this criteria should be implemented with code (function) or an LLM evaluator */}
-//           <Text color="#999" size="sm" mr="6px">
-//             {criterion.eval_method === "code" ? (
-//               <Flex style={{ userSelect: "none" }}>
-//                 <IconTerminal2 size="14pt" />
-//                 &nbsp;Python
-//               </Flex>
-//             ) : (
-//               <Flex style={{ userSelect: "none" }}>
-//                 <IconRobot size="14pt" />
-//                 &nbsp;LLM
-//               </Flex>
-//             )}
-//           </Text>
-//         </div>
-//         <Stack spacing={0}>
-//           <Contributor
-//             getStateValue={getStateValue}
-//             style={{ size: 90, thickness: 12 }}
-//           />
-//           <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
-//             Alignment with your grades
-//           </Text>
-//         </Stack>
-//       </div>
-//       {/* </Collapse> */}
-//       {/* </Card.Section> */}
-//       <div>
-//         <Accordion>
-//           <Accordion.Item
-//             key={"Show Bad Implementations"}
-//             value={"Show Bad Implementations"}
-//           >
-//             <Accordion.Control>
-//               <Text size="sm"> Show Bad Implementations </Text>
-//             </Accordion.Control>
-//             <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
-//           </Accordion.Item>
-//         </Accordion>
-//       </div>
-//     </Card>
-//   );
-// };
-
-// const ReadOnlyThumbUpDownButtons = ({
-//   upCount,
-//   downCount,
-// }: {
-//   upCount: number;
-//   downCount: number;
-//   // grade: boolean | undefined;
-//   // onChangeGrade: (newGrade: boolean | undefined) => void;
-//   // getGradeCount: (grade: boolean | undefined) => number;
-// }) => {
-//   return (
-//     <>
-//       {/* Thumbs up/down buttons */}
-//       <Button color={"green"} m={0} p={0} variant="subtle">
-//         <div className="gradeContainer">
-//           <IconThumbUp size="14pt" fill={"#aea"} />
-//           <div className="gradeUpCount">{upCount}</div>
-//         </div>
-//       </Button>
-//       <Button color={"red"} m={0} p={0} variant="subtle">
-//         <div className="gradeContainer">
-//           <IconThumbDown size="14pt" fill={"pink"} />
-//           <div className="gradeDownCount">{downCount}</div>
-//         </div>
-//       </Button>
-//     </>
-//   );
-// };
-
-// // export default { EvalGenModal, ReportCardScreen };
-// export default EvalGenModal;
diff --git a/chainforge/react-server/src/OldEvalGenModal.js b/chainforge/react-server/src/OldEvalGenModal.js
deleted file mode 100644
index a2e9b77ac..000000000
--- a/chainforge/react-server/src/OldEvalGenModal.js
+++ /dev/null
@@ -1,1494 +0,0 @@
-import React, {
-  forwardRef,
-  useImperativeHandle,
-  useState,
-  useMemo,
-  useEffect,
-  useCallback,
-} from "react";
-import { v4 as uuid } from "uuid";
-import Plot from "react-plotly.js";
-import {
-  SimpleGrid,
-  Card,
-  Modal,
-  Text,
-  Button,
-  UnstyledButton,
-  Textarea,
-  TextInput,
-  Flex,
-  Progress,
-  ScrollArea,
-  useMantineTheme,
-  Loader,
-  Switch,
-  Stack,
-  Box,
-  Space,
-  Center,
-  Tooltip,
-  Skeleton,
-  RingProgress,
-  Checkbox,
-  Popover,
-  Group,
-  Collapse,
-  Code,
-  Accordion,
-  Divider,
-} from "@mantine/core";
-import { useDisclosure } from "@mantine/hooks";
-import {
-  IconChevronLeft,
-  IconChevronRight,
-  IconCode,
-  IconPencil,
-  IconRepeat,
-  IconRobot,
-  IconSparkles,
-  IconThumbDown,
-  IconThumbUp,
-  IconTrash,
-} from "@tabler/icons-react";
-import ConfettiExplosion from "react-confetti-explosion";
-import {
-  cleanMetavarsFilterFunc,
-  deepcopy,
-  sampleRandomElements,
-  transformDict,
-} from "./backend/utils";
-import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils";
-import { escapeBraces } from "./backend/template";
-import EvaluationFunctionExecutor from "./backend/evalgen/executor";
-import {
-  extractUIDFromRatingKey,
-  getRatingKeyForResponse,
-} from "./ResponseRatingToolbar";
-import useStore from "./store";
-import { DEFAULT_LLM_EVAL_MODEL } from "./LLMEvalNode";
-import StorageCache from "./backend/cache";
-
-const MANTINE_GREEN = "#40c057";
-const SELECT_EVAL_FUNC_THRESHOLD = 0.4;
-
-const HeaderText = ({ children }) => {
-  return (
-    <Text size="xl" fw={500} pl="sm" mb="lg">
-      {children}
-    </Text>
-  );
-};
-
-const evalgenReportToImplementations = (report) => {
-  // Convert to expected format by MultiEval node
-  const specs = report.selectedEvalFunctions.map((evalFuncSpec) => {
-    // Skip if evalFuncSpec.evalCriteria.selected is false
-    if (evalFuncSpec.evalCriteria.selected === false) return null;
-
-    if (evalFuncSpec.evalCriteria.eval_method === "code")
-      return {
-        name: evalFuncSpec.evalCriteria.shortname,
-        type: "python", // for now, only generates Python
-        state: {
-          code: evalFuncSpec.code,
-        },
-      };
-    else
-      return {
-        name: evalFuncSpec.evalCriteria.shortname,
-        type: "llm",
-        state: {
-          prompt: evalFuncSpec.code,
-          grader: deepcopy(DEFAULT_LLM_EVAL_MODEL),
-          format: "bin", // for now, only boolean assertions
-        },
-      };
-  });
-
-  return specs.filter((s) => s !== null);
-};
-
-const accuracyToColor = (acc) => {
-  if (acc > 0.9) return "green";
-  else if (acc > 0.7) return "yellow";
-  else if (acc > 0.5) return "orange";
-  else return "red";
-};
-
-const cmatrixTextAnnotations = (x, y, z) => {
-  const annotations = [];
-  const midVal = Math.max(...z.flat());
-  for (let i = 0; i < y.length; i++) {
-    for (let j = 0; j < x.length; j++) {
-      annotations.push({
-        xref: "x1",
-        yref: "y1",
-        x: x[j],
-        y: y[i],
-        text: z[i][j],
-        font: {
-          // family: "monospace",
-          // size: 12,
-          color: z[i][j] < midVal ? "white" : "black",
-        },
-        showarrow: false,
-      });
-    }
-  }
-  return annotations;
-};
-
-/** Example flows to help users get started and see what CF can do */
-const CriteriaCard = function CriteriaCard({
-  title,
-  description,
-  evalMethod,
-  onTitleChange,
-  onDescriptionChange,
-  onEvalMethodChange,
-  onRemove,
-  reportMode,
-  evalFuncReport,
-  onCheck,
-  otherFuncs,
-}) {
-  const [checked, setChecked] = useState(true);
-  const [codeChecked, setCodeChecked] = useState(evalMethod === "code");
-  const theme = useMantineTheme();
-
-  // Report card specific
-  const [openedCMatrix, { close: closeCMatrix, open: openCMatrix }] =
-    useDisclosure(false);
-  const [viewedCode, { close: closeViewedCode, open: openViewedCode }] =
-    useDisclosure(false);
-  const [openedOtherFuncs, { toggleOtherFuncs }] = useDisclosure(false);
-  const cMatrixPlot = useMemo(() => {
-    if (!evalFuncReport) return undefined;
-    const x = ["Pred.<br>fail", "Pred.<br>pass"];
-    const y = ["Human<br>pass", "Human<br>fail"];
-    const z = [
-      [evalFuncReport.false_fail, evalFuncReport.true_pass],
-      [evalFuncReport.true_fail, evalFuncReport.false_pass],
-    ];
-    return (
-      <Plot
-        data={[
-          {
-            z,
-            x,
-            y,
-            xgap: 2,
-            ygap: 2,
-            type: "heatmap",
-            hoverongaps: false,
-            colorscale: "Blues",
-            showscale: false,
-            showlegend: false,
-          },
-        ]}
-        layout={{
-          width: 160,
-          height: 160,
-          margin: { t: 10, b: 40, l: 50, r: 0 },
-          annotations: cmatrixTextAnnotations(x, y, z),
-        }}
-      />
-    );
-  }, [evalFuncReport]);
-  const reportAccuracyRing = useMemo(() => {
-    if (!evalFuncReport) return undefined;
-    return {
-      percent: Math.floor(evalFuncReport.alignment * 100),
-      color: accuracyToColor(evalFuncReport.alignment),
-    };
-  }, [evalFuncReport]);
-
-  // Update the checkbox whenever the evalFuncReport changes,
-  // ticking it if the accuracy is over the threshold.
-  // useEffect(() => {
-  //   if (!evalFuncReport) return;
-  //   setChecked(evalFuncReport.accuracy >= SELECT_EVAL_FUNC_THRESHOLD);
-  // }, [evalFuncReport]);
-
-  const setCheckedAndRealign = (newChecked) => {
-    setChecked(newChecked);
-
-    // oncheck is a callback to the parent to update the selected eval functions
-    // oncheck is an awaitable function
-    if (onCheck && evalFuncReport) onCheck(newChecked);
-  };
-
-  const unselectedImplementations =
-    otherFuncs !== undefined && otherFuncs.length > 0
-      ? otherFuncs.map((item) => (
-          <div key={uuid()}>
-            <Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
-              {item.evalFunction.code}
-            </Code>
-            <Divider />
-          </div>
-        ))
-      : null;
-
-  return (
-    <Card
-      shadow="sm"
-      padding="sm"
-      pl="md"
-      pb="xl"
-      radius="md"
-      withBorder
-      style={{ backgroundColor: checked ? "#f2f7fc" : "#fff" }}
-    >
-      <div
-        // onClick={() => setChecked(!checked)}
-        onKeyUp={(e) => e.preventDefault()}
-        className="checkcard"
-      >
-        <Tooltip label={checked ? "Don't use this" : "Use this"} withArrow>
-          <Checkbox
-            checked={checked}
-            onChange={() => setCheckedAndRealign(!checked)}
-            tabIndex={-1}
-            size="xs"
-            mr="sm"
-            mt="xs"
-            styles={{ input: { cursor: "pointer" } }}
-            aria-hidden
-          />
-        </Tooltip>
-
-        <div style={{ width: "100%" }}>
-          <TextInput
-            value={title}
-            onChange={(e) => onTitleChange(e.currentTarget.value)}
-            mb={7}
-            lh={1}
-            styles={{
-              input: {
-                border: "none",
-                borderWidth: "0px",
-                padding: "0px",
-                background: "transparent",
-                fontWeight: 500,
-                fontSize: "12pt",
-                margin: "0px",
-                height: "auto",
-                minHeight: "auto",
-              },
-            }}
-          />
-
-          <Textarea
-            value={description}
-            onChange={(e) => onDescriptionChange(e.currentTarget.value)}
-            onClickCapture={(e) => e.stopPropagation()}
-            styles={{
-              input: {
-                border: "none",
-                borderWidth: "0px",
-                paddingTop: "0px !important",
-                paddingLeft: "0px",
-                margin: "0px",
-                color: "#444",
-                background: "transparent",
-                lineHeight: 1.1,
-              },
-            }}
-            autosize
-            minRows={2}
-            maxRows={5}
-            fz="sm"
-            mb="xs"
-            c="dimmed"
-          />
-
-          {reportMode && (
-            <Popover
-              opened={viewedCode}
-              // offset={{ crossAxis: -20 }}
-              withinPortal
-              position="bottom"
-              shadow="lg"
-              withArrow
-              width={400}
-            >
-              <Popover.Target>
-                <Text
-                  size="sm"
-                  color="gray"
-                  onMouseEnter={openViewedCode}
-                  onMouseLeave={closeViewedCode}
-                >
-                  {codeChecked ? "Python" : "LLM"}
-                </Text>
-              </Popover.Target>
-              <Popover.Dropdown>
-                <Code style={{ whiteSpace: "pre-wrap" }}>
-                  {evalFuncReport.evalFunction.code}
-                </Code>
-              </Popover.Dropdown>
-            </Popover>
-          )}
-        </div>
-
-        {!reportMode ? (
-          <Button
-            size="xs"
-            variant="subtle"
-            compact
-            color="gray"
-            onClick={onRemove}
-            pos="absolute"
-            right="8px"
-            top="8px"
-            style={{ padding: "0px" }}
-          >
-            <IconTrash size={"95%"} />
-          </Button>
-        ) : (
-          <></>
-        )}
-
-        {reportMode && reportAccuracyRing ? (
-          <Stack spacing={0}>
-            <Popover
-              position="right"
-              opened={openedCMatrix}
-              offset={{ crossAxis: -20 }}
-              withinPortal
-              shadow="lg"
-              withArrow
-            >
-              <Popover.Target>
-                <RingProgress
-                  size={100}
-                  sections={[
-                    {
-                      value: reportAccuracyRing.percent,
-                      color: reportAccuracyRing.color,
-                    },
-                  ]}
-                  label={
-                    <Text
-                      color={reportAccuracyRing.color}
-                      weight={700}
-                      align="center"
-                      size="lg"
-                    >
-                      {`${reportAccuracyRing.percent}%`}
-                    </Text>
-                  }
-                  onMouseEnter={openCMatrix}
-                  onMouseLeave={closeCMatrix}
-                />
-              </Popover.Target>
-              <Popover.Dropdown>{cMatrixPlot}</Popover.Dropdown>
-            </Popover>
-            <Text align="center" size="xs" color="gray" maw="90%" lh={1.1}>
-              Alignment with your grades
-            </Text>
-          </Stack>
-        ) : (
-          <></>
-        )}
-
-        {!reportMode ? (
-          <Switch
-            size="lg"
-            color="gray"
-            onLabel="Code"
-            offLabel="LLM"
-            pos="absolute"
-            right="8px"
-            bottom="10px"
-            checked={codeChecked}
-            onChange={(e) => {
-              setCodeChecked(e.currentTarget.checked);
-              if (onEvalMethodChange)
-                onEvalMethodChange(e.currentTarget.checked ? "code" : "expert");
-            }}
-            thumbIcon={
-              codeChecked ? (
-                <IconCode
-                  size="0.8rem"
-                  color={theme.colors.teal[theme.fn.primaryShade()]}
-                  stroke={3}
-                />
-              ) : (
-                <IconRobot
-                  size="0.8rem"
-                  color={theme.colors.blue[theme.fn.primaryShade()]}
-                  stroke={3}
-                />
-              )
-            }
-          />
-        ) : (
-          <></>
-        )}
-      </div>
-
-      <div>
-        {reportMode && (
-          <Accordion>
-            <Accordion.Item
-              key={"Show Bad Implementations"}
-              value={"Show Bad Implementations"}
-            >
-              <Accordion.Control>
-                <Text size="sm"> Show Bad Implementations </Text>
-              </Accordion.Control>
-              <Accordion.Panel>{unselectedImplementations}</Accordion.Panel>
-            </Accordion.Item>
-          </Accordion>
-        )}
-      </div>
-    </Card>
-  );
-};
-
-const ChooseCard = function ChooseCard({
-  title,
-  description,
-  icon,
-  bg,
-  onClick,
-}) {
-  const [hovering, setHovering] = useState(false);
-
-  return (
-    <Card
-      shadow="sm"
-      padding="lg"
-      radius="md"
-      withBorder
-      style={{ backgroundColor: bg + (hovering ? "44" : "77") }}
-      onMouseEnter={() => setHovering(true)}
-      onMouseLeave={() => setHovering(false)}
-      onClick={onClick}
-    >
-      <UnstyledButton className="checkcard">
-        <Tooltip
-          label={description}
-          maw="200px"
-          position="bottom"
-          withinPortal
-          withArrow
-          multiline
-        >
-          <Flex justify="center" gap="md">
-            <Box>{icon}</Box>
-            <Text fw={500} lh={1.2} fz="md">
-              {title}
-            </Text>
-          </Flex>
-        </Tooltip>
-      </UnstyledButton>
-    </Card>
-  );
-};
-
-// Pop-up to ask user to pick criterias for evaluation
-export const PickCriteriaModal = forwardRef(
-  function PickCriteriaModal(props, ref) {
-    const [opened, { open, close }] = useDisclosure(false);
-    const [responses, setResponses] = useState([]);
-    const apiKeys = useStore((state) => state.apiKeys);
-    const globalState = useStore((store) => store.state);
-
-    // Callback to caller when criteria implementations return
-    const [onFinish, setOnFinish] = useState(null);
-
-    // Which stage of picking + generating criteria we are in. Screens are:
-    // pick, wait, grade
-    const [screen, setScreen] = useState("welcome");
-    const modalTitle = useMemo(() => {
-      if (screen === "pick") return "Pick Criteria";
-      else if (screen === "welcome") return "Welcome";
-      else if (screen === "wait") return "Collecting implementations...";
-      else if (screen === "report") return "EvalGen Report";
-      else return "Grading Responses";
-    }, [screen]);
-
-    const [criteria, setCriteria] = useState([]);
-    const [addCriteriaValue, setAddCriteriaValue] = useState("");
-    const [isLoadingCriteria, setIsLoadingCriteria] = useState(0);
-
-    // The EvalGen object responsible for generating, implementing, and filtering candidate implementations
-    const [executor, setExecutor] = useState(null);
-    const [execProgress, setExecProgress] = useState(0);
-
-    // Stores report generated when executor is done
-    const [report, setReport] = useState(null);
-
-    // The samples to pass the executor / grading responses features. This will be bounded
-    // by maxNumSamplesForExecutor, instead of the whole dataset.
-    const samples = useMemo(() => {
-      // The max number of samples (responses) to pass the executor. This controls how many requests will
-      // need to be sent off and how many evaluation function executions are performed.
-      // TODO: Give the user some control over this.
-      const maxNumSamplesForExecutor = 16;
-
-      // Sample from the full set of responses, if needed:
-      if (responses.length > maxNumSamplesForExecutor)
-        return sampleRandomElements(responses, maxNumSamplesForExecutor);
-      else return responses.slice();
-    }, [responses]);
-
-    const addCriteria = () => {
-      // Add a loading Skeleton
-      setIsLoadingCriteria((num) => num + 1);
-      // Make async LLM call to expand criteria
-      generateLLMEvaluationCriteria(
-        "",
-        apiKeys,
-        `I've described a criteria I want to use to evaluate text. I want you to take the criteria and output a JSON object in the format below. 
-
-CRITERIA: 
-\`\`\`
-${addCriteriaValue}
-\`\`\`
-
-Your response should contain a short title for the criteria ("shortname"), a description of the criteria in 2 sentences ("criteria"), and whether it should be evaluated with "code", or by an "expert" if the criteria is difficult to evaluate ("eval_method"). Your answer should be JSON within a \`\`\`json \`\`\` marker, with the following three fields: "criteria", "shortname", and "eval_method" (code or expert). The "criteria" should expand upon the user's input, the "shortname" should be a very brief title for the criteria, and this list should contain as many evaluation criteria as you can think of. Each evaluation criteria should test a unit concept that should evaluate to "true" in the ideal case. Only output JSON, nothing else.`, // prompt
-        "gpt-3.5-turbo", // llm
-        null, // system_msg
-      )
-        .then((evalCrits) => {
-          // Take only the first
-          setCriteria((crit) =>
-            crit.concat([
-              {
-                ...evalCrits[0],
-                uid: uuid(),
-              },
-            ]),
-          );
-          // Remove a loading Skeleton
-          setIsLoadingCriteria((num) => num - 1);
-        })
-        .catch((err) => {
-          console.error(err);
-          setIsLoadingCriteria((num) => num - 1);
-        });
-    };
-    const updateCriteria = (newValue, critIdx, propName) => {
-      setCriteria((crit) => {
-        crit[critIdx][propName] = newValue;
-        return [...crit];
-      });
-    };
-
-    // An estimate of many requests the implementation executor will require (upper bound).
-    const estimatedLLMRequestsToImplement = useMemo(() => {
-      const num_llm_evals = criteria.reduce(
-        (acc, crit) => acc + (crit.eval_method === "expert" ? 1 : 0),
-        0,
-      );
-      // The executor sends off one query per criteria to generate 3-5 candidates each.
-      // Each candidate LLM eval prompt will be run over all candidates.
-      return criteria.length + num_llm_evals * 5 * samples.length;
-    }, [criteria, samples]);
-
-    const getLikelyPromptTemplateAsContext = useCallback(() => {
-      // Attempt to infer the prompt template used to generate the responses:
-      const prompts = new Set();
-      for (const resp_obj of responses) {
-        if (resp_obj?.metavars?.__pt !== undefined) {
-          prompts.add(resp_obj.metavars.__pt);
-        }
-      }
-
-      if (prompts.size === 0) return null;
-
-      // Pick a prompt template at random to serve as context....
-      return escapeBraces(prompts.values().next().value);
-    }, [responses]);
-
-    // Given the context from "inputs", tries to generate an array of natural language criteria.
-    const genCriteriaFromContext = useCallback(async () => {
-      // Get the context from the input responses
-      const inputPromptTemplate = getLikelyPromptTemplateAsContext();
-
-      if (inputPromptTemplate === null) {
-        console.error("No context found. Cannot proceed.");
-        return;
-      }
-
-      // Attempt to generate criteria using an LLM
-      return await generateLLMEvaluationCriteria(inputPromptTemplate, apiKeys);
-    }, [responses]);
-
-    // Update the executor whenever samples or eval criteria changes,
-    // as long as the executor is not already running.
-    useEffect(() => {
-      let ex = executor;
-      if (!ex) {
-        // Instantiate executor.
-        // Get the grades from the global state, and transform the dict such that it's in {uid: grade} format.
-        const existingGrades = transformDict(
-          globalState,
-          (key) => key.startsWith("r.") && key.endsWith(".grade"),
-          extractUIDFromRatingKey,
-          (_, val) => {
-            // The grades are in { idx: grade } format. Take only the first,
-            // as we only take the first response in this iteration of EvalGen:
-            if (typeof val !== "object") return undefined;
-            const gs = Object.values(val);
-            if (gs.length === 0) return undefined;
-            return gs[0];
-          },
-        );
-
-        // Create a new EvalGen executor, passing in the samples and existing grades
-        ex = new EvaluationFunctionExecutor(
-          getLikelyPromptTemplateAsContext(samples),
-          samples,
-          undefined,
-          existingGrades,
-        );
-        setExecutor(ex);
-      } else if (ex.isRunning()) {
-        console.error(
-          "Executor already running. Avoiding updating it with new samples or criteria.",
-        );
-        return;
-      }
-      ex.setExamples(samples);
-      ex.setEvalCriteria(criteria);
-    }, [samples, criteria]);
-
-    // Starts generating implementations for the chosen criteria
-    const beginGenCriteriaImplementations = useCallback(async () => {
-      // Check that an executor exists (this should never be triggered)
-      if (!executor) {
-        console.error("Executor does not exist.");
-        return;
-      } else if (executor.isRunning()) {
-        console.error("Executor is already running.");
-        return;
-      }
-
-      // Start the executor in the background
-      setExecProgress(0);
-      executor.start((progress) => {
-        setExecProgress(progress?.success ?? 0);
-      });
-    }, [executor]);
-
-    // This gives the parent access to triggering the modal alert
-    const trigger = (inputs, _onFinish) => {
-      setResponses(inputs);
-      setScreen("welcome");
-      setAddCriteriaValue("");
-      setExecutor(null);
-      setOnFinish(() => (report) => {
-        close();
-        if (_onFinish) _onFinish(evalgenReportToImplementations(report));
-      });
-      open();
-    };
-    useImperativeHandle(ref, () => ({
-      trigger,
-    }));
-
-    const handleInitialGradingDone = () => {
-      setScreen("pick");
-
-      // Generate criteria
-      setCriteria([]);
-      setIsLoadingCriteria(3);
-      genCriteriaFromContext()
-        .then((crits) => setCriteria(crits.map((c) => ({ ...c, uid: uuid() }))))
-        .finally(() => setIsLoadingCriteria(0));
-    };
-
-    const transitionToReport = (report) => {
-      setReport(report);
-      setScreen("report");
-    };
-
-    const recomputeAlignment = async () => {
-      // Get selected criteria
-      const selectedCriteria = criteria.filter(
-        (c) => c.selected || c.selected === undefined,
-      );
-
-      // Pass this into executor to recompute alignment
-      const newReport = await executor?.recomputeAlignment(
-        selectedCriteria,
-        report,
-      );
-
-      // Update the report
-      setReport(newReport);
-    };
-
-    const gradeResponsesScreen = useMemo(
-      () => (
-        <GradeResponsesScreen
-          resps={samples}
-          executor={executor}
-          onClickDone={handleInitialGradingDone}
-          askForAnnotations={screen === "grade_first"}
-          onFinish={transitionToReport}
-          execProgress={execProgress}
-        />
-      ),
-      [samples, executor, screen, onFinish, execProgress],
-    );
-
-    return (
-      <Modal
-        size="80%"
-        opened={opened}
-        onClose={close}
-        title={
-          <div>
-            <span style={{ fontSize: "14pt" }}>{modalTitle}</span>
-          </div>
-        }
-        closeOnClickOutside={true}
-        style={{ position: "relative", left: "-5%" }}
-      >
-        {screen === "welcome" ? (
-          <div>
-            <Center>
-              <Text size="sm" pl="sm" mt="lg" mb="sm" maw="560px">
-                Welcome to EvalGen. The EvalGen wizard will generate evaluation
-                criteria and implementations for grading responses that align
-                with your expectations.
-              </Text>
-            </Center>
-            <Center>
-              <Text size="sm" pl="sm" mb="lg" maw="560px">
-                To get started, we need to specify some criteria in natural
-                language that will be used to evaluate model responses. How
-                would you like to generate criteria?
-              </Text>
-            </Center>
-            <Center>
-              <Flex justify="center" gap="lg" mt="sm" mb="lg" maw="560px">
-                <ChooseCard
-                  onClick={() => {
-                    if (isLoadingCriteria > 0) return;
-                    setScreen("pick");
-                    setCriteria([]);
-                    setIsLoadingCriteria(3);
-                    genCriteriaFromContext()
-                      .then((crits) =>
-                        setCriteria(crits.map((c) => ({ ...c, uid: uuid() }))),
-                      )
-                      .finally(() => setIsLoadingCriteria(0));
-                  }}
-                  title="Infer criteria from my context"
-                  description="An AI will look at your input prompt and context and try to infer criteria. You will still be able to review, revise, and add criteria."
-                  icon={<IconSparkles />}
-                  bg="#a834eb"
-                />
-                <ChooseCard
-                  onClick={() => {
-                    setScreen("pick");
-                    // setCriteria([]);
-                  }}
-                  title="Let me specify criteria manually"
-                  description="Enter criteria manually. An AI will generate longer descriptions for your criteria, which you can review and revise."
-                  icon={<IconPencil />}
-                  bg="#34eb74"
-                />
-                <ChooseCard
-                  onClick={() => {
-                    setScreen("grade_first");
-                    // setCriteria([]);
-                  }}
-                  title="Grade some responses first"
-                  description="Grade some responses first, to help yourself identify criteria. The AI will incorporate your grades in its criteria suggestions."
-                  icon={<IconThumbUp />}
-                  bg="#eba834"
-                />
-                {/* TODO <ChooseCard title="Chat with an AI to infer criteria" description="Chat with an AI assistant that will ask questions about your task and situation. The AI will infer some criteria and provide them as starting points." icon={<IconMessage2Bolt />} bg="#34c9eb" /> */}
-              </Flex>
-            </Center>
-          </div>
-        ) : (
-          <></>
-        )}
-
-        {screen === "pick" ? (
-          <div>
-            <Text size="sm" pl="sm" mb="lg">
-              Select criteria that you would like to evaluate responses on.
-              Based on your chosen criteria, LLM will generate implementations
-              of assertions. Afterwards, an optional human scoring pass can
-              better align these implementations with your expectations.
-            </Text>
-
-            <Text size="sm" pl="sm" mb="lg" style={{ fontStyle: "italic" }}>
-              Note: Due to rate limits, please don&apos;t select more than 3
-              criteria to be evaluated by LLMs.
-            </Text>
-
-            <Flex align="center" gap="lg">
-              <TextInput
-                label="Type a new criteria to add, then press Enter:"
-                value={addCriteriaValue}
-                onChange={(evt) => setAddCriteriaValue(evt.currentTarget.value)}
-                placeholder="the response is valid JSON"
-                mb="lg"
-                pl="sm"
-                pr="sm"
-                w="100%"
-                onKeyDown={(evt) => {
-                  if (evt.key === "Enter") {
-                    evt.preventDefault();
-                    addCriteria();
-                    setAddCriteriaValue("");
-                  }
-                }}
-              />
-              <Button
-                variant="filled"
-                onClick={() => {
-                  if (isLoadingCriteria > 0) return;
-                  setIsLoadingCriteria(3);
-                  genCriteriaFromContext()
-                    .then((crit) => setCriteria(criteria.concat(crit)))
-                    .finally(() => setIsLoadingCriteria(0));
-                }}
-              >
-                <IconRepeat />
-                <IconSparkles />
-                &nbsp;Suggest more
-              </Button>
-            </Flex>
-
-            <ScrollArea mih={300} h={500} mah={500}>
-              <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
-                {criteria.map((c, idx) => (
-                  <CriteriaCard
-                    title={c.shortname}
-                    description={c.criteria}
-                    evalMethod={c.eval_method}
-                    key={`cc-${c.uid ?? idx.toString() + c.shortname}`}
-                    onTitleChange={(title) =>
-                      updateCriteria(title, idx, "shortname")
-                    }
-                    onDescriptionChange={(desc) =>
-                      updateCriteria(desc, idx, "criteria")
-                    }
-                    onEvalMethodChange={(method) =>
-                      updateCriteria(method, idx, "eval_method")
-                    }
-                    onRemove={() =>
-                      setCriteria(criteria.filter((v, j) => j !== idx))
-                    }
-                  />
-                ))}
-                {isLoadingCriteria > 0 ? (
-                  Array.from({ length: isLoadingCriteria }, (x, i) => (
-                    <Skeleton key={`skele-card-${i}`}>
-                      <CriteriaCard
-                        title={"Loading"}
-                        description={"Loading"}
-                        evalMethod={"expert"}
-                      />
-                    </Skeleton>
-                  ))
-                ) : (
-                  <></>
-                )}
-              </SimpleGrid>
-            </ScrollArea>
-
-            <Flex justify="center" gap={12} mt="xs">
-              <Tooltip
-                label={`Will send off up to ${estimatedLLMRequestsToImplement} requests`}
-                withArrow
-              >
-                <Button
-                  onClick={() => {
-                    // Start generating implementations + transition to next screen
-                    // setScreen("wait");
-                    // For study just go right to grading
-                    setScreen("grade");
-                    beginGenCriteriaImplementations();
-
-                    // generateLLMEvaluationCriteria(
-                    //   escapeBraces(`Delete 10 words or phrases from the following paragraph that don't contribute much to its meaning, but keep readability:
-                    // "{paragraph}"
-
-                    // Please do not add any new words or change words, only delete words.`),
-                    // ).then(setCriteria);
-                  }}
-                  variant="gradient"
-                  gradient={{ from: "teal", to: "lime", deg: 105 }}
-                  disabled={!criteria || criteria.length === 0}
-                >
-                  <IconSparkles />
-                  &nbsp;I&apos;m done. Implement it!
-                </Button>
-              </Tooltip>
-            </Flex>
-          </div>
-        ) : (
-          <></>
-        )}
-
-        {screen === "wait" ? (
-          <div>
-            <Stack justify="center" align="center" h={500}>
-              <Text mb={0}>Collecting...</Text>
-              <Loader size="lg" />
-              <Text color="gray" size="sm">
-                This may take a while.
-              </Text>
-
-              <Space h="lg" />
-              <Button
-                onClick={() => setScreen("grade")}
-                size="lg"
-                variant="gradient"
-                gradient={{ from: "teal", to: "lime", deg: 105 }}
-              >
-                <IconSparkles />
-                &nbsp;Grade Responses While You Wait
-              </Button>
-              <Text ml="lg" lh={1.2} w={380} color="gray">
-                Grading helps us choose implementations that better align with
-                your expectations. 📈
-              </Text>
-            </Stack>
-          </div>
-        ) : (
-          <></>
-        )}
-
-        {screen === "grade" ? gradeResponsesScreen : <></>}
-        {screen === "grade_first" ? (
-          <div>
-            <Center>
-              <Text size="md" pl="sm" mt="lg" mb="sm" maw="80%">
-                Grade at least 5 responses. You can use the arrows to skip
-                responses. Try to get a good sample of good (thumbs up) and bad
-                (thumbs down) examples.
-                {/* Welcome to EvalGen. We&apos;ve learned that grading responses
-                helps you decide your criteria. So, before AI can help you
-                generate evaluators,{" "}
-                <span style={{ fontWeight: 800 }}>
-                  we ask you to grade at least 5 responses
-                </span>
-                . The EvalGen wizard will then generate evaluation criteria and
-                implementations for grading responses that align with your
-                expectations. */}
-              </Text>
-            </Center>
-            <hr />
-            {gradeResponsesScreen}
-          </div>
-        ) : (
-          <></>
-        )}
-
-        {screen === "report" ? (
-          <ReportCardScreen
-            report={report}
-            recomputeAlignment={recomputeAlignment}
-            onClickFinish={(report) => onFinish(report)}
-          />
-        ) : (
-          <></>
-        )}
-      </Modal>
-    );
-  },
-);
-
-// Screen where the user grades responses.
-export const GradeResponsesScreen = forwardRef(function GradeResponsesScreen(
-  { resps, executor, onClickDone, askForAnnotations, onFinish, execProgress },
-  ref,
-) {
-  // Confetti effects
-  const [isGreenExploding, setIsGreenExploding] = React.useState(false);
-  const [isRedExploding, setIsRedExploding] = React.useState(false);
-
-  const [responses, setResponses] = useState([]);
-  const [shownResponse, setShownResponse] = useState(undefined);
-  const [pastShownResponses, setPastShownResponses] = useState([]);
-  const [shownResponseIdx, setShownResponseIdx] = useState(0);
-  const [grades, setGrades] = useState({});
-
-  const showProgressType = useMemo(
-    () => (executor ? "grade" : "num_graded"),
-    [executor],
-  );
-  const [minNumGrade, setMinNumGrade] = useState(5);
-  const numGraded = useMemo(() => Object.keys(grades).length, [grades]);
-
-  const [promptReasoning, setPromptReasoning] = useState(null);
-  const [annotation, setAnnotation] = useState(undefined);
-
-  // For updating the global human ratings state
-  const setState = useStore((store) => store.setState);
-  const updateGlobalRating = useCallback(
-    (uid, label, payload) => {
-      const key = getRatingKeyForResponse(uid, label);
-      const safe_payload = deepcopy(payload);
-      setState(key, safe_payload);
-      StorageCache.store(key, safe_payload);
-    },
-    [setState],
-  );
-
-  const bottomBar = useMemo(() => {
-    const bar = {};
-    if (showProgressType === "num_graded") {
-      bar.progressPerc = Math.min((numGraded / minNumGrade) * 100, 100);
-      bar.progressLabel = `${numGraded} / ${minNumGrade} graded`;
-      bar.buttonLabel = bar.progressPerc < 100 ? "Keep grading!" : "Next Step";
-      bar.buttonDisabled = bar.progressPerc < 100;
-      bar.buttonStyle = "filled";
-    } else {
-      bar.progressPerc = Math.min(execProgress, 100);
-      bar.progressLabel = "Generating and selecting implementations...";
-      bar.buttonLabel = bar.progressPerc < 99.5 ? "I'm tired 😴" : "Done";
-      bar.buttonDisabled = false;
-      bar.buttonStyle = bar.progressPerc < 99.5 ? "outline" : "filled";
-    }
-    return bar;
-  }, [showProgressType, numGraded, minNumGrade, execProgress]);
-
-  const responseText = useMemo(() =>
-    shownResponse && shownResponse.responses?.length > 0
-      ? shownResponse.responses[0]
-      : "",
-  );
-  const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]);
-  const varsDivs = useMemo(() => {
-    const combined_vars_metavars = shownResponse
-      ? {
-          ...shownResponse.vars,
-          ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc),
-        }
-      : {};
-    return Object.entries(combined_vars_metavars).map(([varname, val]) => (
-      <div key={varname} className="grade-resp-var-container">
-        <span className="response-var-name">{varname}&nbsp;=&nbsp;</span>
-        <span className="response-var-value linebreaks">{val}</span>
-      </div>
-    ));
-  }, [shownResponse]);
-
-  // Goto next response in the queue (skipping grading the current one)
-  const nextResponse = () => {
-    if (responses.length === 0) return;
-
-    // Update annotation for current response (if any)
-    // TODO: Fix this for generate case when num resp per prompt > 1
-    if (
-      shownResponse &&
-      annotation &&
-      typeof annotation === "string" &&
-      annotation.trim().length > 0
-    ) {
-      // console.log("setting annotation for resp", shownResponse.uid, annotation);
-      updateGlobalRating(shownResponse.uid, "note", { 0: annotation });
-      setAnnotation(null);
-    }
-    setPromptReasoning(null);
-
-    if (shownResponseIdx < pastShownResponses.length - 1) {
-      // If we are not at the end of the history of shown responses, then show the next response:
-      setShownResponse(pastShownResponses[shownResponseIdx + 1]);
-      setShownResponseIdx(shownResponseIdx + 1); // increment the shown resp idx
-    } else {
-      // We are at the end of the history; pick the next response off the stack:
-      // TODO: Make this unique (maybe by removing picked responses from the list!)
-      let num_tries = 3;
-      let next_resp = executor?.getNextExampleToGrade();
-      while (
-        num_tries > 0 &&
-        (!next_resp || pastShownResponses.some((r) => r.uid === next_resp.uid))
-      ) {
-        // We're presenting a response that's already been shown. Try again.
-        // NOTE: If we're trying again the first time, executor will flip and get the response on the other side of the grading stack, so we try once more:
-        if (next_resp && num_tries === 3)
-          next_resp =
-            executor?.getNextExampleToGrade() ??
-            sampleRandomElements(responses, 1)[0];
-        // Otherwise we just choose a response at random:
-        else next_resp = sampleRandomElements(responses, 1)[0];
-        num_tries -= 1;
-      }
-      // Note that this doesn't guarantee uniqueness here ---it is possible to see a response again.
-      // However, the internal "grades" dict will help us in remembering what grade the user gave the response.
-      setShownResponse(next_resp);
-      setPastShownResponses(pastShownResponses.concat(next_resp));
-      setShownResponseIdx(pastShownResponses.length);
-    }
-  };
-
-  // Go back to previously shown response
-  const prevResponse = () => {
-    if (pastShownResponses.length === 0 || shownResponseIdx === 0) return;
-    setShownResponse(pastShownResponses[shownResponseIdx - 1]);
-    setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx
-  };
-
-  // Update responses to draw from, when passed by external source
-  const updateResponsePool = (inputs) => {
-    if (!inputs) return;
-
-    setResponses(inputs);
-
-    // Choose the first response to display to the user
-    if (inputs?.length > 0) {
-      const random_resp = sampleRandomElements(inputs, 1)[0];
-      setShownResponse(random_resp);
-      setPastShownResponses([random_resp]);
-      setShownResponseIdx(0);
-      setGrades({});
-    }
-  };
-
-  const handleDone = useCallback(async () => {
-    if (showProgressType === "num_graded") {
-      if (onClickDone) onClickDone();
-    } else {
-      // Await completion of all gen + execution of eval funcs
-      await executor?.waitForCompletion();
-
-      // Filtering eval funcs by grades and present results
-      const filteredFunctions = await executor?.filterEvaluationFunctions(0.25);
-      console.log("Filtered Functions: ", filteredFunctions);
-
-      // Return selected implementations to caller
-      if (onFinish) onFinish(filteredFunctions);
-    }
-  }, [executor, showProgressType]);
-
-  const updateGrade = (idx, uid, grade) => {
-    grades[idx] = grade;
-    setGrades({ ...grades });
-    executor?.setGradeForExample(uid, grade);
-    updateGlobalRating(uid, "grade", { 0: grade });
-  };
-
-  const handleClickGradeButton = (isGoodResponse) => {
-    updateGrade(shownResponseIdx, shownResponse.uid, isGoodResponse);
-    const explodeFunc = isGoodResponse
-      ? setIsGreenExploding
-      : setIsRedExploding;
-    explodeFunc(true);
-    if (isGoodResponse) {
-      // Don't ask for clarification if it's a good grade
-      setTimeout(() => explodeFunc(false), 1200);
-      setTimeout(nextResponse, 800);
-    } else {
-      // If they gave a bad grade, ask them why
-      setTimeout(() => explodeFunc(false), 1200);
-      setTimeout(() => {
-        if (askForAnnotations) setPromptReasoning(true);
-        else nextResponse();
-      }, 800);
-    }
-  };
-
-  // Update responses whenever upstream changes
-  useEffect(() => {
-    updateResponsePool(resps);
-  }, [resps]);
-
-  return (
-    <Stack justify="space-between" mih={500}>
-      <Box>
-        <Flex justify="center">
-          {shownResponseIdx in grades ? (
-            grades[shownResponseIdx] ? (
-              <HeaderText>
-                You chose&nbsp;
-                <IconThumbUp color="green" style={{ marginBottom: "-3px" }} />!
-              </HeaderText>
-            ) : (
-              <HeaderText>
-                You chose&nbsp;
-                <IconThumbDown color="red" style={{ marginBottom: "-6px" }} />!
-              </HeaderText>
-            )
-          ) : (
-            <HeaderText>
-              Is this response&nbsp;
-              <IconThumbUp style={{ marginBottom: "-3px" }} />
-              &nbsp;or&nbsp;
-              <IconThumbDown style={{ marginBottom: "-6px" }} />
-              &nbsp;?
-            </HeaderText>
-          )}
-        </Flex>
-
-        <Flex justify="center" align="center" mb="sm">
-          <Button variant="white" color="dark" onClick={prevResponse}>
-            <IconChevronLeft />
-          </Button>
-          <div
-            className="response-box"
-            style={{
-              backgroundColor: "#eee",
-              width: "80%",
-              maxHeight: "340px",
-              overflowY: "scroll",
-              borderColor: "black",
-              borderStyle: "solid",
-            }}
-          >
-            <div className="response-item-llm-name-wrapper">
-              <div
-                className="small-response"
-                style={{ fontSize: "11pt", padding: "12pt" }}
-              >
-                {responseText}
-              </div>
-            </div>
-          </div>
-          <Button variant="white" color="dark" onClick={nextResponse}>
-            <IconChevronRight />
-          </Button>
-        </Flex>
-
-        <Flex justify="center" mb="xl" gap="lg">
-          <div
-            style={{
-              backgroundColor: "#fff",
-              padding: "12px",
-              width: "31%",
-              borderRadius: "12px",
-              borderWidth: "1px",
-              borderStyle: "solid",
-            }}
-          >
-            Vars
-            <hr />
-            <div style={{ maxHeight: "160px", overflowY: "scroll" }}>
-              {varsDivs}
-            </div>
-          </div>
-          <div
-            style={{
-              backgroundColor: "#fff",
-              padding: "12px",
-              width: "41%",
-              borderRadius: "2px",
-            }}
-          >
-            Prompt
-            <hr />
-            <div
-              className="monofont linebreaks"
-              style={{
-                maxHeight: "160px",
-                overflowY: "scroll",
-                fontSize: "10pt",
-                lineHeight: "1.2",
-              }}
-            >
-              {prompt}
-            </div>
-          </div>
-        </Flex>
-
-        {promptReasoning === null ? (
-          <Flex justify="center" gap="50px" mb="xl">
-            <Button
-              color="red"
-              variant="filled"
-              onClick={() => {
-                handleClickGradeButton(false);
-              }}
-            >
-              <IconThumbDown />
-              &nbsp;Bad!
-              <>
-                {isRedExploding && (
-                  <ConfettiExplosion
-                    zIndex={1000}
-                    colors={["#f00"]}
-                    force={0.1}
-                    height={300}
-                    width={200}
-                    particleCount={5}
-                    duration={2200}
-                    onComplete={() => setIsRedExploding(false)}
-                    style={{ position: "absolute", left: "50%", top: "20%" }}
-                  />
-                )}
-              </>
-            </Button>
-            <Button
-              color="green"
-              variant="filled"
-              onClick={() => {
-                handleClickGradeButton(true);
-              }}
-            >
-              <IconThumbUp />
-              &nbsp;Good!
-              <>
-                {isGreenExploding && (
-                  <ConfettiExplosion
-                    zIndex={1000}
-                    colors={[MANTINE_GREEN]}
-                    force={0.9}
-                    height={300}
-                    width={300}
-                    particleCount={10}
-                    duration={2200}
-                    onComplete={() => setIsGreenExploding(false)}
-                    style={{ position: "absolute", left: "50%", top: "20%" }}
-                  />
-                )}
-              </>
-            </Button>
-          </Flex>
-        ) : (
-          <Center>
-            <Stack spacing="xs">
-              <Text>What&apos;s the reason for your score?</Text>
-              <Flex align="center" gap="lg">
-                <Textarea
-                  value={annotation}
-                  onChange={(e) => setAnnotation(e.currentTarget.value)}
-                  autoFocus
-                  onKeyDown={(e) => {
-                    if (e.key === "Enter") {
-                      e.preventDefault();
-                      nextResponse();
-                    }
-                  }}
-                />
-                <Button onClick={nextResponse} w={100}>
-                  {!annotation ? "Skip" : "Continue"}
-                </Button>
-              </Flex>
-            </Stack>
-          </Center>
-        )}
-      </Box>
-
-      <Flex justify="left" align="center" gap="md">
-        {/* <Progress size={18} w='100%' sections={[{ value: 30, color: 'blue', label: '3/10 graded', tooltip: 'Samples graded' }]} /> */}
-        {/* <Loader size='sm' /> */}
-        <Stack w="100%" spacing={4}>
-          <Text color="#aaa" size="sm">
-            {bottomBar.progressLabel}
-          </Text>
-          <Progress w="100%" value={bottomBar.progressPerc} mb="0px" />
-        </Stack>
-
-        <Button
-          onClick={handleDone}
-          variant={bottomBar.buttonStyle}
-          disabled={bottomBar.buttonDisabled}
-        >
-          {bottomBar.buttonLabel}
-        </Button>
-      </Flex>
-    </Stack>
-  );
-});
-
-// Screen after EvalGen finishes, to show a report to the user
-// about the chosen functions and the alignment with their ratings.
-const ReportCardScreen = ({ report, recomputeAlignment, onClickFinish }) => {
-  // The criteria cards, now with report information
-  const cards = useMemo(() => {
-    const res = [];
-
-    // Iterate through selected eval functions and create cards
-    for (const selectedFunc of report.selectedEvalFunctions) {
-      const crit = selectedFunc.evalCriteria;
-      // Find corresponding report in allEvalFunctionReports map from criteria to list
-      const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
-      const evalFuncReport = critEvalFuncReports.find(
-        (rep) => rep.evalFunction === selectedFunc,
-      );
-
-      // Get the functions that were not selected for this criteria
-      const otherFuncs = critEvalFuncReports.filter(
-        (rep) => rep.evalFunction !== selectedFunc,
-      );
-
-      res.push(
-        <CriteriaCard
-          title={crit.shortname}
-          description={crit.criteria}
-          evalMethod={crit.eval_method}
-          key={`cc-${crit.uid ?? res.length.toString() + crit.shortname}`}
-          reportMode={true}
-          evalFuncReport={evalFuncReport} // undefined if none was chosen
-          otherFuncs={otherFuncs}
-          onCheck={(checked) => {
-            crit.selected = checked;
-            recomputeAlignment();
-          }}
-        />,
-      );
-    }
-
-    return res;
-  }, [report]);
-
-  return (
-    report && (
-      <div>
-        <Text align="center" size="lg" pl="sm" mb="lg">
-          Chosen Functions and Alignment
-        </Text>
-
-        {/* Show coverage and false failure rate numbers */}
-        <Flex justify="center" gap="md" mb="lg">
-          <Group position="center" spacing="xl" style={{ textAlign: "center" }}>
-            <Card
-              shadow="sm"
-              padding="md"
-              radius="md"
-              style={{ backgroundColor: "#f0f0f0" }}
-            >
-              <Text weight={500} size="md">
-                Coverage of Bad Responses
-              </Text>
-              <Text color="blue" weight={700} size="md">
-                {report.failureCoverage.toFixed(2)}%
-              </Text>
-            </Card>
-            <Card
-              shadow="sm"
-              padding="md"
-              radius="md"
-              style={{ backgroundColor: "#f0f0f0" }}
-            >
-              <Text weight={500} size="md">
-                False Failure Rate
-              </Text>
-              <Text color="red" weight={700} size="md">
-                {report.falseFailureRate.toFixed(2)}%
-              </Text>
-            </Card>
-          </Group>
-        </Flex>
-
-        <ScrollArea mih={300} h={500} mah={500}>
-          <SimpleGrid cols={3} spacing="sm" verticalSpacing="sm" mb="lg">
-            {cards}
-          </SimpleGrid>
-        </ScrollArea>
-
-        <Flex justify="center" gap={12} mt="xs">
-          <Button onClick={() => onClickFinish(report)}>
-            Finish with selected evaluators
-          </Button>
-        </Flex>
-      </div>
-    )
-  );
-};

From 95553f7288fc418f5923710500700ac9fbccdcf9 Mon Sep 17 00:00:00 2001
From: Ian Arawjo <fatso784@gmail.com>
Date: Wed, 14 May 2025 16:02:03 -0400
Subject: [PATCH 35/35] Update readme

---
 chainforge/react-server/src/backend/evalgen/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/chainforge/react-server/src/backend/evalgen/README.md b/chainforge/react-server/src/backend/evalgen/README.md
index e4cda83b9..0ff121d97 100644
--- a/chainforge/react-server/src/backend/evalgen/README.md
+++ b/chainforge/react-server/src/backend/evalgen/README.md
@@ -16,7 +16,7 @@ This module contains types and prompts for criteria generation, function generat
 
 ### OAI Utils
 
-This module contains utilities for interacting with the Azure OpenAI API and streaming partial results (e.g., each evaluation criteria as it is generated).
+This module contains utilities for interacting with the OpenAI API.
 
 ### Executor