From ddadb7a455bf454c0412e3210889b7cc55400781 Mon Sep 17 00:00:00 2001
From: Shreya Shankar <ss.shankar505@gmail.com>
Date: Fri, 27 Sep 2024 11:29:08 -0700
Subject: [PATCH 1/3] Merge with Helen's changes

---
 chainforge/react-server/src/backend/evalgen/executor.ts | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 09d9e0d8e..73b22f587 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -145,6 +145,8 @@ export default class EvaluationFunctionExecutor {
     this.logFunction = addLog;
   }
 
+
+
   /**
    * Starts the background computation for generating and executing evaluation functions.
    * This method initiates the tasks but does not wait for them to complete.
@@ -415,10 +417,7 @@ export default class EvaluationFunctionExecutor {
     // Wait for the 'allFunctionsGenerated' event, which now waits for all executions
     await allFunctionsGeneratedPromise;
   }
-
-  public generateNewImplementationsForCriteria(
-    criteriaID: EvalCriteriaUID,
-  ): void {
+  public generateNewImplementationsForCriteria(criteriaID: EvalCriteriaUID): void {
     const crit = this.evalCriteria.find((c) => c.uid === criteriaID);
     if (!crit) {
       throw new Error(`Criteria with ID ${criteriaID} not found.`);
@@ -429,6 +428,7 @@ export default class EvaluationFunctionExecutor {
     }
   }
 
+
   /**
    * Adds another evaluation criteria and triggers the generation and execution of evaluation functions for the new criteria.
    * This method allows the client to add new evaluation criteria after the executor has been initialized.

From 4c1880703ad880184f0360119a89e7dc027ae7fe Mon Sep 17 00:00:00 2001
From: Shreya Shankar <ss.shankar505@gmail.com>
Date: Mon, 24 Jun 2024 15:53:32 -0700
Subject: [PATCH 2/3] Adding UI indicators of how many LLM calls are executed

---
 chainforge/react-server/src/EvalGenModal.tsx  | 84 +++++++++++++++++++
 .../src/backend/evalgen/executor.ts           | 57 +++++++++++++
 .../react-server/src/backend/evalgen/utils.ts |  4 +
 3 files changed, 145 insertions(+)

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index 6e5b5f3dc..ab39d096f 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -536,13 +536,21 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
         const addLog = (message: string) => {
           setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
         };
+<<<<<<< HEAD
+=======
+
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
         const ex = new EvaluationFunctionExecutor(
           getLikelyPromptTemplateAsContext(responses),
           responses,
           criteria,
+<<<<<<< HEAD
           (gpt4Calls, gpt35Calls) => {
             // Callback to update GPT call counts
+=======
+          (gpt4Calls, gpt35Calls) => {  // Callback to update GPT call counts
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
             setNumGPT4Calls((num) => num + gpt4Calls);
             setNumGPT35Calls((num) => num + gpt35Calls);
           },
@@ -767,17 +775,26 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       // Update annotation for current response (if any)
       // TODO: Fix this for generate case when num resp per prompt > 1
 
+<<<<<<< HEAD
       if (
         grades[shownResponse.uid] ||
         holisticGrade ||
         (annotation && annotation.trim())
       ) {
+=======
+      if (grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim())) {
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
         executor?.setGradeForExample(
           shownResponse.uid,
           grades[shownResponse.uid],
           holisticGrade,
+<<<<<<< HEAD
           annotation ? annotation.trim() : null,
         );
+=======
+          annotation ? annotation.trim() : null
+        ); 
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
       }
 
       if (
@@ -854,6 +871,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       updateShownResponseUniqueIndex();
     };
 
+<<<<<<< HEAD
     const updateShownResponseUniqueIndex = () => {
       let idx = 0;
       for (const resp of responses) {
@@ -884,10 +902,13 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       setShownResponse(responses[shownResponseIdx]);
     }, [shownResponseIdx]);
 
+=======
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     const estimateGPTCalls = () => {
       return executor
         ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
         : "# estimated GPT calls not available.";
+<<<<<<< HEAD
     };
 
     const updateCriteriaForDisplay = () => {
@@ -907,6 +928,9 @@ If you determine the feedback corresponds to a new criteria, your response shoul
     };
 
     // const [onFinish, setOnFinish] = useState(null);
+=======
+    }
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
     return (
       <Modal
@@ -917,6 +941,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul
         closeOnClickOutside={true}
         style={{ position: "relative", left: "-5%" }}
       >
+<<<<<<< HEAD
         {screen === "response" && (
           <Grid h={window?.innerHeight * 0.8}>
             <Grid.Col span={8}>
@@ -935,6 +960,21 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                   estimateGPTCalls={estimateGPTCalls}
                   gotoNextScreen={gotoNextScreen}
                 />
+=======
+        <Grid h={window?.innerHeight * 0.8}>
+          <Grid.Col span={8}>
+            <Stack justify="space-between">
+              {/* View showing the response the user is currently grading */}
+              <GradingView
+                shownResponse={shownResponse}
+                numGPT4Calls={numGPT4Calls}
+                numGPT35Calls={numGPT35Calls}
+                logs={logs}
+                gotoNextResponse={nextResponse}
+                gotoPrevResponse={prevResponse}
+                estimateGPTCalls={estimateGPTCalls}
+              />
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
                 {/* Progress bar */}
                 {/* <Flex justify="left" align="center" gap="md">
@@ -1146,28 +1186,40 @@ const HeaderText = ({ children }: { children: ReactNode }) => {
 
 interface GradingViewProps {
   shownResponse: LLMResponse | undefined;
+<<<<<<< HEAD
   shownResponseIdx: number;
   responseCount: number;
+=======
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   numGPT4Calls: number;
   numGPT35Calls: number;
   logs: { date: Date; message: string }[];
   gotoPrevResponse: () => void;
   gotoNextResponse: () => void;
   estimateGPTCalls: () => string;
+<<<<<<< HEAD
   gotoNextScreen: (screenName: string) => void;
+=======
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 }
 
 const GradingView: React.FC<GradingViewProps> = ({
   shownResponse,
+<<<<<<< HEAD
   shownResponseIdx,
   responseCount,
+=======
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   numGPT4Calls,
   numGPT35Calls,
   logs,
   gotoPrevResponse,
   gotoNextResponse,
   estimateGPTCalls,
+<<<<<<< HEAD
   gotoNextScreen,
+=======
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 }) => {
   // Calculate inner values only when shownResponse changes
   const responseText = useMemo(
@@ -1257,7 +1309,16 @@ const GradingView: React.FC<GradingViewProps> = ({
           </div>
 
           {/* Go forward to the next response */}
+<<<<<<< HEAD
           <Tooltip label={estimateGPTCalls()} withArrow>
+=======
+          <Tooltip
+            label={
+              estimateGPTCalls()
+            }
+            withArrow
+          >
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
             <Button variant="white" color="dark" onClick={gotoNextResponse}>
               <IconChevronRight />
             </Button>
@@ -1306,6 +1367,7 @@ const GradingView: React.FC<GradingViewProps> = ({
         </Flex>
         <Flex direction="column">
           <Flex justify="space-between" align="center">
+<<<<<<< HEAD
             <Text size="lg" weight={500} mb="sm">
               LLM Activity
             </Text>
@@ -1313,10 +1375,17 @@ const GradingView: React.FC<GradingViewProps> = ({
             <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
               Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
               GPT-3.5-Turbo-16k calls.
+=======
+            <Text size="lg" weight={500} mb="sm">LLM Activity</Text>
+            {/* GPT Call Tally */}
+            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
+              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls} GPT-3.5-Turbo-16k calls.
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
             </Text>
           </Flex>
           <div
             style={{
+<<<<<<< HEAD
               backgroundColor: "#f0f0f0",
               color: "#333",
               fontFamily: "monospace",
@@ -1326,6 +1395,17 @@ const GradingView: React.FC<GradingViewProps> = ({
               overflowY: "auto",
               borderRadius: "8px",
               border: "1px solid #ddd",
+=======
+              backgroundColor: "#f0f0f0", 
+              color: "#333",
+              fontFamily: "monospace",
+              padding: "12px",
+              width: "calc(100% - 30px)", 
+              height: "200px",
+              overflowY: "auto",
+              borderRadius: "8px",
+              border: "1px solid #ddd", 
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
               marginRight: "20px", // Space on the right
             }}
             ref={(el) => {
@@ -1336,9 +1416,13 @@ const GradingView: React.FC<GradingViewProps> = ({
           >
             {logs.map((log, index) => (
               <div key={index}>
+<<<<<<< HEAD
                 <span style={{ color: "#4A90E2" }}>
                   {log.date.toLocaleString()} -{" "}
                 </span>
+=======
+                <span style={{ color: '#4A90E2' }}>{log.date.toLocaleString()} - </span> 
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
                 <span>{log.message}</span>
               </div>
             ))}
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 73b22f587..00b2287df 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -197,6 +197,7 @@ export default class EvaluationFunctionExecutor {
     const functionExecutionPromises: Promise<any>[] = [];
 
     emitter.on("functionGenerated", (evalFunction) => {
+
       const executionPromise = (async () => {
         this.evalFunctions.push(evalFunction);
         const executionPromises = this.examples.map(async (example) => {
@@ -228,6 +229,11 @@ export default class EvaluationFunctionExecutor {
             this.updateGPTCalls(0, 1);
           }
 
+          // Update GPT-3.5 call count by 1 if the eval method is expert
+          if (evalFunction.evalCriteria.eval_method === "expert") {
+            this.updateGPTCalls(0, 1);
+          }
+
           if (onProgress) {
             onProgress({
               success:
@@ -253,11 +259,17 @@ export default class EvaluationFunctionExecutor {
       functionExecutionPromises.push(executionPromise);
     });
 
+<<<<<<< HEAD
     const badExample = this.examples.find(
       (example) =>
         this.perCriteriaGrades[criteria.uid]?.[example.uid] === false,
     );
 
+=======
+    const badExample = this.examples.find(example => this.perCriteriaGrades[criteria.uid]?.[example.uid] === false);
+
+  
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     await generateFunctionsForCriteria(
       criteria,
       this.promptTemplate,
@@ -269,12 +281,17 @@ export default class EvaluationFunctionExecutor {
     this.updateGPTCalls(1, 0);
 
     console.log(`Generated functions for criteria: ${criteria.shortname}`);
+<<<<<<< HEAD
     console.log(
       `Number of functions generated: ${functionExecutionPromises.length}`,
     );
     this.logFunction(
       `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
     );
+=======
+    console.log(`Number of functions generated: ${functionExecutionPromises.length}`);
+    this.logFunction(`Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`);
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
     await Promise.all(functionExecutionPromises);
   }
@@ -308,9 +325,13 @@ export default class EvaluationFunctionExecutor {
 
     // Listen for generated functions and execute them as they come in
     emitter.on("functionGenerated", (evalFunction) => {
+<<<<<<< HEAD
       this.logFunction(
         `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
       );
+=======
+      this.logFunction(`Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`);
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
       // Capture the execution promise of each function
       const executionPromise = (async () => {
@@ -347,6 +368,11 @@ export default class EvaluationFunctionExecutor {
             this.updateGPTCalls(0, 1);
           }
 
+          // Update GPT-3.5 call count by 1 if the eval method is expert
+          if (evalFunction.evalCriteria.eval_method === "expert") {
+            this.updateGPTCalls(0, 1);
+          }
+
           funcsExecuted++;
           if (onProgress) {
             onProgress({
@@ -398,9 +424,13 @@ export default class EvaluationFunctionExecutor {
           console.log(
             "All evaluation functions have been generated and executed.",
           );
+<<<<<<< HEAD
           this.logFunction(
             "All initially-generated evaluation functions have been generated and executed.",
           );
+=======
+          this.logFunction("All initially-generated evaluation functions have been generated and executed.");
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
           if (resolveAllFunctionsGenerated) {
             resolveAllFunctionsGenerated(); // Resolve the promise when all functions have been generated and executed
           }
@@ -541,14 +571,20 @@ export default class EvaluationFunctionExecutor {
     return new Map(this.grades);
   }
 
+<<<<<<< HEAD
   public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): {
     numGPT4Calls: number;
     numGPT35Calls: number;
   } {
+=======
+  public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): { numGPT4Calls: number; numGPT35Calls: number }{
+
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     let numGPT4Calls = 0;
     let numLLMCriteria = 0;
     for (const criteriaId in perCriteriaGrades) {
       const currGrade = perCriteriaGrades[criteriaId];
+<<<<<<< HEAD
       const numGradedAsCurrGrade = this.examples.filter(
         (example) =>
           this.perCriteriaGrades[example.uid] &&
@@ -559,6 +595,12 @@ export default class EvaluationFunctionExecutor {
         const criteria = this.evalCriteria.find(
           (criteria) => criteria.uid === criteriaId,
         );
+=======
+      const numGradedAsCurrGrade = this.examples.filter(example => this.perCriteriaGrades[example.uid] && this.perCriteriaGrades[example.uid][criteriaId] === currGrade).length;
+      if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) {
+        numGPT4Calls += 1;
+        const criteria = this.evalCriteria.find(criteria => criteria.uid === criteriaId);
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
         if (criteria && criteria.eval_method === "expert") {
           numLLMCriteria += 1;
         }
@@ -566,9 +608,16 @@ export default class EvaluationFunctionExecutor {
     }
 
     return {
+<<<<<<< HEAD
       numGPT4Calls,
       numGPT35Calls: numLLMCriteria * 3 * this.examples.length,
     };
+=======
+      numGPT4Calls: numGPT4Calls,
+      numGPT35Calls: numLLMCriteria * 3 * this.examples.length,
+    };
+
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   }
 
   /**
@@ -580,12 +629,16 @@ export default class EvaluationFunctionExecutor {
    * @param exampleId The unique ID of the example being graded.
    * @param holisticGrade The developer-provided grade assigned to the example, "good" or "bad" or unknown.
    */
+<<<<<<< HEAD
   public setGradeForExample(
     exampleId: ResponseUID,
     perCriteriaGrades?: Dict<boolean | undefined>,
     holisticGrade?: string,
     annotation?: string,
   ): void {
+=======
+  public setGradeForExample(exampleId: ResponseUID, perCriteriaGrades?: Dict<boolean | undefined>, holisticGrade?: string, annotation?: string ): void {
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     if (holisticGrade !== null) {
       const boolHolistic = holisticGrade === "good";
       this.grades.set(exampleId, boolHolistic);
@@ -638,9 +691,13 @@ export default class EvaluationFunctionExecutor {
       }
     }
 
+<<<<<<< HEAD
     console.log(
       `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`,
     );
+=======
+    console.log(`Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`);
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   }
 
   /**
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 699d8abd6..4220fe941 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -282,7 +282,11 @@ export async function generateFunctionsForCriteria(
     criteria,
     promptTemplate,
     example,
+<<<<<<< HEAD
     badExample,
+=======
+    badExample
+>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   );
   console.log("Function generation prompt:", functionGenPrompt);
 

From 14ea6b4291bce1610ac93f3533da2259d37a07d8 Mon Sep 17 00:00:00 2001
From: Shreya Shankar <ss.shankar505@gmail.com>
Date: Fri, 27 Sep 2024 19:57:53 -0700
Subject: [PATCH 3/3] Integrating executor with report card screen and
 transition back to multi-eval node

---
 chainforge/react-server/src/EvalGenModal.tsx  | 198 ++++++------------
 .../react-server/src/ModelSettingSchemas.tsx  |   1 +
 chainforge/react-server/src/MultiEvalNode.tsx |  28 +--
 .../src/backend/evalgen/executor.ts           |  67 +-----
 .../src/backend/evalgen/typing.ts             |   6 -
 .../react-server/src/backend/evalgen/utils.ts |  14 +-
 chainforge/react-server/src/backend/models.ts |   1 +
 7 files changed, 95 insertions(+), 220 deletions(-)

diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx
index ab39d096f..0ad7f526c 100644
--- a/chainforge/react-server/src/EvalGenModal.tsx
+++ b/chainforge/react-server/src/EvalGenModal.tsx
@@ -65,7 +65,7 @@ import {
   RatingDict,
   ResponseUID,
 } from "./backend/typing";
-import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing";
+import { EvalCriteria, EvalFunction, EvalFunctionReport, EvalFunctionSetReport } from "./backend/evalgen/typing";
 import {
   IconChevronDown,
   IconChevronLeft,
@@ -267,7 +267,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
               onChangeGrade={onChangeGrade}
               getGradeCount={getGradeCount}
             />
-            <Contributor getStateValue={getStateValue} />
+            <Contributor getStateValue={getStateValue} style={{ size: 22, thickness: 4 }} />
 
             {/* Title of the criteria */}
             <TextInput
@@ -412,7 +412,7 @@ const CriteriaCard: React.FC<CriteriaCardProps> = ({
 export interface EvalGenModalRef {
   trigger: (
     resps: LLMResponse[],
-    setFinalReports: (reports: EvalGenReport) => void,
+    setFinalReports: (selectedFuncs: EvalFunction[]) => void,
   ) => void;
 }
 
@@ -422,6 +422,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
     const apiKeys = useStore((state) => state.apiKeys);
     const globalState = useStore((store) => store.state);
     const [criteria, setCriteria] = useState<EvalCriteria[]>([]);
+    const [reports, setReports] = useState<EvalFunctionSetReport | undefined>(undefined);
     const [criteriaForDisplay, setCriteriaForDisplay] = useState<
       EvalCriteria[]
     >([]);
@@ -536,21 +537,13 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
         const addLog = (message: string) => {
           setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]);
         };
-<<<<<<< HEAD
-=======
-
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
         const ex = new EvaluationFunctionExecutor(
           getLikelyPromptTemplateAsContext(responses),
           responses,
           criteria,
-<<<<<<< HEAD
           (gpt4Calls, gpt35Calls) => {
             // Callback to update GPT call counts
-=======
-          (gpt4Calls, gpt35Calls) => {  // Callback to update GPT call counts
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
             setNumGPT4Calls((num) => num + gpt4Calls);
             setNumGPT35Calls((num) => num + gpt35Calls);
           },
@@ -593,7 +586,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
 
     // const defaultOnFinish = (reports: string) => {};
     const [onFinish, setOnFinish] = useState({
-      setFinalRpts: (reports: EvalGenReport) => {
+      setFinalRpts: (reports: EvalFunction[]) => {
         // console.log("");
       },
     });
@@ -601,7 +594,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
     // Open the EvalGen wizard
     const trigger = (
       resps: LLMResponse[],
-      setFinalReports: (reports: EvalGenReport) => void,
+      setFinalReports: (reports: EvalFunction[]) => void,
     ) => {
       // We pass the responses here manually to ensure they remain the same
       // for the duration of one EvalGen operation.
@@ -609,7 +602,7 @@ const EvalGenModal = forwardRef<EvalGenModalRef, NonNullable<unknown>>(
       gotoNextScreen("response");
       // setFinalReports("A plenty response");
       setOnFinish({
-        setFinalRpts: (reports: EvalGenReport) => {
+        setFinalRpts: (reports: EvalFunction[]) => {
           close();
           setFinalReports(reports);
         },
@@ -775,26 +768,17 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       // Update annotation for current response (if any)
       // TODO: Fix this for generate case when num resp per prompt > 1
 
-<<<<<<< HEAD
       if (
         grades[shownResponse.uid] ||
         holisticGrade ||
         (annotation && annotation.trim())
       ) {
-=======
-      if (grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim())) {
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
         executor?.setGradeForExample(
           shownResponse.uid,
           grades[shownResponse.uid],
           holisticGrade,
-<<<<<<< HEAD
           annotation ? annotation.trim() : null,
         );
-=======
-          annotation ? annotation.trim() : null
-        ); 
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
       }
 
       if (
@@ -871,7 +855,6 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       updateShownResponseUniqueIndex();
     };
 
-<<<<<<< HEAD
     const updateShownResponseUniqueIndex = () => {
       let idx = 0;
       for (const resp of responses) {
@@ -902,13 +885,10 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       setShownResponse(responses[shownResponseIdx]);
     }, [shownResponseIdx]);
 
-=======
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     const estimateGPTCalls = () => {
       return executor
-        ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.`
+        ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-4o-mini calls.`
         : "# estimated GPT calls not available.";
-<<<<<<< HEAD
     };
 
     const updateCriteriaForDisplay = () => {
@@ -927,10 +907,22 @@ If you determine the feedback corresponds to a new criteria, your response shoul
       setScreen(screenName);
     };
 
+    const handleGradingDone = async () => {
+      await executor?.waitForCompletion();
+      const filteredFunctions = await executor?.filterEvaluationFunctions(0.25);
+      console.log("filteredFunctions", filteredFunctions);
+
+      // Per the original author's note, the filtered result has this schema:
+      //   failureCoverage: coverage, falseFailureRate,
+      //   selectedEvalFunctions: bestEvalFunctions,
+      //   allEvalFunctionReports: evalFunctionReport
+      // It is undefined when `executor` is unset (optional chaining above).
+      // NOTE(review): what the 0.25 argument means is not visible here — confirm.
+      // Stored in `reports`, which ReportCardView receives as its `report` prop.
+      setReports(filteredFunctions);
+    };
+
     // const [onFinish, setOnFinish] = useState(null);
-=======
-    }
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
     return (
       <Modal
@@ -941,7 +933,6 @@ If you determine the feedback corresponds to a new criteria, your response shoul
         closeOnClickOutside={true}
         style={{ position: "relative", left: "-5%" }}
       >
-<<<<<<< HEAD
         {screen === "response" && (
           <Grid h={window?.innerHeight * 0.8}>
             <Grid.Col span={8}>
@@ -959,22 +950,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul
                   gotoPrevResponse={prevResponse2}
                   estimateGPTCalls={estimateGPTCalls}
                   gotoNextScreen={gotoNextScreen}
+                  handleGradingDone={handleGradingDone}
                 />
-=======
-        <Grid h={window?.innerHeight * 0.8}>
-          <Grid.Col span={8}>
-            <Stack justify="space-between">
-              {/* View showing the response the user is currently grading */}
-              <GradingView
-                shownResponse={shownResponse}
-                numGPT4Calls={numGPT4Calls}
-                numGPT35Calls={numGPT35Calls}
-                logs={logs}
-                gotoNextResponse={nextResponse}
-                gotoPrevResponse={prevResponse}
-                estimateGPTCalls={estimateGPTCalls}
-              />
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
                 {/* Progress bar */}
                 {/* <Flex justify="left" align="center" gap="md">
@@ -1150,12 +1127,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul
         {screen === "report" && (
           <Grid>
             <ReportCardView
-              report={{
-                criteria: criteria,
-                failureCoverage: 99.2,
-                falseFailureRate: 66.7,
-              }}
-              onFinish={(reports: EvalGenReport) => {
+              report={reports}
+              onFinish={(reports: EvalFunction[]) => {
                 onFinish.setFinalRpts(reports);
               }}
               getGradeCount={(crit: EvalCriteria, grade: boolean) => {
@@ -1186,40 +1159,30 @@ const HeaderText = ({ children }: { children: ReactNode }) => {
 
 interface GradingViewProps {
   shownResponse: LLMResponse | undefined;
-<<<<<<< HEAD
   shownResponseIdx: number;
   responseCount: number;
-=======
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   numGPT4Calls: number;
   numGPT35Calls: number;
   logs: { date: Date; message: string }[];
   gotoPrevResponse: () => void;
   gotoNextResponse: () => void;
   estimateGPTCalls: () => string;
-<<<<<<< HEAD
   gotoNextScreen: (screenName: string) => void;
-=======
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
+  handleGradingDone: () => void;
 }
 
 const GradingView: React.FC<GradingViewProps> = ({
   shownResponse,
-<<<<<<< HEAD
   shownResponseIdx,
   responseCount,
-=======
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   numGPT4Calls,
   numGPT35Calls,
   logs,
   gotoPrevResponse,
   gotoNextResponse,
   estimateGPTCalls,
-<<<<<<< HEAD
   gotoNextScreen,
-=======
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
+  handleGradingDone,
 }) => {
   // Calculate inner values only when shownResponse changes
   const responseText = useMemo(
@@ -1309,16 +1272,7 @@ const GradingView: React.FC<GradingViewProps> = ({
           </div>
 
           {/* Go forward to the next response */}
-<<<<<<< HEAD
           <Tooltip label={estimateGPTCalls()} withArrow>
-=======
-          <Tooltip
-            label={
-              estimateGPTCalls()
-            }
-            withArrow
-          >
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
             <Button variant="white" color="dark" onClick={gotoNextResponse}>
               <IconChevronRight />
             </Button>
@@ -1367,25 +1321,17 @@ const GradingView: React.FC<GradingViewProps> = ({
         </Flex>
         <Flex direction="column">
           <Flex justify="space-between" align="center">
-<<<<<<< HEAD
             <Text size="lg" weight={500} mb="sm">
               LLM Activity
             </Text>
             {/* GPT Call Tally */}
             <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
               Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "}
-              GPT-3.5-Turbo-16k calls.
-=======
-            <Text size="lg" weight={500} mb="sm">LLM Activity</Text>
-            {/* GPT Call Tally */}
-            <Text size="sm" color="dark" style={{ fontStyle: "italic" }}>
-              Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls} GPT-3.5-Turbo-16k calls.
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
+              GPT-4o-mini calls.
             </Text>
           </Flex>
           <div
             style={{
-<<<<<<< HEAD
               backgroundColor: "#f0f0f0",
               color: "#333",
               fontFamily: "monospace",
@@ -1395,17 +1341,6 @@ const GradingView: React.FC<GradingViewProps> = ({
               overflowY: "auto",
               borderRadius: "8px",
               border: "1px solid #ddd",
-=======
-              backgroundColor: "#f0f0f0", 
-              color: "#333",
-              fontFamily: "monospace",
-              padding: "12px",
-              width: "calc(100% - 30px)", 
-              height: "200px",
-              overflowY: "auto",
-              borderRadius: "8px",
-              border: "1px solid #ddd", 
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
               marginRight: "20px", // Space on the right
             }}
             ref={(el) => {
@@ -1416,13 +1351,9 @@ const GradingView: React.FC<GradingViewProps> = ({
           >
             {logs.map((log, index) => (
               <div key={index}>
-<<<<<<< HEAD
                 <span style={{ color: "#4A90E2" }}>
                   {log.date.toLocaleString()} -{" "}
                 </span>
-=======
-                <span style={{ color: '#4A90E2' }}>{log.date.toLocaleString()} - </span> 
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
                 <span>{log.message}</span>
               </div>
             ))}
@@ -1435,8 +1366,10 @@ const GradingView: React.FC<GradingViewProps> = ({
             leftIcon={<IconSparkles size={14} />}
             variant="gradient"
             gradient={{ from: "blue", to: "green", deg: 45 }}
-            onClick={() => {
+            onClick={async () => {
               // console.log("(3) gotoNextScreen", gotoNextScreen);
+              // Get the evaluation functions
+              await handleGradingDone();
               gotoNextScreen("report");
             }}
           >
@@ -1449,9 +1382,9 @@ const GradingView: React.FC<GradingViewProps> = ({
 };
 
 interface ReportCardViewProps {
-  report: EvalGenReport;
+  report: EvalFunctionSetReport;
   // recomputeAlignment,
-  onFinish: (reports: EvalGenReport) => void;
+  onFinish: (reports: EvalFunction[]) => void;
   getGradeCount: (crit: EvalCriteria, grade: boolean) => number;
   getStateValue: (stateId: number) => number;
 }
@@ -1464,41 +1397,44 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({
   getGradeCount,
   getStateValue,
 }) => {
-  // The criteria cards, now with report information
 
-  const [finalReport, setFinalReport] = useState(report);
+  const [selectedEvalFunctions, setSelectedEvalFunctions] = useState<EvalFunction[]>(report.selectedEvalFunctions);
 
   const onSelect = (criterion: EvalCriteria, isSelected: boolean) => {
     if (isSelected) {
-      finalReport.criteria.push(criterion);
+      // Functional updates read the latest state, so toggles made through a stale (memoized) callback are not lost.
+      const matchingFunction = report.selectedEvalFunctions.find(func => func.evalCriteria === criterion);
+      if (!matchingFunction) return;
+      setSelectedEvalFunctions((prev) => prev.includes(matchingFunction) ? prev : [...prev, matchingFunction]);
     } else {
-      finalReport.criteria = finalReport.criteria.filter(
-        (c) => c !== criterion,
-      );
+      setSelectedEvalFunctions((prev) => prev.filter(func => func.evalCriteria !== criterion));
     }
-    setFinalReport(finalReport);
-  };
+  }
+
+  // The criteria cards, now with report information
   const cards = useMemo(() => {
     const res = [];
 
     // Iterate through selected eval functions and create cards
-    // for (const selectedFunc of report.selectedEvalFunctions) {
-    //   const crit = selectedFunc.evalCriteria;
-    //   // Find corresponding report in allEvalFunctionReports map from criteria to list
-    //   const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
-    //   const evalFuncReport = critEvalFuncReports.find(
-    //     (rep) => rep.evalFunction === selectedFunc,
-    //   );
-
-    //   // Get the functions that were not selected for this criteria
-    //   const otherFuncs = critEvalFuncReports.filter(
-    //     (rep) => rep.evalFunction !== selectedFunc,
-    //   );
-    for (const crit of report.criteria) {
+    for (const selectedFunc of report.selectedEvalFunctions) {
+      const crit = selectedFunc.evalCriteria;
+      // Find corresponding report in allEvalFunctionReports map from criteria to list
+      const critEvalFuncReports = report.allEvalFunctionReports.get(crit);
+      const evalFuncReport = critEvalFuncReports.find(
+        (rep) => rep.evalFunction === selectedFunc,
+      );
+
+      // Get the functions that were not selected for this criteria
+      const otherFuncs = critEvalFuncReports.filter(
+        (rep) => rep.evalFunction !== selectedFunc,
+      );
+
       res.push(
         <ReportCriteriaCard
           criterion={crit}
           key={crit.uid}
+          evalFunctionReport={evalFuncReport}
+          otherFunctions={otherFuncs}
           // onCheck={(checked) => {
           //   crit.selected = checked;
           //   recomputeAlignment();
@@ -1562,7 +1498,7 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({
           <Button
             onClick={() => {
               // console.log("finalReport", finalReport);
-              onFinish(finalReport);
+              onFinish(selectedEvalFunctions);
             }}
           >
             Finish with selected evaluators
@@ -1575,6 +1511,8 @@ const ReportCardView: React.FC<ReportCardViewProps> = ({
 
 interface ReportCriteriaCardProps {
   criterion: EvalCriteria;
+  evalFunctionReport: EvalFunctionReport;
+  otherFunctions: EvalFunctionReport[];
   // onChange: (changedCriteria: EvalCriteria) => void;
   // onDelete: () => void;
   // initiallyOpen?: boolean;
@@ -1587,6 +1525,8 @@ interface ReportCriteriaCardProps {
 
 const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
   criterion,
+  evalFunctionReport,
+  otherFunctions,
   // onChange,
   // onDelete,
   // initiallyOpen,
@@ -1601,12 +1541,12 @@ const ReportCriteriaCard: React.FC<ReportCriteriaCardProps> = ({
   const [checked, setChecked] = useState(true);
 
   // Simulates eval functions that are expected to be passed in later on (TODO)
-  const evalFuncs = [
-    { evalFunction: { code: "To be provided (1) ..." } },
-    { evalFunction: { code: "To be provided (2) ..." } },
-    { evalFunction: { code: "To be provided (3) ..." } },
-  ];
-  const unselectedImplementations = evalFuncs.map((item) => (
+  // const evalFuncs = [
+  //   { evalFunction: { code: "To be provided (1) ..." } },
+  //   { evalFunction: { code: "To be provided (2) ..." } },
+  //   { evalFunction: { code: "To be provided (3) ..." } },
+  // ];
+  const unselectedImplementations = otherFunctions.map((item) => (
     <div key={uuid()}>
       <Code style={{ whiteSpace: "pre-wrap" }} key={uuid()}>
         {item.evalFunction.code}
diff --git a/chainforge/react-server/src/ModelSettingSchemas.tsx b/chainforge/react-server/src/ModelSettingSchemas.tsx
index 4f9ad42ca..7d6372cf9 100644
--- a/chainforge/react-server/src/ModelSettingSchemas.tsx
+++ b/chainforge/react-server/src/ModelSettingSchemas.tsx
@@ -58,6 +58,7 @@ const ChatGPTSettings: ModelSettingsDict = {
           "gpt-3.5-turbo",
           "gpt-4-turbo",
           "gpt-4o",
+          "gpt-4o-mini",
           "gpt-4",
           "gpt-4-turbo-2024-04-09",
           "gpt-4-turbo-preview",
diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx
index 961cb905c..b0d0de0d3 100644
--- a/chainforge/react-server/src/MultiEvalNode.tsx
+++ b/chainforge/react-server/src/MultiEvalNode.tsx
@@ -60,10 +60,9 @@ import { Dict, LLMResponse, QueryProgress } from "./backend/typing";
 import { AlertModalContext } from "./AlertModal";
 import { Status } from "./StatusIndicatorComponent";
 import EvalGenModal, {
-  EvalGenModalRef,
-  ReportCardScreen,
+  EvalGenModalRef
 } from "./EvalGenModal";
-import { EvalGenReport } from "./backend/evalgen/typing";
+import { EvalFunction } from "./backend/evalgen/typing";
 
 const IS_RUNNING_LOCALLY = APP_IS_RUNNING_LOCALLY();
 
@@ -663,32 +662,33 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
     evalGenModalRef.current?.trigger(resps, onFinalReportsReady);
   };
 
-  const onFinalReportsReady = (reports: EvalGenReport) => {
+
+  const onFinalReportsReady = (selectedFunctions: EvalFunction[]) => {
     // Placeholder for process the final reports returned from EvalGenModel
-    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final reports", reports);
+    console.log("!!!!!!!!!!!!!!!!!!!!!!!!!! final functions", selectedFunctions);
     // let kkk = 1;
-    for (const crit of reports.criteria) {
+    for (const func of selectedFunctions) {
       // setTimeout(() => {
       // console.log("crit", crit);
-      if (crit.eval_method === "code") {
+      if (func.evalCriteria.eval_method === "code") {
         // Python
         addEvaluator(
-          crit.shortname,
+          func.evalCriteria.shortname,
           "python",
           {
-            code: "def evaluate(r):\n\treturn len(r.text)", // to be populated once python code is implemented for the criteria
+            code: func.code, // generated Python implementation for this criterion
             sandbox: true,
           },
           false,
         );
-      } else if (crit.eval_method === "expert") {
+      } else if (func.evalCriteria.eval_method === "expert") {
         // LLM
         addEvaluator(
-          crit.shortname,
+          func.evalCriteria.shortname,
           "llm",
           {
             // to be populated once LLM code is implemented for the criteria
-            prompt: "",
+            prompt: func.code,
             format: "bin",
           },
           false,
@@ -696,10 +696,10 @@ const MultiEvalNode: React.FC<MultiEvalNodeProps> = ({ data, id }) => {
       } else {
         // JavaScript
         addEvaluator(
-          crit.shortname,
+          func.evalCriteria.shortname,
           "javascript",
           {
-            code: "function evaluate(r) {\n\treturn r.text.length;\n}", // to be populated once javascript code is implemented for the criteria
+            code: func.code,
           },
           false,
         );
diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts
index 00b2287df..0fe82c013 100644
--- a/chainforge/react-server/src/backend/evalgen/executor.ts
+++ b/chainforge/react-server/src/backend/evalgen/executor.ts
@@ -145,8 +145,6 @@ export default class EvaluationFunctionExecutor {
     this.logFunction = addLog;
   }
 
-
-
   /**
    * Starts the background computation for generating and executing evaluation functions.
    * This method initiates the tasks but does not wait for them to complete.
@@ -197,7 +195,6 @@ export default class EvaluationFunctionExecutor {
     const functionExecutionPromises: Promise<any>[] = [];
 
     emitter.on("functionGenerated", (evalFunction) => {
-
       const executionPromise = (async () => {
         this.evalFunctions.push(evalFunction);
         const executionPromises = this.examples.map(async (example) => {
@@ -229,11 +226,6 @@ export default class EvaluationFunctionExecutor {
             this.updateGPTCalls(0, 1);
           }
 
-          // Update GPT-3.5 call count by 1 if the eval method is expert
-          if (evalFunction.evalCriteria.eval_method === "expert") {
-            this.updateGPTCalls(0, 1);
-          }
-
           if (onProgress) {
             onProgress({
               success:
@@ -259,17 +251,11 @@ export default class EvaluationFunctionExecutor {
       functionExecutionPromises.push(executionPromise);
     });
 
-<<<<<<< HEAD
     const badExample = this.examples.find(
       (example) =>
         this.perCriteriaGrades[criteria.uid]?.[example.uid] === false,
     );
 
-=======
-    const badExample = this.examples.find(example => this.perCriteriaGrades[criteria.uid]?.[example.uid] === false);
-
-  
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     await generateFunctionsForCriteria(
       criteria,
       this.promptTemplate,
@@ -281,17 +267,12 @@ export default class EvaluationFunctionExecutor {
     this.updateGPTCalls(1, 0);
 
     console.log(`Generated functions for criteria: ${criteria.shortname}`);
-<<<<<<< HEAD
     console.log(
       `Number of functions generated: ${functionExecutionPromises.length}`,
     );
     this.logFunction(
       `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`,
     );
-=======
-    console.log(`Number of functions generated: ${functionExecutionPromises.length}`);
-    this.logFunction(`Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`);
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
     await Promise.all(functionExecutionPromises);
   }
@@ -325,13 +306,9 @@ export default class EvaluationFunctionExecutor {
 
     // Listen for generated functions and execute them as they come in
     emitter.on("functionGenerated", (evalFunction) => {
-<<<<<<< HEAD
       this.logFunction(
         `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`,
       );
-=======
-      this.logFunction(`Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`);
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
 
       // Capture the execution promise of each function
       const executionPromise = (async () => {
@@ -368,11 +345,6 @@ export default class EvaluationFunctionExecutor {
             this.updateGPTCalls(0, 1);
           }
 
-          // Update GPT-3.5 call count by 1 if the eval method is expert
-          if (evalFunction.evalCriteria.eval_method === "expert") {
-            this.updateGPTCalls(0, 1);
-          }
-
           funcsExecuted++;
           if (onProgress) {
             onProgress({
@@ -424,13 +396,9 @@ export default class EvaluationFunctionExecutor {
           console.log(
             "All evaluation functions have been generated and executed.",
           );
-<<<<<<< HEAD
           this.logFunction(
             "All initially-generated evaluation functions have been generated and executed.",
           );
-=======
-          this.logFunction("All initially-generated evaluation functions have been generated and executed.");
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
           if (resolveAllFunctionsGenerated) {
             resolveAllFunctionsGenerated(); // Resolve the promise when all functions have been generated and executed
           }
@@ -447,7 +415,10 @@ export default class EvaluationFunctionExecutor {
     // Wait for the 'allFunctionsGenerated' event, which now waits for all executions
     await allFunctionsGeneratedPromise;
   }
-  public generateNewImplementationsForCriteria(criteriaID: EvalCriteriaUID): void {
+
+  public generateNewImplementationsForCriteria(
+    criteriaID: EvalCriteriaUID,
+  ): void {
     const crit = this.evalCriteria.find((c) => c.uid === criteriaID);
     if (!crit) {
       throw new Error(`Criteria with ID ${criteriaID} not found.`);
@@ -458,7 +429,6 @@ export default class EvaluationFunctionExecutor {
     }
   }
 
-
   /**
    * Adds another evaluation criteria and triggers the generation and execution of evaluation functions for the new criteria.
    * This method allows the client to add new evaluation criteria after the executor has been initialized.
@@ -571,20 +541,14 @@ export default class EvaluationFunctionExecutor {
     return new Map(this.grades);
   }
 
-<<<<<<< HEAD
   public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): {
     numGPT4Calls: number;
     numGPT35Calls: number;
   } {
-=======
-  public estimateNumGPTCalls(perCriteriaGrades: Dict<boolean>): { numGPT4Calls: number; numGPT35Calls: number }{
-
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     let numGPT4Calls = 0;
     let numLLMCriteria = 0;
     for (const criteriaId in perCriteriaGrades) {
       const currGrade = perCriteriaGrades[criteriaId];
-<<<<<<< HEAD
       const numGradedAsCurrGrade = this.examples.filter(
         (example) =>
           this.perCriteriaGrades[example.uid] &&
@@ -595,12 +559,6 @@ export default class EvaluationFunctionExecutor {
         const criteria = this.evalCriteria.find(
           (criteria) => criteria.uid === criteriaId,
         );
-=======
-      const numGradedAsCurrGrade = this.examples.filter(example => this.perCriteriaGrades[example.uid] && this.perCriteriaGrades[example.uid][criteriaId] === currGrade).length;
-      if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) {
-        numGPT4Calls += 1;
-        const criteria = this.evalCriteria.find(criteria => criteria.uid === criteriaId);
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
         if (criteria && criteria.eval_method === "expert") {
           numLLMCriteria += 1;
         }
@@ -608,16 +566,9 @@ export default class EvaluationFunctionExecutor {
     }
 
     return {
-<<<<<<< HEAD
       numGPT4Calls,
       numGPT35Calls: numLLMCriteria * 3 * this.examples.length,
     };
-=======
-      numGPT4Calls: numGPT4Calls,
-      numGPT35Calls: numLLMCriteria * 3 * this.examples.length,
-    };
-
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   }
 
   /**
@@ -629,16 +580,12 @@ export default class EvaluationFunctionExecutor {
    * @param exampleId The unique ID of the example being graded.
    * @param holisticGrade The developer-provided grade assigned to the example, "good" or "bad" or unknown.
    */
-<<<<<<< HEAD
   public setGradeForExample(
     exampleId: ResponseUID,
     perCriteriaGrades?: Dict<boolean | undefined>,
     holisticGrade?: string,
     annotation?: string,
   ): void {
-=======
-  public setGradeForExample(exampleId: ResponseUID, perCriteriaGrades?: Dict<boolean | undefined>, holisticGrade?: string, annotation?: string ): void {
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
     if (holisticGrade !== null) {
       const boolHolistic = holisticGrade === "good";
       this.grades.set(exampleId, boolHolistic);
@@ -691,13 +638,9 @@ export default class EvaluationFunctionExecutor {
       }
     }
 
-<<<<<<< HEAD
     console.log(
       `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`,
     );
-=======
-    console.log(`Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`);
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   }
 
   /**
@@ -1142,4 +1085,4 @@ export default class EvaluationFunctionExecutor {
 
     return outcomes;
   }
-}
+}
\ No newline at end of file
diff --git a/chainforge/react-server/src/backend/evalgen/typing.ts b/chainforge/react-server/src/backend/evalgen/typing.ts
index e9e6cd24d..f5952013c 100644
--- a/chainforge/react-server/src/backend/evalgen/typing.ts
+++ b/chainforge/react-server/src/backend/evalgen/typing.ts
@@ -11,12 +11,6 @@ export interface EvalCriteria {
   source?: string;
 }
 
-export interface EvalGenReport {
-  criteria: EvalCriteria[];
-  failureCoverage: number;
-  falseFailureRate: number;
-}
-
 export function validEvalCriteriaFormat(json_obj: Dict) {
   return (
     "criteria" in json_obj &&
diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts
index 4220fe941..29aa2c3e1 100644
--- a/chainforge/react-server/src/backend/evalgen/utils.ts
+++ b/chainforge/react-server/src/backend/evalgen/utils.ts
@@ -113,8 +113,8 @@ export async function generateLLMEvaluationCriteria(
 export async function executeLLMEval(
   evalFunction: EvalFunction,
   example: LLMResponse,
-  positiveExample: LLMResponse,
-  negativeExample: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
 ): Promise<EvalFunctionResult> {
   // Construct call to an LLM to evaluate the example
   const evalPrompt =
@@ -147,7 +147,7 @@ export async function executeLLMEval(
 
   const result = await simpleQueryLLM(
     evalPrompt, // prompt
-    "gpt-3.5-turbo-16k", // llm
+    "gpt-4o-mini", // llm
     systemMessage, // system_msg
   );
   // Get the output
@@ -220,8 +220,8 @@ export async function execJSFunc(
 export async function execPyFunc(
   evalFunction: EvalFunction,
   example: LLMResponse,
-  positiveExample: LLMResponse,
-  negativeExample: LLMResponse,
+  positiveExample?: LLMResponse,
+  negativeExample?: LLMResponse,
 ): Promise<EvalFunctionResult> {
   try {
     // We need to replace the function name with "evaluate", which is what is expected by backend:
@@ -282,11 +282,7 @@ export async function generateFunctionsForCriteria(
     criteria,
     promptTemplate,
     example,
-<<<<<<< HEAD
     badExample,
-=======
-    badExample
->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
   );
   console.log("Function generation prompt:", functionGenPrompt);
 
diff --git a/chainforge/react-server/src/backend/models.ts b/chainforge/react-server/src/backend/models.ts
index 272b65f3d..20d595767 100644
--- a/chainforge/react-server/src/backend/models.ts
+++ b/chainforge/react-server/src/backend/models.ts
@@ -22,6 +22,7 @@ export enum NativeLLM {
   OpenAI_GPT4_Turbo = "gpt-4-turbo",
   OpenAI_GPT4_Turbo_0409 = "gpt-4-turbo-2024-04-09",
   OpenAI_GPT4_O = "gpt-4o",
+  OpenAI_GPT4_O_mini = "gpt-4o-mini",
   OpenAI_GPT4_32k = "gpt-4-32k",
   OpenAI_GPT4_32k_0314 = "gpt-4-32k-0314",
   OpenAI_GPT4_32k_0613 = "gpt-4-32k-0613",