From ddadb7a455bf454c0412e3210889b7cc55400781 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Fri, 27 Sep 2024 11:29:08 -0700 Subject: [PATCH 1/3] Merge with Helen changes --- chainforge/react-server/src/backend/evalgen/executor.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts index 09d9e0d8e..73b22f587 100644 --- a/chainforge/react-server/src/backend/evalgen/executor.ts +++ b/chainforge/react-server/src/backend/evalgen/executor.ts @@ -145,6 +145,8 @@ export default class EvaluationFunctionExecutor { this.logFunction = addLog; } + + /** * Starts the background computation for generating and executing evaluation functions. * This method initiates the tasks but does not wait for them to complete. @@ -415,10 +417,7 @@ export default class EvaluationFunctionExecutor { // Wait for the 'allFunctionsGenerated' event, which now waits for all executions await allFunctionsGeneratedPromise; } - - public generateNewImplementationsForCriteria( - criteriaID: EvalCriteriaUID, - ): void { + public generateNewImplementationsForCriteria(criteriaID: EvalCriteriaUID): void { const crit = this.evalCriteria.find((c) => c.uid === criteriaID); if (!crit) { throw new Error(`Criteria with ID ${criteriaID} not found.`); @@ -429,6 +428,7 @@ export default class EvaluationFunctionExecutor { } } + /** * Adds another evaluation criteria and triggers the generation and execution of evaluation functions for the new criteria. * This method allows the client to add new evaluation criteria after the executor has been initialized. From 4c1880703ad880184f0360119a89e7dc027ae7fe Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Mon, 24 Jun 2024 15:53:32 -0700 Subject: [PATCH 2/3] Adding UI indicators of how many LLM calls are executed --- chainforge/react-server/src/EvalGenModal.tsx | 84 +++++++++++++++++++ .../src/backend/evalgen/executor.ts | 57 +++++++++++++ .../react-server/src/backend/evalgen/utils.ts | 4 + 3 files changed, 145 insertions(+) diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index 6e5b5f3dc..ab39d096f 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -536,13 +536,21 @@ const EvalGenModal = forwardRef>( const addLog = (message: string) => { setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]); }; +<<<<<<< HEAD +======= + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) const ex = new EvaluationFunctionExecutor( getLikelyPromptTemplateAsContext(responses), responses, criteria, +<<<<<<< HEAD (gpt4Calls, gpt35Calls) => { // Callback to update GPT call counts +======= + (gpt4Calls, gpt35Calls) => { // Callback to update GPT call counts +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) setNumGPT4Calls((num) => num + gpt4Calls); setNumGPT35Calls((num) => num + gpt35Calls); }, @@ -767,17 +775,26 @@ If you determine the feedback corresponds to a new criteria, your response shoul // Update annotation for current response (if any) // TODO: Fix this for generate case when num resp per prompt > 1 +<<<<<<< HEAD if ( grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim()) ) { +======= + if (grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim())) { +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) executor?.setGradeForExample( shownResponse.uid, grades[shownResponse.uid], holisticGrade, +<<<<<<< HEAD annotation ? annotation.trim() : null, ); +======= + annotation ? annotation.trim() : null + ); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) } if ( @@ -854,6 +871,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul updateShownResponseUniqueIndex(); }; +<<<<<<< HEAD const updateShownResponseUniqueIndex = () => { let idx = 0; for (const resp of responses) { @@ -884,10 +902,13 @@ If you determine the feedback corresponds to a new criteria, your response shoul setShownResponse(responses[shownResponseIdx]); }, [shownResponseIdx]); +======= +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) const estimateGPTCalls = () => { return executor ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.` : "# estimated GPT calls not available."; +<<<<<<< HEAD }; const updateCriteriaForDisplay = () => { @@ -907,6 +928,9 @@ If you determine the feedback corresponds to a new criteria, your response shoul }; // const [onFinish, setOnFinish] = useState(null); +======= + } +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) return ( +<<<<<<< HEAD {screen === "response" && ( @@ -935,6 +960,21 @@ If you determine the feedback corresponds to a new criteria, your response shoul estimateGPTCalls={estimateGPTCalls} gotoNextScreen={gotoNextScreen} /> +======= + + + + {/* View showing the response the user is currently grading */} + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) {/* Progress bar */} {/* @@ -1146,28 +1186,40 @@ const HeaderText = ({ children }: { children: ReactNode }) => { interface GradingViewProps { shownResponse: LLMResponse | undefined; +<<<<<<< HEAD shownResponseIdx: number; responseCount: number; +======= +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) numGPT4Calls: number; numGPT35Calls: number; logs: { date: Date; message: string }[]; gotoPrevResponse: () => void; gotoNextResponse: () => void; estimateGPTCalls: () => string; +<<<<<<< HEAD gotoNextScreen: (screenName: string) => void; +======= +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) } const GradingView: React.FC = ({ shownResponse, +<<<<<<< HEAD shownResponseIdx, responseCount, +======= +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) numGPT4Calls, numGPT35Calls, logs, gotoPrevResponse, gotoNextResponse, estimateGPTCalls, +<<<<<<< HEAD gotoNextScreen, +======= +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) }) => { // Calculate inner values only when shownResponse changes const responseText = useMemo( @@ -1257,7 +1309,16 @@ const GradingView: React.FC = ({ {/* Go forward to the next response */} +<<<<<<< HEAD +======= + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) @@ -1306,6 +1367,7 @@ const GradingView: React.FC = ({ +<<<<<<< HEAD LLM Activity @@ -1313,10 +1375,17 @@ const GradingView: React.FC = ({ Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "} GPT-3.5-Turbo-16k calls. +======= + LLM Activity + {/* GPT Call Tally */} + + Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls} GPT-3.5-Turbo-16k calls. +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed)
= ({ overflowY: "auto", borderRadius: "8px", border: "1px solid #ddd", +======= + backgroundColor: "#f0f0f0", + color: "#333", + fontFamily: "monospace", + padding: "12px", + width: "calc(100% - 30px)", + height: "200px", + overflowY: "auto", + borderRadius: "8px", + border: "1px solid #ddd", +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) marginRight: "20px", // Space on the right }} ref={(el) => { @@ -1336,9 +1416,13 @@ const GradingView: React.FC = ({ > {logs.map((log, index) => (
+<<<<<<< HEAD {log.date.toLocaleString()} -{" "} +======= + {log.date.toLocaleString()} - +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) {log.message}
))} diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts index 73b22f587..00b2287df 100644 --- a/chainforge/react-server/src/backend/evalgen/executor.ts +++ b/chainforge/react-server/src/backend/evalgen/executor.ts @@ -197,6 +197,7 @@ export default class EvaluationFunctionExecutor { const functionExecutionPromises: Promise[] = []; emitter.on("functionGenerated", (evalFunction) => { + const executionPromise = (async () => { this.evalFunctions.push(evalFunction); const executionPromises = this.examples.map(async (example) => { @@ -228,6 +229,11 @@ export default class EvaluationFunctionExecutor { this.updateGPTCalls(0, 1); } + // Update GPT-3.5 call count by 1 if the eval method is expert + if (evalFunction.evalCriteria.eval_method === "expert") { + this.updateGPTCalls(0, 1); + } + if (onProgress) { onProgress({ success: @@ -253,11 +259,17 @@ export default class EvaluationFunctionExecutor { functionExecutionPromises.push(executionPromise); }); +<<<<<<< HEAD const badExample = this.examples.find( (example) => this.perCriteriaGrades[criteria.uid]?.[example.uid] === false, ); +======= + const badExample = this.examples.find(example => this.perCriteriaGrades[criteria.uid]?.[example.uid] === false); + + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) await generateFunctionsForCriteria( criteria, this.promptTemplate, @@ -269,12 +281,17 @@ export default class EvaluationFunctionExecutor { this.updateGPTCalls(1, 0); console.log(`Generated functions for criteria: ${criteria.shortname}`); +<<<<<<< HEAD console.log( `Number of functions generated: ${functionExecutionPromises.length}`, ); this.logFunction( `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`, ); +======= + console.log(`Number of functions generated: ${functionExecutionPromises.length}`); + this.logFunction(`Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) await Promise.all(functionExecutionPromises); } @@ -308,9 +325,13 @@ export default class EvaluationFunctionExecutor { // Listen for generated functions and execute them as they come in emitter.on("functionGenerated", (evalFunction) => { +<<<<<<< HEAD this.logFunction( `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`, ); +======= + this.logFunction(`Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) // Capture the execution promise of each function const executionPromise = (async () => { @@ -347,6 +368,11 @@ export default class EvaluationFunctionExecutor { this.updateGPTCalls(0, 1); } + // Update GPT-3.5 call count by 1 if the eval method is expert + if (evalFunction.evalCriteria.eval_method === "expert") { + this.updateGPTCalls(0, 1); + } + funcsExecuted++; if (onProgress) { onProgress({ @@ -398,9 +424,13 @@ export default class EvaluationFunctionExecutor { console.log( "All evaluation functions have been generated and executed.", ); +<<<<<<< HEAD this.logFunction( "All initially-generated evaluation functions have been generated and executed.", ); +======= + this.logFunction("All initially-generated evaluation functions have been generated and executed."); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) if (resolveAllFunctionsGenerated) { resolveAllFunctionsGenerated(); // Resolve the promise when all functions have been generated and executed } @@ -541,14 +571,20 @@ export default class EvaluationFunctionExecutor { return new Map(this.grades); } +<<<<<<< HEAD public estimateNumGPTCalls(perCriteriaGrades: Dict): { numGPT4Calls: number; numGPT35Calls: number; } { +======= + public estimateNumGPTCalls(perCriteriaGrades: Dict): { numGPT4Calls: number; numGPT35Calls: number }{ + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) let numGPT4Calls = 0; let numLLMCriteria = 0; for (const criteriaId in perCriteriaGrades) { const currGrade = perCriteriaGrades[criteriaId]; +<<<<<<< HEAD const numGradedAsCurrGrade = this.examples.filter( (example) => this.perCriteriaGrades[example.uid] && @@ -559,6 +595,12 @@ export default class EvaluationFunctionExecutor { const criteria = this.evalCriteria.find( (criteria) => criteria.uid === criteriaId, ); +======= + const numGradedAsCurrGrade = this.examples.filter(example => this.perCriteriaGrades[example.uid] && this.perCriteriaGrades[example.uid][criteriaId] === currGrade).length; + if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) { + numGPT4Calls += 1; + const criteria = this.evalCriteria.find(criteria => criteria.uid === criteriaId); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) if (criteria && criteria.eval_method === "expert") { numLLMCriteria += 1; } @@ -566,9 +608,16 @@ export default class EvaluationFunctionExecutor { } return { +<<<<<<< HEAD numGPT4Calls, numGPT35Calls: numLLMCriteria * 3 * this.examples.length, }; +======= + numGPT4Calls: numGPT4Calls, + numGPT35Calls: numLLMCriteria * 3 * this.examples.length, + }; + +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) } /** @@ -580,12 +629,16 @@ export default class EvaluationFunctionExecutor { * @param exampleId The unique ID of the example being graded. * @param holisticGrade The developer-provided grade assigned to the example, "good" or "bad" or unknown. */ +<<<<<<< HEAD public setGradeForExample( exampleId: ResponseUID, perCriteriaGrades?: Dict, holisticGrade?: string, annotation?: string, ): void { +======= + public setGradeForExample(exampleId: ResponseUID, perCriteriaGrades?: Dict, holisticGrade?: string, annotation?: string ): void { +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) if (holisticGrade !== null) { const boolHolistic = holisticGrade === "good"; this.grades.set(exampleId, boolHolistic); @@ -638,9 +691,13 @@ export default class EvaluationFunctionExecutor { } } +<<<<<<< HEAD console.log( `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`, ); +======= + console.log(`Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`); +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) } /** diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts index 699d8abd6..4220fe941 100644 --- a/chainforge/react-server/src/backend/evalgen/utils.ts +++ b/chainforge/react-server/src/backend/evalgen/utils.ts @@ -282,7 +282,11 @@ export async function generateFunctionsForCriteria( criteria, promptTemplate, example, +<<<<<<< HEAD badExample, +======= + badExample +>>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) ); console.log("Function generation prompt:", functionGenPrompt); From 14ea6b4291bce1610ac93f3533da2259d37a07d8 Mon Sep 17 00:00:00 2001 From: Shreya Shankar Date: Fri, 27 Sep 2024 19:57:53 -0700 Subject: [PATCH 3/3] Integrating executor with report card screen and transition back to multi-eval node --- chainforge/react-server/src/EvalGenModal.tsx | 198 ++++++------------ .../react-server/src/ModelSettingSchemas.tsx | 1 + chainforge/react-server/src/MultiEvalNode.tsx | 28 +-- .../src/backend/evalgen/executor.ts | 67 +----- .../src/backend/evalgen/typing.ts | 6 - .../react-server/src/backend/evalgen/utils.ts | 14 +- chainforge/react-server/src/backend/models.ts | 1 + 7 files changed, 95 insertions(+), 220 deletions(-) diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index ab39d096f..0ad7f526c 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -65,7 +65,7 @@ import { RatingDict, ResponseUID, } from "./backend/typing"; -import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing"; +import { EvalCriteria, EvalFunction, EvalFunctionReport, EvalFunctionSetReport } from "./backend/evalgen/typing"; import { IconChevronDown, IconChevronLeft, @@ -267,7 +267,7 @@ const CriteriaCard: React.FC = ({ onChangeGrade={onChangeGrade} getGradeCount={getGradeCount} /> - + {/* Title of the criteria */} = ({ export interface EvalGenModalRef { trigger: ( resps: LLMResponse[], - setFinalReports: (reports: EvalGenReport) => void, + setFinalReports: (selectedFuncs: EvalFunction[]) => void, ) => void; } @@ -422,6 +422,7 @@ const EvalGenModal = forwardRef>( const apiKeys = useStore((state) => state.apiKeys); const globalState = useStore((store) => store.state); const [criteria, setCriteria] = useState([]); + const [reports, setReports] = useState(undefined); const [criteriaForDisplay, setCriteriaForDisplay] = useState< EvalCriteria[] >([]); @@ -536,21 +537,13 @@ const EvalGenModal = forwardRef>( const addLog = (message: string) => { setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]); }; -<<<<<<< HEAD -======= - ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) const ex = new EvaluationFunctionExecutor( getLikelyPromptTemplateAsContext(responses), responses, criteria, -<<<<<<< HEAD (gpt4Calls, gpt35Calls) => { // Callback to update GPT call counts -======= - (gpt4Calls, gpt35Calls) => { // Callback to update GPT call counts ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) setNumGPT4Calls((num) => num + gpt4Calls); setNumGPT35Calls((num) => num + gpt35Calls); }, @@ -593,7 +586,7 @@ const EvalGenModal = forwardRef>( // const defaultOnFinish = (reports: string) => {}; const [onFinish, setOnFinish] = useState({ - setFinalRpts: (reports: EvalGenReport) => { + setFinalRpts: (reports: EvalFunction[]) => { // console.log(""); }, }); @@ -601,7 +594,7 @@ const EvalGenModal = forwardRef>( // Open the EvalGen wizard const trigger = ( resps: LLMResponse[], - setFinalReports: (reports: EvalGenReport) => void, + setFinalReports: (reports: EvalFunction[]) => void, ) => { // We pass the responses here manually to ensure they remain the same // for the duration of one EvalGen operation. @@ -609,7 +602,7 @@ const EvalGenModal = forwardRef>( gotoNextScreen("response"); // setFinalReports("A plenty response"); setOnFinish({ - setFinalRpts: (reports: EvalGenReport) => { + setFinalRpts: (reports: EvalFunction[]) => { close(); setFinalReports(reports); }, @@ -775,26 +768,17 @@ If you determine the feedback corresponds to a new criteria, your response shoul // Update annotation for current response (if any) // TODO: Fix this for generate case when num resp per prompt > 1 -<<<<<<< HEAD if ( grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim()) ) { -======= - if (grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim())) { ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) executor?.setGradeForExample( shownResponse.uid, grades[shownResponse.uid], holisticGrade, -<<<<<<< HEAD annotation ? annotation.trim() : null, ); -======= - annotation ? annotation.trim() : null - ); ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) } if ( @@ -871,7 +855,6 @@ If you determine the feedback corresponds to a new criteria, your response shoul updateShownResponseUniqueIndex(); }; -<<<<<<< HEAD const updateShownResponseUniqueIndex = () => { let idx = 0; for (const resp of responses) { @@ -902,13 +885,10 @@ If you determine the feedback corresponds to a new criteria, your response shoul setShownResponse(responses[shownResponseIdx]); }, [shownResponseIdx]); -======= ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) const estimateGPTCalls = () => { return executor - ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.` + ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-4o-mini calls.` : "# estimated GPT calls not available."; -<<<<<<< HEAD }; const updateCriteriaForDisplay = () => { @@ -927,10 +907,22 @@ If you determine the feedback corresponds to a new criteria, your response shoul setScreen(screenName); }; + const handleGradingDone = async () => { + await executor?.waitForCompletion(); + const filteredFunctions = await executor?.filterEvaluationFunctions(0.25); + console.log("filteredFunctions", filteredFunctions); + + // schema is { + // failureCoverage: coverage, + // falseFailureRate, + // selectedEvalFunctions: bestEvalFunctions, + // allEvalFunctionReports: evalFunctionReport, + // }; + // set state + setReports(filteredFunctions); + }; + // const [onFinish, setOnFinish] = useState(null); -======= - } ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) return ( -<<<<<<< HEAD {screen === "response" && ( @@ -959,22 +950,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul gotoPrevResponse={prevResponse2} estimateGPTCalls={estimateGPTCalls} gotoNextScreen={gotoNextScreen} + handleGradingDone={handleGradingDone} /> -======= - - - - {/* View showing the response the user is currently grading */} - ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) {/* Progress bar */} {/* @@ -1150,12 +1127,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul {screen === "report" && ( { + report={reports} + onFinish={(reports: EvalFunction[]) => { onFinish.setFinalRpts(reports); }} getGradeCount={(crit: EvalCriteria, grade: boolean) => { @@ -1186,40 +1159,30 @@ const HeaderText = ({ children }: { children: ReactNode }) => { interface GradingViewProps { shownResponse: LLMResponse | undefined; -<<<<<<< HEAD shownResponseIdx: number; responseCount: number; -======= ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) numGPT4Calls: number; numGPT35Calls: number; logs: { date: Date; message: string }[]; gotoPrevResponse: () => void; gotoNextResponse: () => void; estimateGPTCalls: () => string; -<<<<<<< HEAD gotoNextScreen: (screenName: string) => void; -======= ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) + handleGradingDone: () => void; } const GradingView: React.FC = ({ shownResponse, -<<<<<<< HEAD shownResponseIdx, responseCount, -======= ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) numGPT4Calls, numGPT35Calls, logs, gotoPrevResponse, gotoNextResponse, estimateGPTCalls, -<<<<<<< HEAD gotoNextScreen, -======= ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) + handleGradingDone, }) => { // Calculate inner values only when shownResponse changes const responseText = useMemo( @@ -1309,16 +1272,7 @@ const GradingView: React.FC = ({
{/* Go forward to the next response */} -<<<<<<< HEAD -======= - ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) @@ -1367,25 +1321,17 @@ const GradingView: React.FC = ({
-<<<<<<< HEAD LLM Activity {/* GPT Call Tally */} Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "} - GPT-3.5-Turbo-16k calls. -======= - LLM Activity - {/* GPT Call Tally */} - - Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls} GPT-3.5-Turbo-16k calls. ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) + GPT-4o-mini calls.
= ({ overflowY: "auto", borderRadius: "8px", border: "1px solid #ddd", -======= - backgroundColor: "#f0f0f0", - color: "#333", - fontFamily: "monospace", - padding: "12px", - width: "calc(100% - 30px)", - height: "200px", - overflowY: "auto", - borderRadius: "8px", - border: "1px solid #ddd", ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) marginRight: "20px", // Space on the right }} ref={(el) => { @@ -1416,13 +1351,9 @@ const GradingView: React.FC = ({ > {logs.map((log, index) => (
-<<<<<<< HEAD {log.date.toLocaleString()} -{" "} -======= - {log.date.toLocaleString()} - ->>>>>>> c979cf1 (Adding UI indicators of how many LLM calls are executed) {log.message}
))} @@ -1435,8 +1366,10 @@ const GradingView: React.FC = ({ leftIcon={} variant="gradient" gradient={{ from: "blue", to: "green", deg: 45 }} - onClick={() => { + onClick={async () => { // console.log("(3) gotoNextScreen", gotoNextScreen); + // Get the evaluation functions + await handleGradingDone(); gotoNextScreen("report"); }} > @@ -1449,9 +1382,9 @@ const GradingView: React.FC = ({ }; interface ReportCardViewProps { - report: EvalGenReport; + report: EvalFunctionSetReport; // recomputeAlignment, - onFinish: (reports: EvalGenReport) => void; + onFinish: (reports: EvalFunction[]) => void; getGradeCount: (crit: EvalCriteria, grade: boolean) => number; getStateValue: (stateId: number) => number; } @@ -1464,41 +1397,44 @@ const ReportCardView: React.FC = ({ getGradeCount, getStateValue, }) => { - // The criteria cards, now with report information - const [finalReport, setFinalReport] = useState(report); + const [selectedEvalFunctions, setSelectedEvalFunctions] = useState(report.selectedEvalFunctions); const onSelect = (criterion: EvalCriteria, isSelected: boolean) => { if (isSelected) { - finalReport.criteria.push(criterion); + const matchingFunction = report.selectedEvalFunctions.find(func => func.evalCriteria === criterion); + if (matchingFunction && !selectedEvalFunctions.includes(matchingFunction)) { + setSelectedEvalFunctions([...selectedEvalFunctions, matchingFunction]); + } } else { - finalReport.criteria = finalReport.criteria.filter( - (c) => c !== criterion, - ); + setSelectedEvalFunctions(selectedEvalFunctions.filter(func => func.evalCriteria !== criterion)); } - setFinalReport(finalReport); - }; + } + + // The criteria cards, now with report information const cards = useMemo(() => { const res = []; // Iterate through selected eval functions and create cards - // for (const selectedFunc of report.selectedEvalFunctions) { - // const crit = selectedFunc.evalCriteria; - // // Find corresponding report in allEvalFunctionReports map from criteria to list - // const critEvalFuncReports = report.allEvalFunctionReports.get(crit); - // const evalFuncReport = critEvalFuncReports.find( - // (rep) => rep.evalFunction === selectedFunc, - // ); - - // // Get the functions that were not selected for this criteria - // const otherFuncs = critEvalFuncReports.filter( - // (rep) => rep.evalFunction !== selectedFunc, - // ); - for (const crit of report.criteria) { + for (const selectedFunc of report.selectedEvalFunctions) { + const crit = selectedFunc.evalCriteria; + // Find corresponding report in allEvalFunctionReports map from criteria to list + const critEvalFuncReports = report.allEvalFunctionReports.get(crit); + const evalFuncReport = critEvalFuncReports.find( + (rep) => rep.evalFunction === selectedFunc, + ); + + // Get the functions that were not selected for this criteria + const otherFuncs = critEvalFuncReports.filter( + (rep) => rep.evalFunction !== selectedFunc, + ); + res.push( { // crit.selected = checked; // recomputeAlignment(); @@ -1562,7 +1498,7 @@ const ReportCardView: React.FC = ({