diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index 6e5b5f3d..0ad7f526 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -65,7 +65,7 @@ import { RatingDict, ResponseUID, } from "./backend/typing"; -import { EvalCriteria, EvalGenReport } from "./backend/evalgen/typing"; +import { EvalCriteria, EvalFunction, EvalFunctionReport, EvalFunctionSetReport } from "./backend/evalgen/typing"; import { IconChevronDown, IconChevronLeft, @@ -267,7 +267,7 @@ const CriteriaCard: React.FC = ({ onChangeGrade={onChangeGrade} getGradeCount={getGradeCount} /> - + {/* Title of the criteria */} = ({ export interface EvalGenModalRef { trigger: ( resps: LLMResponse[], - setFinalReports: (reports: EvalGenReport) => void, + setFinalReports: (selectedFuncs: EvalFunction[]) => void, ) => void; } @@ -422,6 +422,7 @@ const EvalGenModal = forwardRef>( const apiKeys = useStore((state) => state.apiKeys); const globalState = useStore((store) => store.state); const [criteria, setCriteria] = useState([]); + const [reports, setReports] = useState(undefined); const [criteriaForDisplay, setCriteriaForDisplay] = useState< EvalCriteria[] >([]); @@ -585,7 +586,7 @@ const EvalGenModal = forwardRef>( // const defaultOnFinish = (reports: string) => {}; const [onFinish, setOnFinish] = useState({ - setFinalRpts: (reports: EvalGenReport) => { + setFinalRpts: (reports: EvalFunction[]) => { // console.log(""); }, }); @@ -593,7 +594,7 @@ const EvalGenModal = forwardRef>( // Open the EvalGen wizard const trigger = ( resps: LLMResponse[], - setFinalReports: (reports: EvalGenReport) => void, + setFinalReports: (reports: EvalFunction[]) => void, ) => { // We pass the responses here manually to ensure they remain the same // for the duration of one EvalGen operation. @@ -601,7 +602,7 @@ const EvalGenModal = forwardRef>( gotoNextScreen("response"); // setFinalReports("A plenty response"); setOnFinish({ - setFinalRpts: (reports: EvalGenReport) => { + setFinalRpts: (reports: EvalFunction[]) => { close(); setFinalReports(reports); }, @@ -886,7 +887,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul const estimateGPTCalls = () => { return executor - ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.` + ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-4o-mini calls.` : "# estimated GPT calls not available."; }; @@ -906,6 +907,21 @@ If you determine the feedback corresponds to a new criteria, your response shoul setScreen(screenName); }; + const handleGradingDone = async () => { + await executor?.waitForCompletion(); + const filteredFunctions = await executor?.filterEvaluationFunctions(0.25); + console.log("filteredFunctions", filteredFunctions); + + // schema is { + // failureCoverage: coverage, + // falseFailureRate, + // selectedEvalFunctions: bestEvalFunctions, + // allEvalFunctionReports: evalFunctionReport, + // }; + // set state + setReports(filteredFunctions); + }; + // const [onFinish, setOnFinish] = useState(null); return ( @@ -934,6 +950,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul gotoPrevResponse={prevResponse2} estimateGPTCalls={estimateGPTCalls} gotoNextScreen={gotoNextScreen} + handleGradingDone={handleGradingDone} /> {/* Progress bar */} @@ -1110,12 +1127,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul {screen === "report" && ( { + report={reports} + onFinish={(reports: EvalFunction[]) => { onFinish.setFinalRpts(reports); }} getGradeCount={(crit: EvalCriteria, grade: boolean) => { @@ -1155,6 +1168,7 @@ interface GradingViewProps { gotoNextResponse: () => void; estimateGPTCalls: () => string; gotoNextScreen: (screenName: string) => void; + handleGradingDone: () => void; } const GradingView: React.FC = ({ @@ -1168,6 +1182,7 @@ const GradingView: React.FC = ({ gotoNextResponse, estimateGPTCalls, gotoNextScreen, + handleGradingDone, }) => { // Calculate inner values only when shownResponse changes const responseText = useMemo( @@ -1312,7 +1327,7 @@ const GradingView: React.FC = ({ {/* GPT Call Tally */} Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "} - GPT-3.5-Turbo-16k calls. + GPT-4o-mini calls.
= ({ leftIcon={} variant="gradient" gradient={{ from: "blue", to: "green", deg: 45 }} - onClick={() => { + onClick={async () => { // console.log("(3) gotoNextScreen", gotoNextScreen); + // Get the evaluation functions + await handleGradingDone(); gotoNextScreen("report"); }} > @@ -1365,9 +1382,9 @@ const GradingView: React.FC = ({ }; interface ReportCardViewProps { - report: EvalGenReport; + report: EvalFunctionSetReport; // recomputeAlignment, - onFinish: (reports: EvalGenReport) => void; + onFinish: (reports: EvalFunction[]) => void; getGradeCount: (crit: EvalCriteria, grade: boolean) => number; getStateValue: (stateId: number) => number; } @@ -1380,41 +1397,44 @@ const ReportCardView: React.FC = ({ getGradeCount, getStateValue, }) => { - // The criteria cards, now with report information - const [finalReport, setFinalReport] = useState(report); + const [selectedEvalFunctions, setSelectedEvalFunctions] = useState(report.selectedEvalFunctions); const onSelect = (criterion: EvalCriteria, isSelected: boolean) => { if (isSelected) { - finalReport.criteria.push(criterion); + const matchingFunction = report.selectedEvalFunctions.find(func => func.evalCriteria === criterion); + if (matchingFunction && !selectedEvalFunctions.includes(matchingFunction)) { + setSelectedEvalFunctions([...selectedEvalFunctions, matchingFunction]); + } } else { - finalReport.criteria = finalReport.criteria.filter( - (c) => c !== criterion, - ); + setSelectedEvalFunctions(selectedEvalFunctions.filter(func => func.evalCriteria !== criterion)); } - setFinalReport(finalReport); - }; + } + + // The criteria cards, now with report information const cards = useMemo(() => { const res = []; // Iterate through selected eval functions and create cards - // for (const selectedFunc of report.selectedEvalFunctions) { - // const crit = selectedFunc.evalCriteria; - // // Find corresponding report in allEvalFunctionReports map from criteria to list - // const critEvalFuncReports = report.allEvalFunctionReports.get(crit); - // const evalFuncReport = critEvalFuncReports.find( - // (rep) => rep.evalFunction === selectedFunc, - // ); - - // // Get the functions that were not selected for this criteria - // const otherFuncs = critEvalFuncReports.filter( - // (rep) => rep.evalFunction !== selectedFunc, - // ); - for (const crit of report.criteria) { + for (const selectedFunc of report.selectedEvalFunctions) { + const crit = selectedFunc.evalCriteria; + // Find corresponding report in allEvalFunctionReports map from criteria to list + const critEvalFuncReports = report.allEvalFunctionReports.get(crit); + const evalFuncReport = critEvalFuncReports.find( + (rep) => rep.evalFunction === selectedFunc, + ); + + // Get the functions that were not selected for this criteria + const otherFuncs = critEvalFuncReports.filter( + (rep) => rep.evalFunction !== selectedFunc, + ); + res.push( { // crit.selected = checked; // recomputeAlignment(); @@ -1478,7 +1498,7 @@ const ReportCardView: React.FC = ({