From 332f757dc8a18d3ce4f7d0086de73ac182565979 Mon Sep 17 00:00:00 2001 From: w352chen Date: Sat, 29 Jun 2024 20:19:58 -0400 Subject: [PATCH 01/11] added: displaying response index,changing New Criteria button, Re-ordering by priority --- chainforge/react-server/package-lock.json | 261 +++++++++++++++--- chainforge/react-server/src/EvalGenModal.tsx | 202 +++++++++++--- chainforge/react-server/src/MultiEvalNode.tsx | 15 + .../src/backend/evalgen/executor.ts | 141 +++++++--- .../src/backend/evalgen/oai_utils.ts | 4 +- .../react-server/src/backend/evalgen/utils.ts | 16 +- 6 files changed, 514 insertions(+), 125 deletions(-) diff --git a/chainforge/react-server/package-lock.json b/chainforge/react-server/package-lock.json index 9dddca088..2804f6803 100644 --- a/chainforge/react-server/package-lock.json +++ b/chainforge/react-server/package-lock.json @@ -3525,15 +3525,15 @@ "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==" }, "node_modules/@emotion/react": { - "version": "11.11.1", - "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.1.tgz", - "integrity": "sha512-5mlW1DquU5HaxjLkfkGN1GA/fvVGdyHURRiX/0FHl2cfIfRxSOfmxEH5YS43edp0OldZrZ+dkBKbngxcNCdZvA==", + "version": "11.11.4", + "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.11.4.tgz", + "integrity": "sha512-t8AjMlF0gHpvvxk5mAtCqR4vmxiGHCeJBaQO6gncUSdklELOgtwjerNY2yuJNfwnc6vi16U/+uMF+afIawJ9iw==", "peer": true, "dependencies": { "@babel/runtime": "^7.18.3", "@emotion/babel-plugin": "^11.11.0", "@emotion/cache": "^11.11.0", - "@emotion/serialize": "^1.1.2", + "@emotion/serialize": "^1.1.3", "@emotion/use-insertion-effect-with-fallbacks": "^1.0.1", "@emotion/utils": "^1.2.1", "@emotion/weak-memoize": "^0.3.1", @@ -3549,9 +3549,9 @@ } }, "node_modules/@emotion/serialize": { - "version": "1.1.2", - "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.2.tgz", - "integrity": "sha512-zR6a/fkFP4EAcCMQtLOhIgpprZOwNmCldtpaISpvz348+DP4Mz8ZoKaGGCQpbzepNIUWbq4w6hNZkwDyKoS+HA==", + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/@emotion/serialize/-/serialize-1.1.4.tgz", + "integrity": "sha512-RIN04MBT8g+FnDwgvIUi8czvr1LU1alUMI05LekWB5DGyTm8cCBMCRpq3GqaiyEDRptEXOyXnvZ58GZYu4kBxQ==", "peer": true, "dependencies": { "@emotion/hash": "^0.9.1", @@ -3825,6 +3825,7 @@ "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", "integrity": "sha512-3T8LkOmg45BV5FICb15QQMsyUSWrQ8AygVfC7ZG32zOalnqrilm018ZVCw0eapXux8FtA33q8PSRSstjee3jSg==", + "deprecated": "Use @eslint/config-array instead", "dependencies": { "@humanwhocodes/object-schema": "^2.0.2", "debug": "^4.3.1", @@ -3847,9 +3848,10 @@ } }, "node_modules/@humanwhocodes/object-schema": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/@humanwhocodes/object-schema/-/object-schema-2.0.2.tgz", - "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==" + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/@humanwhocodes/object-schema/-/object-schema-2.0.3.tgz", + "integrity": "sha512-93zYdMES/c1D69yZiKDBj0V24vqNzB/koF26KPaagAfd3P/4gUlh3Dys5ogAK+Exi9QyzlD8x/08Zt7wIKcDcA==", + "deprecated": "Use @eslint/object-schema instead" }, "node_modules/@istanbuljs/load-nyc-config": { "version": "1.1.0", @@ -6246,22 +6248,31 @@ } }, "node_modules/@testing-library/dom": { - "version": "9.3.3", - "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-9.3.3.tgz", - "integrity": "sha512-fB0R+fa3AUqbLHWyxXa2kGVtf1Fe1ZZFr0Zp6AIbIAzXb2mKbEXl+PCQNUOaq5lbTab5tfctfXRNsWXxa2f7Aw==", + "version": "10.2.0", + "resolved": "https://registry.npmjs.org/@testing-library/dom/-/dom-10.2.0.tgz", + "integrity": "sha512-CytIvb6tVOADRngTHGWNxH8LPgO/3hi/BdCEHOf7Qd2GvZVClhVP0Wo/QHzWhpki49Bk0b4VT6xpt3fx8HTSIw==", "peer": true, "dependencies": { "@babel/code-frame": "^7.10.4", "@babel/runtime": "^7.12.5", "@types/aria-query": "^5.0.1", - "aria-query": "5.1.3", + "aria-query": "5.3.0", "chalk": "^4.1.0", "dom-accessibility-api": "^0.5.9", "lz-string": "^1.5.0", "pretty-format": "^27.0.2" }, "engines": { - "node": ">=14" + "node": ">=18" + } + }, + "node_modules/@testing-library/dom/node_modules/aria-query": { + "version": "5.3.0", + "resolved": "https://registry.npmjs.org/aria-query/-/aria-query-5.3.0.tgz", + "integrity": "sha512-b0P0sZPKtyu8HkeRAfCq0IfURZK+SuwMjY1UXGBU27wpAiTwQAIlq56IbIO+ytk/JjS1fMR14ee5WBBfKi5J6A==", + "peer": true, + "dependencies": { + "dequal": "^2.0.3" } }, "node_modules/@testing-library/jest-dom": { @@ -7704,9 +7715,9 @@ "integrity": "sha512-wkJp+Wz8MRHtCVdt65L/jPFLAQ0iqJZ2EeD2XWOvKGbIi4mZNwHlpHRLRB8ZnQ07VoiB0TLFWwIjjm2FL9gUcQ==" }, "node_modules/acorn": { - "version": "8.10.0", - "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.10.0.tgz", - "integrity": "sha512-F0SAmZ8iUtS//m8DmCTA0jlh6TDKkHQyK6xc6V4KDTyZKA9dnvX9/3sRTVQrWm79glUAZbnmmNcdYwUIHWVybw==", + "version": "8.12.0", + "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.12.0.tgz", + "integrity": "sha512-RTvkC4w+KNXrM39/lWCUaG0IbRkWdCv7W/IOW9oU6SawyxulvkQy5HQPVTKxEjczcUvapcrw3cFx/60VN/NRNw==", "bin": { "acorn": "bin/acorn" }, @@ -12251,9 +12262,9 @@ } }, "node_modules/eslint/node_modules/globals": { - "version": "13.22.0", - "resolved": "https://registry.npmjs.org/globals/-/globals-13.22.0.tgz", - "integrity": "sha512-H1Ddc/PbZHTDVJSnj8kWptIRSD6AM3pK+mKytuIVF4uoBV7rshFlhhvA58ceJ5wp3Er58w6zj7bykMpYXt3ETw==", + "version": "13.24.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-13.24.0.tgz", + "integrity": "sha512-AhO5QUcj8llrbG09iWhPU2B204J1xnPeL8kQmVorSsy+Sjj1sk8gIyh6cUocGmH4L0UuhAJy+hJMRA4mgA4mFQ==", "dependencies": { "type-fest": "^0.20.2" }, @@ -12944,22 +12955,22 @@ } }, "node_modules/flat-cache": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.1.0.tgz", - "integrity": "sha512-OHx4Qwrrt0E4jEIcI5/Xb+f+QmJYNj2rrK8wiIdQOIrB9WrrJL8cjZvXdXuBTkkEwEqLycb5BeZDV1o2i9bTew==", + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/flat-cache/-/flat-cache-3.2.0.tgz", + "integrity": "sha512-CYcENa+FtcUKLmhhqyctpclsq7QF38pKjZHsGNiSQF5r4FtoKDWabFDl3hzaEQMvT1LHEysw5twgLvpYYb4vbw==", "dependencies": { - "flatted": "^3.2.7", + "flatted": "^3.2.9", "keyv": "^4.5.3", "rimraf": "^3.0.2" }, "engines": { - "node": ">=12.0.0" + "node": "^10.12.0 || >=12.0.0" } }, "node_modules/flatted": { - "version": "3.2.9", - "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.2.9.tgz", - "integrity": "sha512-36yxDn5H7OFZQla0/jFJmbIKTdZAQHngCedGxiMmpNfEZM0sdEeT+WczLQrjK6D7o2aiyLYDnkw0R3JK0Qv1RQ==" + "version": "3.3.1", + "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.3.1.tgz", + "integrity": "sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==" }, "node_modules/flatten-vertex-data": { "version": "1.0.2", @@ -17409,9 +17420,9 @@ "integrity": "sha512-hRkd6/XW4HTsA9vjVpY9tuXJYLSlelnkTmVFu4M9/7MIYQtFcHpbugAU7UbOfjOiVSVYl2fqgBuJ32JUmRo5Ew==" }, "node_modules/keyv": { - "version": "4.5.3", - "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.3.tgz", - "integrity": "sha512-QCiSav9WaX1PgETJ+SpNnx2PRRapJ/oRSXM4VO5OGYGSjrxbKPVFVhB3l2OCbLCk329N8qyAtsJjSjvVBWzEug==", + "version": "4.5.4", + "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", + "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", "dependencies": { "json-buffer": "3.0.1" } @@ -20698,9 +20709,9 @@ } }, "node_modules/prettier": { - "version": "3.2.5", - "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.2.5.tgz", - "integrity": "sha512-3/GWa9aOC0YeD7LUfvOG2NiDyhOWRvt1k+rcKhOuYnMY24iiCphgneUfJDyFXd6rZCAnuLBv6UeAULtrhT/F4A==", + "version": "3.3.2", + "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.3.2.tgz", + "integrity": "sha512-rAVeHYMcv8ATV5d508CFdn+8/pHPpXeIid1DdrPwXnaAdH7cqjVbpJaT5eq4yRAFU/lsbwYwSF/n5iNrdJHPQA==", "dev": true, "peer": true, "bin": { @@ -21643,6 +21654,34 @@ } } }, + "node_modules/react-scripts/node_modules/@eslint/js": { + "version": "8.57.0", + "resolved": "https://registry.npmjs.org/@eslint/js/-/js-8.57.0.tgz", + "integrity": "sha512-Ys+3g2TaW7gADOJzPt83SJtCDhMjndcDMFVQ/Tj9iA1BfJzFKD9mAUXT3OenpuPHbI6P/myECxRJrofUsDx/5g==", + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + } + }, + "node_modules/react-scripts/node_modules/ajv": { + "version": "6.12.6", + "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", + "integrity": "sha512-j3fVLgvTo527anyYyJOGTYJbG+vnnQYvE0m5mmkc1TK+nxAppkCLMIL0aZ4dblVCNoGShhm+kzE4ZUykBoMg4g==", + "dependencies": { + "fast-deep-equal": "^3.1.1", + "fast-json-stable-stringify": "^2.0.0", + "json-schema-traverse": "^0.4.1", + "uri-js": "^4.2.2" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/epoberezkin" + } + }, + "node_modules/react-scripts/node_modules/argparse": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz", + "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==" + }, "node_modules/react-scripts/node_modules/camelcase": { "version": "6.3.0", "resolved": "https://registry.npmjs.org/camelcase/-/camelcase-6.3.0.tgz", @@ -21654,6 +21693,158 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/react-scripts/node_modules/eslint": { + "version": "8.57.0", + "resolved": "https://registry.npmjs.org/eslint/-/eslint-8.57.0.tgz", + "integrity": "sha512-dZ6+mexnaTIbSBZWgou51U6OmzIhYM2VcNdtiTtI7qPNZm35Akpr0f6vtw3w1Kmn5PYo+tZVfh13WrhpS6oLqQ==", + "dependencies": { + "@eslint-community/eslint-utils": "^4.2.0", + "@eslint-community/regexpp": "^4.6.1", + "@eslint/eslintrc": "^2.1.4", + "@eslint/js": "8.57.0", + "@humanwhocodes/config-array": "^0.11.14", + "@humanwhocodes/module-importer": "^1.0.1", + "@nodelib/fs.walk": "^1.2.8", + "@ungap/structured-clone": "^1.2.0", + "ajv": "^6.12.4", + "chalk": "^4.0.0", + "cross-spawn": "^7.0.2", + "debug": "^4.3.2", + "doctrine": "^3.0.0", + "escape-string-regexp": "^4.0.0", + "eslint-scope": "^7.2.2", + "eslint-visitor-keys": "^3.4.3", + "espree": "^9.6.1", + "esquery": "^1.4.2", + "esutils": "^2.0.2", + "fast-deep-equal": "^3.1.3", + "file-entry-cache": "^6.0.1", + "find-up": "^5.0.0", + "glob-parent": "^6.0.2", + "globals": "^13.19.0", + "graphemer": "^1.4.0", + "ignore": "^5.2.0", + "imurmurhash": "^0.1.4", + "is-glob": "^4.0.0", + "is-path-inside": "^3.0.3", + "js-yaml": "^4.1.0", + "json-stable-stringify-without-jsonify": "^1.0.1", + "levn": "^0.4.1", + "lodash.merge": "^4.6.2", + "minimatch": "^3.1.2", + "natural-compare": "^1.4.0", + "optionator": "^0.9.3", + "strip-ansi": "^6.0.1", + "text-table": "^0.2.0" + }, + "bin": { + "eslint": "bin/eslint.js" + }, + "engines": { + "node": "^12.22.0 || ^14.17.0 || >=16.0.0" + }, + "funding": { + "url": "https://opencollective.com/eslint" + } + }, + "node_modules/react-scripts/node_modules/find-up": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/find-up/-/find-up-5.0.0.tgz", + "integrity": "sha512-78/PXT1wlLLDgTzDs7sjq9hzz0vXD+zn+7wypEe4fXQxCmdmqfGsEPQxmiCSQI3ajFV91bVSsvNtrJRiW6nGng==", + "dependencies": { + "locate-path": "^6.0.0", + "path-exists": "^4.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/react-scripts/node_modules/globals": { + "version": "13.24.0", + "resolved": "https://registry.npmjs.org/globals/-/globals-13.24.0.tgz", + "integrity": "sha512-AhO5QUcj8llrbG09iWhPU2B204J1xnPeL8kQmVorSsy+Sjj1sk8gIyh6cUocGmH4L0UuhAJy+hJMRA4mgA4mFQ==", + "dependencies": { + "type-fest": "^0.20.2" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/react-scripts/node_modules/js-yaml": { + "version": "4.1.0", + "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz", + "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==", + "dependencies": { + "argparse": "^2.0.1" + }, + "bin": { + "js-yaml": "bin/js-yaml.js" + } + }, + "node_modules/react-scripts/node_modules/json-schema-traverse": { + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-0.4.1.tgz", + "integrity": "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==" + }, + "node_modules/react-scripts/node_modules/locate-path": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/locate-path/-/locate-path-6.0.0.tgz", + "integrity": "sha512-iPZK6eYjbxRu3uB4/WZ3EsEIMJFMqAoopl3R+zuq0UjcAm/MO6KCweDgPfP3elTztoKP3KtnVHxTn2NHBSDVUw==", + "dependencies": { + "p-locate": "^5.0.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/react-scripts/node_modules/p-limit": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", + "integrity": "sha512-TYOanM3wGwNGsZN2cVTYPArw454xnXj5qmWF1bEoAc4+cU/ol7GVh7odevjp1FNHduHc3KZMcFduxU5Xc6uJRQ==", + "dependencies": { + "yocto-queue": "^0.1.0" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/react-scripts/node_modules/p-locate": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/p-locate/-/p-locate-5.0.0.tgz", + "integrity": "sha512-LaNjtRWUBY++zB5nE/NwcaoMylSPk+S+ZHNB1TzdbMJMny6dynpAGt7X/tl/QYq3TIeE6nxHppbo2LGymrG5Pw==", + "dependencies": { + "p-limit": "^3.0.2" + }, + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, + "node_modules/react-scripts/node_modules/type-fest": { + "version": "0.20.2", + "resolved": "https://registry.npmjs.org/type-fest/-/type-fest-0.20.2.tgz", + "integrity": "sha512-Ne+eE4r0/iWnpAxD852z3A+N0Bt5RN//NjJwRd2VFHEmrywxf5vsZlh4R6lixl6B+wz/8d+maTSAkN1FIkI3LQ==", + "engines": { + "node": ">=10" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/react-style-singleton": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/react-style-singleton/-/react-style-singleton-2.2.1.tgz", diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index 0abc3dbc4..a0cfac073 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -21,6 +21,7 @@ import React, { ReactNode, forwardRef, useCallback, + useEffect, useImperativeHandle, useMemo, useState, @@ -68,6 +69,8 @@ import { IconThumbDown, IconThumbUp, IconTrash, + IconFlagFilled, + IconPencil, } from "@tabler/icons-react"; import { cleanMetavarsFilterFunc, @@ -262,7 +265,7 @@ const CriteriaCard: React.FC = ({ withArrow > @@ -341,6 +345,9 @@ const EvalGenModal = forwardRef>( const apiKeys = useStore((state) => state.apiKeys); const globalState = useStore((store) => store.state); const [criteria, setCriteria] = useState([]); + const [criteriaForDisplay, setCriteriaForDisplay] = useState< + EvalCriteria[] + >([]); const [responses, setResponses] = useState([]); const [shownResponse, setShownResponse] = useState( @@ -350,6 +357,7 @@ const EvalGenModal = forwardRef>( [], ); const [shownResponseIdx, setShownResponseIdx] = useState(0); + const [shownResponseUniqueIdx, setShownResponseUniqueIdx] = useState(0); const [annotation, setAnnotation] = useState(undefined); const [holisticGrade, setHolisticGrade] = useState< @@ -396,9 +404,10 @@ const EvalGenModal = forwardRef>( [setState], ); + // console.error("criteria", criteria); + // Update executor whenever resps, grades, or criteria change React.useEffect(() => { - if (criteria.length > 0 && !executor) { const existingGrades = transformDict( globalState, @@ -418,12 +427,12 @@ const EvalGenModal = forwardRef>( setLogs((prevLogs) => [...prevLogs, { date: new Date(), message }]); }; - const ex = new EvaluationFunctionExecutor( getLikelyPromptTemplateAsContext(responses), responses, criteria, - (gpt4Calls, gpt35Calls) => { // Callback to update GPT call counts + (gpt4Calls, gpt35Calls) => { + // Callback to update GPT call counts setNumGPT4Calls((num) => num + gpt4Calls); setNumGPT35Calls((num) => num + gpt35Calls); }, @@ -437,28 +446,32 @@ const EvalGenModal = forwardRef>( ex.start((progress) => { setExecProgress(progress?.success ?? 0); }); - } else if (executor) { // Update criteria in executor executor.addCriteria(criteria); } + updateCriteriaForDisplay(); }, [criteria]); - // Open the EvalGen wizard const trigger = (resps: LLMResponse[]) => { // We pass the responses here manually to ensure they remain the same // for the duration of one EvalGen operation. setResponses(resps); - const firstGrades = resps.reduce((acc: Dict>, curr) => { - if (!(curr.uid in acc)) acc[curr.uid] = {}; - return acc; - }, grades); + + const firstGrades = resps.reduce( + (acc: Dict>, curr) => { + if (!(curr.uid in acc)) acc[curr.uid] = {}; + return acc; + }, + grades, + ); setGrades(firstGrades); // Create criteria setIsLoadingCriteria((num) => num + 3); + // console.log("*****************************resps", resps); genCriteriaFromContext(resps) .then((crits) => setCriteria(crits.map((c) => ({ ...c, uid: uuid() })))) .catch((err) => { @@ -472,12 +485,13 @@ const EvalGenModal = forwardRef>( setShownResponseIdx(0); if (resps.length > 0) { const first_resp = sampleRandomElements(resps, 1)[0]; - setShownResponse(first_resp); + // setShownResponse(first_resp); setPastShownResponses([first_resp]); } else { - setShownResponse(undefined); + // setShownResponse(undefined); setPastShownResponses([]); } + setShownResponse(resps[shownResponseIdx]); open(); }; useImperativeHandle(ref, () => ({ @@ -550,9 +564,11 @@ const EvalGenModal = forwardRef>( // Add a loading Skeleton setIsLoadingCriteria((num) => num + 1); // Make async LLM call to expand criteria only if the feedback contains some idea of a constraint on the output and isn't covered by existing criteria - const prettyCriteria = criteria.map((crit) => { - return `${crit.shortname}: ${crit.criteria}`; - }).join('\n'); + const prettyCriteria = criteria + .map((crit) => { + return `${crit.shortname}: ${crit.criteria}`; + }) + .join("\n"); generateLLMEvaluationCriteria( "", @@ -612,13 +628,17 @@ If you determine the feedback corresponds to a new criteria, your response shoul // Update annotation for current response (if any) // TODO: Fix this for generate case when num resp per prompt > 1 - if (grades[shownResponse.uid] || holisticGrade || (annotation && annotation.trim())) { + if ( + grades[shownResponse.uid] || + holisticGrade || + (annotation && annotation.trim()) + ) { executor?.setGradeForExample( shownResponse.uid, grades[shownResponse.uid], holisticGrade, - annotation ? annotation.trim() : null - ); + annotation ? annotation.trim() : null, + ); } if ( @@ -637,11 +657,17 @@ If you determine the feedback corresponds to a new criteria, your response shoul } if (shownResponse && holisticGrade) { - updateGlobalRating(shownResponse.uid, "grade", { 0: holisticGrade === "good" }); + updateGlobalRating(shownResponse.uid, "grade", { + 0: holisticGrade === "good", + }); } if (shownResponse && grades[shownResponse.uid]) { - updateGlobalRating(shownResponse.uid, "perCriteriaGrades", grades[shownResponse.uid]); + updateGlobalRating( + shownResponse.uid, + "perCriteriaGrades", + grades[shownResponse.uid], + ); } // @ts-expect-error The only way to deselect the Radio.Group is to set it to null. Undefined doesn't work. @@ -678,6 +704,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul setPastShownResponses(pastShownResponses.concat(next_resp)); setShownResponseIdx(pastShownResponses.length); } + updateShownResponseUniqueIndex(); }; // Go back to previously shown response @@ -685,13 +712,56 @@ If you determine the feedback corresponds to a new criteria, your response shoul if (pastShownResponses.length === 0 || shownResponseIdx === 0) return; setShownResponse(pastShownResponses[shownResponseIdx - 1]); setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx + updateShownResponseUniqueIndex(); + }; + + const updateShownResponseUniqueIndex = () => { + let idx = 0; + for (const resp of responses) { + if (resp === shownResponse) { + setShownResponseUniqueIdx(idx); + break; + } + idx++; + } + }; + + const nextResponse2 = () => { + if (responses.length === 0) return; + if (shownResponseIdx < responses.length - 1) { + // setShownResponse(responses[shownResponseIdx + 1]); + setShownResponseIdx(shownResponseIdx + 1); + } + }; + + const prevResponse2 = () => { + if (shownResponseIdx > 0) { + // setShownResponse(responses[shownResponseIdx - 1]); + setShownResponseIdx(shownResponseIdx - 1); // decrement shown resp idx + } }; + React.useEffect(() => { + console.error("1111111111111111111111111111111111111"); + setShownResponse(responses[shownResponseIdx]); + }, [shownResponseIdx]); + const estimateGPTCalls = () => { return executor ? `This will trigger around ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT4Calls} GPT-4o and ${executor.estimateNumGPTCalls(grades[shownResponse?.uid]).numGPT35Calls} GPT-3.5-turbo-16k calls.` : "# estimated GPT calls not available."; - } + }; + + const updateCriteriaForDisplay = () => { + const highCriteria = criteria.filter((c) => c.priority === 1); + const lowCriteria = criteria.filter((c) => c.priority === 0); + setCriteriaForDisplay(highCriteria.concat(lowCriteria)); + }; + useEffect(() => { + const highCriteria = criteria.filter((c) => c.priority === 1); + const lowCriteria = criteria.filter((c) => c.priority === 0); + setCriteriaForDisplay(highCriteria.concat(lowCriteria)); + }, [criteria]); return ( @@ -744,7 +817,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul }} >
- {criteria.map((e) => ( + {criteriaForDisplay.map((e) => ( )}
- + */} +
@@ -828,7 +917,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul annotation ?? "", holisticGrade ?? "unknown", ); - + nextResponse(); }} > @@ -853,6 +942,8 @@ const HeaderText = ({ children }: { children: ReactNode }) => { interface GradingViewProps { shownResponse: LLMResponse | undefined; + shownResponseIdx: number; + responseCount: number; numGPT4Calls: number; numGPT35Calls: number; logs: { date: Date; message: string }[]; @@ -863,6 +954,8 @@ interface GradingViewProps { const GradingView: React.FC = ({ shownResponse, + shownResponseIdx, + responseCount, numGPT4Calls, numGPT35Calls, logs, @@ -878,6 +971,7 @@ const GradingView: React.FC = ({ : "", [shownResponse], ); + const prompt = useMemo(() => shownResponse?.prompt ?? "", [shownResponse]); const varsDivs = useMemo(() => { const combined_vars_metavars = shownResponse @@ -886,6 +980,8 @@ const GradingView: React.FC = ({ ...transformDict(shownResponse.metavars, cleanMetavarsFilterFunc), } : {}; + + // console.log("**************shownResponse", shownResponse); return Object.entries(combined_vars_metavars).map(([varname, val]) => (
{varname} =  @@ -894,12 +990,36 @@ const GradingView: React.FC = ({ )); }, [shownResponse]); + // const [shownResponseIdx, setShownResponseIdx] = useState(0); + // const [shownResponses, setShownResponses] = useState([]); + // React.useEffect(() => { + // console.error("current response", shownResponse); + // if (shownResponse && !shownResponses.includes(shownResponse)) { + // shownResponses.push(shownResponse); + // setShownResponses(shownResponses); + // setShownResponseIdx(shownResponses.length - 1); + // console.error("current response is saved.", shownResponses.length); + // } else { + // console.error("current response already saved."); + // for (const [idx, resp] of shownResponses.entries()) { + // if (shownResponse === resp) { + // setShownResponseIdx(idx); + // break; + // } + // } + // } + // }, [shownResponse]); + return ( {/* Top header */} - What do you think of this response? + + {/* What do you think of this response? */} + What do you think of response #{shownResponseIdx + 1} of{" "} + {responseCount}? + {/* Middle response box with chevron buttons < and > for going back and forward a response */} @@ -932,12 +1052,7 @@ const GradingView: React.FC = ({
{/* Go forward to the next response */} - + @@ -987,23 +1102,26 @@ const GradingView: React.FC = ({ - LLM Activity + + LLM Activity + {/* GPT Call Tally */} - Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls} GPT-3.5-Turbo-16k calls. + Executed {numGPT4Calls} GPT-4o calls and {numGPT35Calls}{" "} + GPT-3.5-Turbo-16k calls.
{ @@ -1014,7 +1132,9 @@ const GradingView: React.FC = ({ > {logs.map((log, index) => (
- {log.date.toLocaleString()} - + + {log.date.toLocaleString()} -{" "} + {log.message}
))} diff --git a/chainforge/react-server/src/MultiEvalNode.tsx b/chainforge/react-server/src/MultiEvalNode.tsx index c2540bb48..f745d6bd1 100644 --- a/chainforge/react-server/src/MultiEvalNode.tsx +++ b/chainforge/react-server/src/MultiEvalNode.tsx @@ -403,8 +403,23 @@ const MultiEvalNode: React.FC = ({ data, id }) => { console.warn(`No inputs to the Multi-Evaluator node.`); return []; } + // console.log( + // "**************************pulled_inputs.responseBatch", + // pulled_inputs.responseBatch, + // ); + // pulled_inputs.responseBatch = pulled_inputs.responseBatch.map((r) => ({ + // ...r, + // uid: uuid(), + // })); // Convert to standard response format (StandardLLMResponseFormat) return pulled_inputs.responseBatch.map(toStandardResponseFormat); + // const resps = []; + // for (const resp of pulled_inputs.responseBatch) { + // resps.push({ ...resp, uid: uuid() }); + // } + // console.log("111", resps); + // setResponses(resps2); + // console.log("222", responses); } catch (err) { handleError(err as Error); return []; diff --git a/chainforge/react-server/src/backend/evalgen/executor.ts b/chainforge/react-server/src/backend/evalgen/executor.ts index 9c53340f8..09d9e0d8e 100644 --- a/chainforge/react-server/src/backend/evalgen/executor.ts +++ b/chainforge/react-server/src/backend/evalgen/executor.ts @@ -9,7 +9,7 @@ import { EvalFunctionResult, EvalFunctionReport, EvalFunctionSetReport, - EvalCriteriaUID + EvalCriteriaUID, } from "./typing"; import { LLMResponse, ResponseUID, QueryProgress, Dict } from "../typing"; import { EventEmitter } from "events"; @@ -96,9 +96,8 @@ export default class EvaluationFunctionExecutor { addLog: (log: string) => void, existingGrades?: Record, existingPerCriteriaGrades?: Dict>, - annotations?: Dict + annotations?: Dict, ) { - console.log(evalCriteria); this.resultsCache = new Map< @@ -146,8 +145,6 @@ export default class EvaluationFunctionExecutor { this.logFunction = addLog; } - - /** * Starts the background computation for generating and executing evaluation functions. * This method initiates the tasks but does not wait for them to complete. @@ -198,23 +195,31 @@ export default class EvaluationFunctionExecutor { const functionExecutionPromises: Promise[] = []; emitter.on("functionGenerated", (evalFunction) => { - const executionPromise = (async () => { this.evalFunctions.push(evalFunction); const executionPromises = this.examples.map(async (example) => { - // Get random positive and negative examples for this criteria using the perCriteriaGrades const criteriaId = criteria.uid; - const randomPositiveExample = this.examples.find(example => this.perCriteriaGrades[criteriaId]?.[example.uid] === true); - const randomNegativeExample = this.examples.find(example => this.perCriteriaGrades[criteriaId]?.[example.uid] === false); - + const randomPositiveExample = this.examples.find( + (example) => + this.perCriteriaGrades[criteriaId]?.[example.uid] === true, + ); + const randomNegativeExample = this.examples.find( + (example) => + this.perCriteriaGrades[criteriaId]?.[example.uid] === false, + ); const funcToExecute = evalFunction.evalCriteria.eval_method === "code" ? execPyFunc : executeLLMEval; - const result = await funcToExecute(evalFunction, example, randomPositiveExample, randomNegativeExample); + const result = await funcToExecute( + evalFunction, + example, + randomPositiveExample, + randomNegativeExample, + ); // Update GPT-3.5 call count by 1 if the eval method is expert if (evalFunction.evalCriteria.eval_method === "expert") { @@ -223,7 +228,9 @@ export default class EvaluationFunctionExecutor { if (onProgress) { onProgress({ - success: (100 * functionExecutionPromises.length) / this.criteriaQueue.length, + success: + (100 * functionExecutionPromises.length) / + this.criteriaQueue.length, error: 0, }); } @@ -244,9 +251,11 @@ export default class EvaluationFunctionExecutor { functionExecutionPromises.push(executionPromise); }); - const badExample = this.examples.find(example => this.perCriteriaGrades[criteria.uid]?.[example.uid] === false); + const badExample = this.examples.find( + (example) => + this.perCriteriaGrades[criteria.uid]?.[example.uid] === false, + ); - await generateFunctionsForCriteria( criteria, this.promptTemplate, @@ -258,8 +267,12 @@ export default class EvaluationFunctionExecutor { this.updateGPTCalls(1, 0); console.log(`Generated functions for criteria: ${criteria.shortname}`); - console.log(`Number of functions generated: ${functionExecutionPromises.length}`); - this.logFunction(`Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`); + console.log( + `Number of functions generated: ${functionExecutionPromises.length}`, + ); + this.logFunction( + `Generated ${functionExecutionPromises.length} functions for criteria: ${criteria.shortname}`, + ); await Promise.all(functionExecutionPromises); } @@ -293,7 +306,9 @@ export default class EvaluationFunctionExecutor { // Listen for generated functions and execute them as they come in emitter.on("functionGenerated", (evalFunction) => { - this.logFunction(`Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`); + this.logFunction( + `Generated a new ${evalFunction.evalCriteria.eval_method === "code" ? "code-based" : "LLM-based"} validator for criteria: ${evalFunction.evalCriteria.shortname}${evalFunction.evalCriteria.eval_method === "expert" ? `, with prompt: ${evalFunction.name}` : ""}. Executing it on ${this.examples.length} examples.`, + ); // Capture the execution promise of each function const executionPromise = (async () => { @@ -302,9 +317,15 @@ export default class EvaluationFunctionExecutor { const executionPromises = this.examples.map(async (example) => { // Get random positive and negative examples for this criteria using the perCriteriaGrades - const criteriaId = evalFunction.evalCriteria.uid; - const randomPositiveExample = this.examples.find(example => this.perCriteriaGrades[criteriaId]?.[example.uid] === true); - const randomNegativeExample = this.examples.find(example => this.perCriteriaGrades[criteriaId]?.[example.uid] === false); + const criteriaId = evalFunction.evalCriteria.uid; + const randomPositiveExample = this.examples.find( + (example) => + this.perCriteriaGrades[criteriaId]?.[example.uid] === true, + ); + const randomNegativeExample = this.examples.find( + (example) => + this.perCriteriaGrades[criteriaId]?.[example.uid] === false, + ); const funcToExecute = evalFunction.evalCriteria.eval_method === "code" @@ -312,7 +333,12 @@ export default class EvaluationFunctionExecutor { : executeLLMEval; // Run the function on the example and if there's an error, increment skipped - const result = await funcToExecute(evalFunction, example, randomPositiveExample, randomNegativeExample); + const result = await funcToExecute( + evalFunction, + example, + randomPositiveExample, + randomNegativeExample, + ); // Update GPT-3.5 call count by 1 if the eval method is expert if (evalFunction.evalCriteria.eval_method === "expert") { @@ -337,7 +363,6 @@ export default class EvaluationFunctionExecutor { if (result === EvalFunctionResult.FAIL) { this.updateScore(example.uid, evalFunction); } - }); await Promise.all(executionPromises); @@ -371,7 +396,9 @@ export default class EvaluationFunctionExecutor { console.log( "All evaluation functions have been generated and executed.", ); - this.logFunction("All initially-generated evaluation functions have been generated and executed."); + this.logFunction( + "All initially-generated evaluation functions have been generated and executed.", + ); if (resolveAllFunctionsGenerated) { resolveAllFunctionsGenerated(); // Resolve the promise when all functions have been generated and executed } @@ -388,7 +415,10 @@ export default class EvaluationFunctionExecutor { // Wait for the 'allFunctionsGenerated' event, which now waits for all executions await allFunctionsGeneratedPromise; } - public generateNewImplementationsForCriteria(criteriaID: EvalCriteriaUID): void { + + public generateNewImplementationsForCriteria( + criteriaID: EvalCriteriaUID, + ): void { const crit = this.evalCriteria.find((c) => c.uid === criteriaID); if (!crit) { throw new Error(`Criteria with ID ${criteriaID} not found.`); @@ -399,13 +429,12 @@ export default class EvaluationFunctionExecutor { } } - /** * Adds another evaluation criteria and triggers the generation and execution of evaluation functions for the new criteria. * This method allows the client to add new evaluation criteria after the executor has been initialized. * The new criteria will be processed in parallel with the existing criteria. * The method returns immediately, allowing the client to continue with other tasks. - * + * * @param criteria The new evaluation criteria to be added. */ public addCriteria(criteriaList: EvalCriteria[]): void { @@ -418,7 +447,7 @@ export default class EvaluationFunctionExecutor { console.log(`Adding new criteria: ${criteria.shortname}`); this.criteriaQueue.push(criteria); this.evalCriteria.push(criteria); - + // Start the generation and execution of functions for the new criteria if (!this.processing) { this.processNextCriteria(); @@ -512,16 +541,24 @@ export default class EvaluationFunctionExecutor { return new Map(this.grades); } - public estimateNumGPTCalls(perCriteriaGrades: Dict): { numGPT4Calls: number; numGPT35Calls: number }{ - + public estimateNumGPTCalls(perCriteriaGrades: Dict): { + numGPT4Calls: number; + numGPT35Calls: number; + } { let numGPT4Calls = 0; let numLLMCriteria = 0; for (const criteriaId in perCriteriaGrades) { const currGrade = perCriteriaGrades[criteriaId]; - const numGradedAsCurrGrade = this.examples.filter(example => this.perCriteriaGrades[example.uid] && this.perCriteriaGrades[example.uid][criteriaId] === currGrade).length; + const numGradedAsCurrGrade = this.examples.filter( + (example) => + this.perCriteriaGrades[example.uid] && + this.perCriteriaGrades[example.uid][criteriaId] === currGrade, + ).length; if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) { numGPT4Calls += 1; - const criteria = this.evalCriteria.find(criteria => criteria.uid === criteriaId); + const criteria = this.evalCriteria.find( + (criteria) => criteria.uid === criteriaId, + ); if (criteria && criteria.eval_method === "expert") { numLLMCriteria += 1; } @@ -529,10 +566,9 @@ export default class EvaluationFunctionExecutor { } return { - numGPT4Calls: numGPT4Calls, + numGPT4Calls, numGPT35Calls: numLLMCriteria * 3 * this.examples.length, }; - } /** @@ -544,9 +580,14 @@ export default class EvaluationFunctionExecutor { * @param exampleId The unique ID of the example being graded. * @param holisticGrade The developer-provided grade assigned to the example, "good" or "bad" or unknown. */ - public setGradeForExample(exampleId: ResponseUID, perCriteriaGrades?: Dict, holisticGrade?: string, annotation?: string ): void { + public setGradeForExample( + exampleId: ResponseUID, + perCriteriaGrades?: Dict, + holisticGrade?: string, + annotation?: string, + ): void { if (holisticGrade !== null) { - const boolHolistic = holisticGrade === "good" ? true : false; + const boolHolistic = holisticGrade === "good"; this.grades.set(exampleId, boolHolistic); } @@ -555,7 +596,9 @@ export default class EvaluationFunctionExecutor { // If holisticGrade was null, set it based on the perCriteriaGrades---if all criteria in the perCriteriaGrades are true, set the holisticGrade to true, else false if (holisticGrade === null) { - const allTrue = Object.values(perCriteriaGrades).every(value => value === true); + const allTrue = Object.values(perCriteriaGrades).every( + (value) => value === true, + ); this.grades.set(exampleId, allTrue); } } @@ -570,11 +613,19 @@ export default class EvaluationFunctionExecutor { for (const criteriaId in perCriteriaGrades) { const currGrade = perCriteriaGrades[criteriaId]; // With probability 1 / # graded examples for this criteria with currGrade, generate new implementations - const numGradedAsCurrGrade = this.examples.filter(example => this.perCriteriaGrades[example.uid] && this.perCriteriaGrades[example.uid][criteriaId] === currGrade).length; + const numGradedAsCurrGrade = this.examples.filter( + (example) => + this.perCriteriaGrades[example.uid] && + this.perCriteriaGrades[example.uid][criteriaId] === currGrade, + ).length; if (Math.random() <= 1 / (numGradedAsCurrGrade + 1)) { - console.log(`Generating new implementations for criteria: ${criteriaId}`); - const evalCriteria = this.evalCriteria.find(criteria => criteria.uid === criteriaId); + console.log( + `Generating new implementations for criteria: ${criteriaId}`, + ); + const evalCriteria = this.evalCriteria.find( + (criteria) => criteria.uid === criteriaId, + ); if (evalCriteria) { this.criteriaQueue.push(evalCriteria); if (!this.processing) { @@ -587,7 +638,9 @@ export default class EvaluationFunctionExecutor { } } - console.log(`Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`); + console.log( + `Generated new implementations for ${numCriteriaWithNewImplementations} criteria.`, + ); } /** @@ -819,8 +872,8 @@ export default class EvaluationFunctionExecutor { console.log(report); scoredFunctions.push({ - evalFunction: evalFunction, - failureCoverage: failureCoverage, + evalFunction, + failureCoverage, falseFailureRate: report.false_fail / (report.true_pass + report.false_fail), }); @@ -860,7 +913,7 @@ export default class EvaluationFunctionExecutor { // Create report of coverage, missed failures, selected functions, and all eval function reports const report = { failureCoverage: coverage, - falseFailureRate: falseFailureRate, + falseFailureRate, selectedEvalFunctions: bestEvalFunctions, allEvalFunctionReports: evalFunctionReport, }; @@ -988,7 +1041,7 @@ export default class EvaluationFunctionExecutor { // Create report of coverage, missed failures, selected functions, and all eval function reports const report = { failureCoverage: coverage, - falseFailureRate: falseFailureRate, + falseFailureRate, selectedEvalFunctions: oldReport.selectedEvalFunctions, allEvalFunctionReports: oldReport.allEvalFunctionReports, }; diff --git a/chainforge/react-server/src/backend/evalgen/oai_utils.ts b/chainforge/react-server/src/backend/evalgen/oai_utils.ts index 0d1ecaabc..b9f23ce18 100644 --- a/chainforge/react-server/src/backend/evalgen/oai_utils.ts +++ b/chainforge/react-server/src/backend/evalgen/oai_utils.ts @@ -84,8 +84,8 @@ export class OpenAIStreamer extends EventEmitter { "Content-Type": "application/json", }, body: JSON.stringify({ - model: model, - messages: messages, + model, + messages, stream: true, }), }, diff --git a/chainforge/react-server/src/backend/evalgen/utils.ts b/chainforge/react-server/src/backend/evalgen/utils.ts index 5aa1ecfc9..699d8abd6 100644 --- a/chainforge/react-server/src/backend/evalgen/utils.ts +++ b/chainforge/react-server/src/backend/evalgen/utils.ts @@ -131,8 +131,18 @@ export async function executeLLMEval( // Query an LLM as an evaluator let systemMessage = "You are an expert evaluator."; - if (positiveExample && positiveExample.responses[0] && negativeExample && negativeExample.responses[0]) { - systemMessage += " Please consider the following good example: " + positiveExample.responses[0] + " and bad example: " + negativeExample.responses[0] + " when making your evaluation."; + if ( + positiveExample && + positiveExample.responses[0] && + negativeExample && + negativeExample.responses[0] + ) { + systemMessage += + " Please consider the following good example: " + + positiveExample.responses[0] + + " and bad example: " + + negativeExample.responses[0] + + " when making your evaluation."; } const result = await simpleQueryLLM( @@ -272,7 +282,7 @@ export async function generateFunctionsForCriteria( criteria, promptTemplate, example, - badExample + badExample, ); console.log("Function generation prompt:", functionGenPrompt); From 2a32595e93966bf47b13d70a8d4477c3ad1f7fda Mon Sep 17 00:00:00 2001 From: w352chen Date: Mon, 1 Jul 2024 16:41:42 -0400 Subject: [PATCH 02/11] adding counting thumbs up/thumbs down feature --- chainforge/react-server/src/EvalGenModal.tsx | 55 +++++++++++++++++-- chainforge/react-server/src/EvalGenModel.css | 26 +++++++++ chainforge/react-server/src/backend/typing.ts | 5 ++ 3 files changed, 80 insertions(+), 6 deletions(-) create mode 100644 chainforge/react-server/src/EvalGenModel.css diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index a0cfac073..b8401d0b8 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -1,7 +1,7 @@ /** * EvalGen 2.0 * - * Ian Arawjo, Shreya Shankar, J.D. Zamf. + * Ian Arawjo, Shreya Shankar, J.D. Zamf., Helen Weixu Chen * * This file concerns the front-end to evaluation generator, EvalGen. * EvalGen supports users in generating eval funcs (here binary assertions) and aligning them with their preferences. @@ -52,10 +52,12 @@ import { } from "@mantine/core"; import { useDisclosure } from "@mantine/hooks"; import { + // CriteriaGradeCount, Dict, LLMResponse, PromptVarsDict, RatingDict, + ResponseUID, } from "./backend/typing"; import { EvalCriteria } from "./backend/evalgen/typing"; import { @@ -88,6 +90,7 @@ import EvaluationFunctionExecutor from "./backend/evalgen/executor"; import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils"; import { escapeBraces } from "./backend/template"; import { update } from "lodash"; +import "./EvalGenModel.css"; const INIT_CRITERIA: EvalCriteria[] = [ { @@ -116,10 +119,18 @@ const INIT_CRITERIA: EvalCriteria[] = [ const ThumbUpDownButtons = ({ grade, onChangeGrade, + getGradeCount, }: { grade: boolean | undefined; onChangeGrade: (newGrade: boolean | undefined) => void; + getGradeCount: (grade: boolean | undefined) => number; }) => { + // console.log( + // "getGradeCount", + // getGradeCount(true), + // getGradeCount(false), + // getGradeCount(undefined), + // ); return ( <> {/* Thumbs up/down buttons */} @@ -133,7 +144,10 @@ const ThumbUpDownButtons = ({ if (onChangeGrade) onChangeGrade(grade === true ? undefined : true); }} > - +
+ +
{getGradeCount(true)}
+
); @@ -158,6 +178,7 @@ export interface CriteriaCardProps { initiallyOpen?: boolean; grade: boolean | undefined; onChangeGrade: (newGrade: boolean | undefined) => void; + getGradeCount: (grade: boolean | undefined) => number; } const CriteriaCard: React.FC = ({ @@ -166,6 +187,7 @@ const CriteriaCard: React.FC = ({ onDelete, initiallyOpen, grade, + getGradeCount, onChangeGrade, }) => { const [opened, { toggle }] = useDisclosure(initiallyOpen ?? false); @@ -193,7 +215,11 @@ const CriteriaCard: React.FC = ({ {/* Thumbs up/down buttons */} - + {/* Title of the criteria */} >( setGrades((grades) => { if (!grades[responseUID]) grades[responseUID] = {}; grades[responseUID][criteriaUID] = newGrade; - grades[responseUID] = { ...grades[responseUID] }; + // grades[responseUID] = { ...grades[responseUID] }; + // console.error("grades-2", grades); return { ...grades }; }); updateGlobalRating(responseUID, "perCriteriaGrades", grades[responseUID]); }; + const getGradeCount = ( + responseUID: string, + criteriaUID: string, + grade: boolean | undefined, + ) => { + // console.log("getGradeCount", responseUID, criteriaUID, grade); + if (grades[responseUID]) { + return grade === grades[responseUID][criteriaUID] ? 1 : 0; // this needs to be changed after the grading feature is fully implemented on server side. + } + return 0; + }; // The EvalGen object responsible for generating, implementing, and filtering candidate implementations const [executor, setExecutor] = useState( @@ -742,7 +780,7 @@ If you determine the feedback corresponds to a new criteria, your response shoul }; React.useEffect(() => { - console.error("1111111111111111111111111111111111111"); + // console.error("1111111111111111111111111111111111111"); setShownResponse(responses[shownResponseIdx]); }, [shownResponseIdx]); @@ -828,6 +866,11 @@ If you determine the feedback corresponds to a new criteria, your response shoul ? grades[shownResponse.uid][e.uid] : undefined } + getGradeCount={(grade) => { + return shownResponse + ? getGradeCount(shownResponse.uid, e.uid, grade) + : 0; + }} onChangeGrade={(newGrade) => { if (shownResponse) setPerCriteriaGrade(shownResponse.uid, e.uid, newGrade); diff --git a/chainforge/react-server/src/EvalGenModel.css b/chainforge/react-server/src/EvalGenModel.css new file mode 100644 index 000000000..4c83ae1cf --- /dev/null +++ b/chainforge/react-server/src/EvalGenModel.css @@ -0,0 +1,26 @@ +.gradeContainer { + position: relative; + width: 20px; +} + +/* .gradeIcon { + position: absolute; + height: 60px; + width: 70%; + left: 50px; + top: 50px; +} */ + +.gradeUpCount { + position: absolute; + left: 12px; + top: -5px; + font-size: x-small; +} + +.gradeDownCount { + position: absolute; + left: 13px; + top: 13px; + font-size: x-small; +} diff --git a/chainforge/react-server/src/backend/typing.ts b/chainforge/react-server/src/backend/typing.ts index a6967a1c9..9883e1ec5 100644 --- a/chainforge/react-server/src/backend/typing.ts +++ b/chainforge/react-server/src/backend/typing.ts @@ -274,3 +274,8 @@ export type TabularDataColType = { export type PythonInterpreter = "flask" | "pyodide"; export type RatingDict = Record; + +// export type CriteriaGradeCount { +// on: number; +// off: number; +// } From 3a9c2759bf5693211d328dc1005aec49a3bed624 Mon Sep 17 00:00:00 2001 From: w352chen Date: Sun, 7 Jul 2024 01:47:43 -0400 Subject: [PATCH 03/11] Change name of Provide Additional Feedback;Moving submit feedback button to the right side of the Good and Bad;Create a new button called suggest criteria; fix the thumb up/down feature --- chainforge/react-server/src/EvalGenModal.tsx | 102 +++++++++++------- chainforge/react-server/src/EvalGenModel.css | 26 ----- chainforge/react-server/src/backend/utils.ts | 3 +- .../react-server/src/text-fields-node.css | 39 +++++++ 4 files changed, 105 insertions(+), 65 deletions(-) delete mode 100644 chainforge/react-server/src/EvalGenModel.css diff --git a/chainforge/react-server/src/EvalGenModal.tsx b/chainforge/react-server/src/EvalGenModal.tsx index b8401d0b8..351048346 100644 --- a/chainforge/react-server/src/EvalGenModal.tsx +++ b/chainforge/react-server/src/EvalGenModal.tsx @@ -73,6 +73,7 @@ import { IconTrash, IconFlagFilled, IconPencil, + IconSparkles, } from "@tabler/icons-react"; import { cleanMetavarsFilterFunc, @@ -90,7 +91,7 @@ import EvaluationFunctionExecutor from "./backend/evalgen/executor"; import { generateLLMEvaluationCriteria } from "./backend/evalgen/utils"; import { escapeBraces } from "./backend/template"; import { update } from "lodash"; -import "./EvalGenModel.css"; +// import "./EvalGenModel.css"; const INIT_CRITERIA: EvalCriteria[] = [ { @@ -383,7 +384,7 @@ const EvalGenModal = forwardRef>( [], ); const [shownResponseIdx, setShownResponseIdx] = useState(0); - const [shownResponseUniqueIdx, setShownResponseUniqueIdx] = useState(0); + // const [shownResponseUniqueIdx, setShownResponseUniqueIdx] = useState(0); const [annotation, setAnnotation] = useState(undefined); const [holisticGrade, setHolisticGrade] = useState< @@ -492,11 +493,26 @@ const EvalGenModal = forwardRef>( updateCriteriaForDisplay(); }, [criteria]); + const generateCriteria = (resps) => { + // Create criteria + // setIsLoadingCriteria((num) => num + 3); + genCriteriaFromContext(resps) + .then((crits) => setCriteria(crits.map((c) => ({ ...c, uid: uuid() })))) + .catch((err) => { + console.error(err); + }) + .finally(() => { + setIsLoadingCriteria((num) => num - 3); + setNumGPT4Calls((num) => num + 1); + }); + }; + // Open the EvalGen wizard const trigger = (resps: LLMResponse[]) => { // We pass the responses here manually to ensure they remain the same // for the duration of one EvalGen operation. setResponses(resps); + // console.log("tigger", resps); const firstGrades = resps.reduce( (acc: Dict>, curr) => { @@ -507,18 +523,12 @@ const EvalGenModal = forwardRef>( ); setGrades(firstGrades); - // Create criteria - setIsLoadingCriteria((num) => num + 3); - // console.log("*****************************resps", resps); - genCriteriaFromContext(resps) - .then((crits) => setCriteria(crits.map((c) => ({ ...c, uid: uuid() })))) - .catch((err) => { - console.error(err); - }) - .finally(() => { - setIsLoadingCriteria((num) => num - 3); - setNumGPT4Calls((num) => num + 1); - }); + console.log("*****************************resps", resps); + if (criteria && criteria.length === 0) { + generateCriteria(resps); + } else { + console.log("Shut up!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!", criteria.length); + } setShownResponseIdx(0); if (resps.length > 0) { @@ -888,7 +898,8 @@ If you determine the feedback corresponds to a new criteria, your response shoul ) : ( <> )} -
+ {/*
*/} +
{/* -
+ {/*
+
*/} + + {/*
*/} +
- Provide Additional Feedback + Suggest New Criteria Based on the Feedback