From 98946c326184945a1ad7bcac61e110eb7d51fcda Mon Sep 17 00:00:00 2001 From: scuuy <912074188@qq.com> Date: Wed, 4 Mar 2026 22:24:50 +0800 Subject: [PATCH] new eval framework doc (zh&en) --- docs/.vuepress/notes/en/guide.ts | 2 + docs/.vuepress/notes/zh/guide.ts | 2 + .../guide/model_evaluation/command_eval.md | 6 +- .../guide/model_evaluation/easy_evaluation.md | 4 +- .../guide/model_evaluation/overview_info.md | 24 ++ .../guide/model_evaluation/unified_eval.md | 274 ++++++++++++++++++ .../guide/model_evaluation/command_eval.md | 2 +- .../guide/model_evaluation/easy_evaluation.md | 2 +- .../guide/model_evaluation/overview_info.md | 24 ++ .../guide/model_evaluation/unified_eval.md | 274 ++++++++++++++++++ 10 files changed, 607 insertions(+), 7 deletions(-) create mode 100644 docs/en/notes/guide/model_evaluation/overview_info.md create mode 100644 docs/en/notes/guide/model_evaluation/unified_eval.md create mode 100644 docs/zh/notes/guide/model_evaluation/overview_info.md create mode 100644 docs/zh/notes/guide/model_evaluation/unified_eval.md diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts index 77e9ff50a5..66aa93bebc 100644 --- a/docs/.vuepress/notes/en/guide.ts +++ b/docs/.vuepress/notes/en/guide.ts @@ -92,8 +92,10 @@ export const Guide: ThemeNote = defineNoteConfig({ icon: 'carbon:flow', prefix: 'model_evaluation', items: [ + "overview_info", "command_eval", "easy_evaluation", + "unified_eval" ] }, { diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts index 1f3992cf5c..ab6dd90425 100644 --- a/docs/.vuepress/notes/zh/guide.ts +++ b/docs/.vuepress/notes/zh/guide.ts @@ -90,8 +90,10 @@ export const Guide: ThemeNote = defineNoteConfig({ icon: 'carbon:flow', prefix: 'model_evaluation', items: [ + "overview_info", "command_eval", "easy_evaluation", + "unified_eval" ] }, { diff --git a/docs/en/notes/guide/model_evaluation/command_eval.md b/docs/en/notes/guide/model_evaluation/command_eval.md index 
59c67ffd69..e5701b2bbd 100644 --- a/docs/en/notes/guide/model_evaluation/command_eval.md +++ b/docs/en/notes/guide/model_evaluation/command_eval.md @@ -1,11 +1,11 @@ --- -title: Model Capability Assessment Pipeline +title: Model Evaluation (QA Quickstart) createTime: 2025/08/30 14:27:02 icon: hugeicons:chart-evaluation permalink: /en/guide/evaluation-pipeline/ --- -# Model Capability Assessment Pipeline +# Model Evaluation (QA Quickstart) ⚠️Only supports QA pair format evaluation @@ -163,4 +163,4 @@ dataflow eval local Run API evaluation: ```bash dataflow eval api -``` \ No newline at end of file +``` diff --git a/docs/en/notes/guide/model_evaluation/easy_evaluation.md b/docs/en/notes/guide/model_evaluation/easy_evaluation.md index 79f622d3d8..8c35f56b42 100644 --- a/docs/en/notes/guide/model_evaluation/easy_evaluation.md +++ b/docs/en/notes/guide/model_evaluation/easy_evaluation.md @@ -1,11 +1,11 @@ --- -title: easy_evaluation +title: Model Evaluation (Beginner Edition) icon: hugeicons:chart-evaluation createTime: 2025/10/17 15:20:10 permalink: /en/guide/97wq40d9/ --- -# 📊 Model Evaluation Pipeline Guide +# 📊 Model Evaluation (Beginner Edition) This guide explains how to use the **DataFlow** evaluation pipeline to assess model-generated answers against ground-truth answers using either **semantic** or **exact match** comparison. Two evaluation modes are supported: diff --git a/docs/en/notes/guide/model_evaluation/overview_info.md b/docs/en/notes/guide/model_evaluation/overview_info.md new file mode 100644 index 0000000000..bfc137e6cf --- /dev/null +++ b/docs/en/notes/guide/model_evaluation/overview_info.md @@ -0,0 +1,24 @@ +--- +title: Model Evaluation Overview +icon: solar:flag-2-broken +createTime: 2026/03/04 17:33:23 +permalink: /en/guide/0zegorzv/ +--- + +# Model Evaluation Overview + +DataFlow provides three model evaluation options “from easy to advanced”, covering needs from quick start to research-grade benchmark evaluation. 
**You only need to choose and read ONE of the following documents to complete your evaluation** (these are different entry points; you do not need to learn all of them). + +## How to Choose + +| Which user are you? | What you want | Recommended Reading | +|---|---|---| +| 👶 Beginner - want to get started fast | Evaluate directly via CLI (for QA data, works out of the box) | [Model Evaluation (QA Quickstart)](/en/guide/evaluation-pipeline/) | +| 🧑‍💻 Beginner+ - simple parameter tuning, model before/after comparison | Modify pipeline script parameters (more straightforward) | [Model Evaluation (Beginner Edition)](/en/guide/97wq40d9/) | +| 🧪 Researcher - academic, standardized benchmark metrics | Unified benchmark evaluation framework (task types + full evaluation parameters) | [Model Evaluation (Research Edition)](/en/guide/41y6wer6/) | + +## Document Entries + +- [Model Evaluation (QA Quickstart)](/en/guide/evaluation-pipeline/): CLI-based, beginner-friendly, suitable for quick evaluation on **QA-style** datasets. +- [Model Evaluation (Beginner Edition)](/en/guide/97wq40d9/): pipeline-code based, for beginner/intermediate users, adjust evaluation settings by **editing script parameters**. +- [Model Evaluation (Research Edition)](/en/guide/41y6wer6/): research-grade evaluation, for users who need to pass full evaluation parameters to evaluate specific benchmarks. diff --git a/docs/en/notes/guide/model_evaluation/unified_eval.md b/docs/en/notes/guide/model_evaluation/unified_eval.md new file mode 100644 index 0000000000..3ad534df86 --- /dev/null +++ b/docs/en/notes/guide/model_evaluation/unified_eval.md @@ -0,0 +1,274 @@ +--- +title: Model Evaluation (Research Edition) +icon: hugeicons:chart-evaluation +createTime: 2026/03/04 16:41:11 +permalink: /en/guide/41y6wer6/ +--- + +# Model Evaluation (Unified Bench Eval) + +DataFlow-Eval (Models) is DataFlow’s in-house model evaluation framework. 
It abstracts common benchmark evaluation paradigms into a set of mutually exclusive evaluation types (`eval_type`) and provides ready-to-run pipeline scripts, enabling users to evaluate with minimal configuration, write per-sample results back to the dataframe, and export aggregated statistics. + +Unified evaluation scripts directory: `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval` + +>*If you start DataFlow via the `init` workflow, you can directly locate the `gpu_pipelines` directory.* + +## ✅ Workflow + +To evaluate a specific model, follow these steps: + +1. **Choose the evaluation type**: select `eval_type` based on your dataset schema (enable semantic judging when needed) +2. **Pick the pipeline script**: open the corresponding pipeline file under the scripts directory +3. **Edit evaluation parameters**: configure data path, cache directory, model serving, and field mapping +4. **Run and inspect results**: run the pipeline, then inspect per-sample result columns and aggregated statistics + +## 🧰 Environment Setup + +Before running the evaluation pipelines locally, install DataFlow: + +```bash +cd DataFlow +pip install -e . +``` + +If you use local model serving (e.g., vLLM), make sure your environment has the required GPU/driver and dependencies. + + +## 🧩 Evaluation Types + +`eval_type` defines the required fields (keys) per sample, as well as the default metric/logic. + +**Field conventions:** +- Keys do not include the prompt string itself; they only include variables that will be injected into the prompt (e.g., `question` / `choices` / `context`). +- `context` is an optional field across all types: if present it will be used; otherwise it is treated as `None` (so you do not need separate benches for “with/without context”). 
+ +### Type Overview + +| eval_type | Paradigm | Required keys | Default metric/logic | Example benches | Script | +|---|---|---|---|---|---| +| `key1_text_score` | Text scoring | `text` | `ppl` | WikiText / PTB | `unified_bench_eval_type1.py` | +| `key2_qa` | Generative: single reference | `question`<br>`target` | `math_verify` (optional semantic judge) | GSM8K / MATH | `unified_bench_eval_type2.py` | +| `key2_q_ma` | Generative: multiple references | `question`<br>`targets[]` | `any_math_verify` | SQuAD (multiple golds) | `unified_bench_eval_type3.py` | +| `key3_q_choices_a` | Multiple choice: single correct | `question`<br>`choices[]`<br>`label` | `ll_choice_acc` (choice loglikelihood) | PIQA / ARC / MMLU | `unified_bench_eval_type4.py` | +| `key3_q_choices_as` | Multiple choice: multiple correct | `question`<br>`choices[]`<br>`labels[]` | `micro_f1` | Multi-select / multi-label | `unified_bench_eval_type5.py` | +| `key3_q_a_rejected` | Preference: pairwise comparison | `question`<br>`better`<br>`rejected` | `pairwise_ll_winrate` | DPO / preference data | `unified_bench_eval_type6.py` | + +### Semantic Judging Toggle (key2_qa only) + +Semantic judging is not a standalone type; it is a toggle for `key2_qa`: + +- `use_semantic_judge=False`: default `math_verify` (best for verifiable answers) +- `use_semantic_judge=True`: LLM-based `semantic_judge` (best for open-ended answers) + +Reference script (in the same directory): `unified_bench_eval_type_semantic.py` + +## 📦 Data Preparation + +Unified evaluation supports `jsonl` or `json` as input formats by default. You may keep any additional fields (e.g., `id`, `eval_type`). The evaluation only depends on the column names you explicitly set in the pipeline. + +## 🧱 Type Details and Examples (Collapsible) + +
<details> +<summary>Type1: key1_text_score (Text scoring / PPL)</summary> + +**Required keys:** +- `text` + +**Example data (jsonl):** + +```json +{"id":"t_0001","text":"The capital of France is Paris."} +{"id":"t_0002","text":"Perplexity is a common metric for language modeling."} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py` + +**Notes:** +- This type does not require answer generation; it computes `ppl` directly from `text`. + +</details>
+ +<details>
+<summary>Type2: key2_qa (Generative: single reference)</summary> + +**Required keys:** +- `question` +- `target` (the column name can be customized; map it via `input_target_key` in the pipeline) + +**Example data (jsonl):** + +```json +{"id":"qa_0001","question":"Solve for x: 2x + 3 = 11.","target":"x = 4","context":null} +{"id":"qa_0002","question":"What is the capital of France?","target":"Paris","context":"Answer in one word."} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py` + +**Notes:** +- The default evaluation logic is `math_verify` (strict and verifiable). +- For semantic judging, set `use_semantic_judge=True` and refer to `.../unified_bench_eval_type_semantic.py`. + +</details>
+ +<details>
+<summary>Type3: key2_q_ma (Generative: multiple references)</summary> + +**Required keys:** +- `question` +- `targets` (list; JSON-stringified list is also supported) + +**Example data (jsonl):** + +```json +{"id":"ma_0001","question":"What is the chemical formula for water?","targets":["H2O","h2o"],"context":"Use chemical symbols."} +{"id":"ma_0002","question":"Who created Python?","targets":["Guido van Rossum","Guido"],"context":null} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py` + +**Notes:** +- Default logic is `any_math_verify`: any match among references counts as correct. + +</details>
+ +<details>
+<summary>Type4: key3_q_choices_a (Multiple choice: single correct)</summary> + +**Required keys:** +- `question` +- `choices` (list) +- `label` (0-based index) + +**Example data (jsonl):** + +```json +{"id":"mc_0001","question":"What is the capital of France?","choices":["Paris","London","Berlin","Rome"],"label":0,"context":null} +{"id":"mc_0002","question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1,"context":"Choose exactly one option."} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py` + +**Notes:** +- Default metric is `ll_choice_acc`: compute loglikelihood for each choice, take argmax, compare with `label`. +- This type usually does not need `generated_ans` (the pipeline typically skips generation by default). + +</details>
+ +<details>
+<summary>Type5: key3_q_choices_as (Multiple choice: multiple correct)</summary> + +**Required keys:** +- `question` +- `choices` (list) +- `labels` (0-based index list) + +**Example data (jsonl):** + +```json +{"id":"ms_0001","question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2],"context":null} +{"id":"ms_0002","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2],"context":"Select all that apply."} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py` + +**Notes:** +- Default metric is `micro_f1` (parse the model output as a multi-select set and compute micro-F1). +- This type requires generating `generated_ans` by default (or provide your own prediction column and point `input_pred_key` to it). + +</details>
+ +<details>
+<summary>Type6: key3_q_a_rejected (Preference: pairwise comparison)</summary> + +**Required keys:** +- `question` +- `better` +- `rejected` + +**Example data (jsonl):** + +```json +{"id":"pw_0001","question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, and performs poorly on unseen data.","rejected":"Overfitting means the model is always perfect.","context":null} +{"id":"pw_0002","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x.","context":"Answer with a direct statement."} +``` + +**Pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py` + +**Notes:** +- Default metric is `pairwise_ll_winrate`: compare the loglikelihood of `P(better|prompt)` vs `P(rejected|prompt)` and compute win rate. +- This type does not require answer generation (the pipeline typically skips generation by default). + +</details>
+ +## ⚙️ Parameter Configuration (What to Edit) + +Unified Bench Eval pipeline files already implement the evaluation flow. For most users, you only need to edit two parts in the pipeline code: + +1. **Data and cache (FileStorage)** +2. **Model serving (Serving: local model / API judge)** + +Below is a minimal “what to change” example (replace paths and model parameters according to the comments): + +```python +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request + +# 1) Data & cache (FileStorage): set your evaluation data path and cache directory here +storage = FileStorage( + first_entry_file_name="path/to/your_eval_data.jsonl", # TODO: your jsonl/json dataset file + cache_path="./cache_local", # TODO: cache directory (stores intermediate & final results) + file_name_prefix="your_bench_name", # TODO: cache/result prefix (to distinguish runs) + cache_type="jsonl", # TODO: match your input type (jsonl/json) +) + +# 2) Model serving: choose local serving or API based on your needs +# 2.1 Local model serving (commonly used for generation, PPL/LL, etc.) +llm_serving_local = LocalModelLLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # TODO: replace with your local path or HF model id + vllm_tensor_parallel_size=1, # TODO: tensor parallel config (multi-GPU) + vllm_max_tokens=2048, # TODO: max generation tokens +) + +# 2.2 API judge serving (only needed when key2_qa uses semantic judging: use_semantic_judge=True) +llm_serving_judge = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", # TODO: replace with your API endpoint + model_name="gpt-4o", # TODO: replace with your judge model + max_workers=5, # TODO: concurrency +) + +# Then, in the target pipeline, replace FileStorage and Serving objects with the configs above. 
+# For example: +# self.storage = storage +# self.llm_serving_generator = llm_serving_local +# self.llm_serving_judger = llm_serving_judge +``` + +## ▶️ Run Evaluation + +```bash +python unified_bench_eval_type2.py +``` + +## 📊 Outputs and Field Meanings + +After evaluation, results are written back to the cached dataframe with the following columns (column names can be customized via evaluator parameters): + +- `eval_score`: numeric score (0/1 for accuracy-style tasks; float for PPL) +- `eval_pred`: parsed prediction information (e.g., choice parsing, loglikelihood info) +- `eval_valid`: whether the sample evaluation is valid +- `eval_error`: error message (e.g., `ll_unavailable` / `ppl_unavailable`) + +Additionally, aggregated statistics are saved to `eval_result_path` (configured in the pipeline scripts) for quick overall inspection. + +## 🔎 Reference Implementation + +For deeper customization (prompts, metrics, parsing logic), refer to the operator implementations: + +- `dataflow/operators/core_text/generate/bench_answer_generator.py` +- `dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py` diff --git a/docs/zh/notes/guide/model_evaluation/command_eval.md b/docs/zh/notes/guide/model_evaluation/command_eval.md index 32485167b1..4ce198f823 100644 --- a/docs/zh/notes/guide/model_evaluation/command_eval.md +++ b/docs/zh/notes/guide/model_evaluation/command_eval.md @@ -1,5 +1,5 @@ --- -title: EvalPipeline +title: 模型评估(小白QA快速版) createTime: 2025/10/20 11:30:42 icon: hugeicons:chart-evaluation permalink: /zh/guide/cqro9oa8/ diff --git a/docs/zh/notes/guide/model_evaluation/easy_evaluation.md b/docs/zh/notes/guide/model_evaluation/easy_evaluation.md index d452c5281c..545f29c43f 100644 --- a/docs/zh/notes/guide/model_evaluation/easy_evaluation.md +++ b/docs/zh/notes/guide/model_evaluation/easy_evaluation.md @@ -1,5 +1,5 @@ --- -title: 模型评估流水线 +title: 模型评估(小白简易版) icon: hugeicons:chart-evaluation createTime: 2025/10/17 15:00:50 permalink: 
/zh/guide/enty5ksn/ diff --git a/docs/zh/notes/guide/model_evaluation/overview_info.md b/docs/zh/notes/guide/model_evaluation/overview_info.md new file mode 100644 index 0000000000..7251e21669 --- /dev/null +++ b/docs/zh/notes/guide/model_evaluation/overview_info.md @@ -0,0 +1,24 @@ +--- +title: 模型评估概述 +icon: solar:flag-2-broken +createTime: 2026/03/04 17:33:23 +permalink: /zh/guide/0zegorzv/ +--- + +# 模型评估概述 + +DataFlow 提供三种“从易到难”的模型评估方式,覆盖从快速上手到科研级 bench 评测的不同需求。**你只需要选择并阅读其中一种文档即可完成评估**(三者是不同入口,不需要全部学习)。 + +## 如何选择 + +| 你更像哪类用户 | 你希望怎么用 | 推荐阅读 | +|---|---|---| +| 👶 新手 - 只想快速跑通 | 命令行直接评估(适合 QA 数据,开箱即用) | [模型评估(小白QA快速版)](/zh/guide/cqro9oa8/) | +| 🧑‍💻 新手进阶 - 简单调整参数,只进行模型的前后对比 | 通过 pipeline 脚本修改函数传参(更直观) | [模型评估(小白简易版)](/zh/guide/enty5ksn/) | +| 🧪 科研工作者 - bench 采用学术通用评测指标 | Benchmark 统一评测框架(划分任务类型,传递完整评测参数) | [模型评估(科研完整版)](/zh/guide/41y6wer6/) | + +## 文档入口说明 + +- [模型评估(小白QA快速版)](/zh/guide/cqro9oa8/):命令行方式,面向小白,适合对 **QA 类型数据**做快速评测。 +- [模型评估(小白简易版)](/zh/guide/enty5ksn/):pipeline 代码方式,面向小白/进阶用户,通过 **修改脚本参数** 调整评测设置。 +- [模型评估(科研完整版)](/zh/guide/41y6wer6/):科研完整版评估,面向需要传递完整评测参数评测指定Benchmark的用户。 diff --git a/docs/zh/notes/guide/model_evaluation/unified_eval.md b/docs/zh/notes/guide/model_evaluation/unified_eval.md new file mode 100644 index 0000000000..65ee0c0ad9 --- /dev/null +++ b/docs/zh/notes/guide/model_evaluation/unified_eval.md @@ -0,0 +1,274 @@ +--- +title: 模型评估(科研完整版) +icon: hugeicons:chart-evaluation +createTime: 2026/03/04 16:41:11 +permalink: /zh/guide/41y6wer6/ +--- + +# 模型评估(Unified Bench Eval) + +DataFlow-Eval (Models) 是 DataFlow 自研的模型评估框架:它将常见 benchmark 的评测范式抽象为若干互斥的评测类型(`eval_type`),并提供开箱即用的评测流水线脚本,帮助用户以最少配置完成评测、写回逐样本结果并输出汇总统计。 + +统一评测脚本目录:`dataflow/statics/pipelines/gpu_pipelines/benchmark_eval` + +>*init 方式启动 DataFlow 则直接找到`gpu_pipelines`目录即可* + +## ✅ 使用流程 + +对指定模型进行评测时,按以下顺序操作: + +1. **确定评测类型**:根据数据结构选择 `eval_type`(必要时开启语义评测开关) +2. **选择评测流水线**:进入脚本目录,找到对应 type 的 pipeline 文件 +3. **填写评测参数**:配置数据路径、缓存目录、模型 serving、字段名映射 +4. 
**运行并查看结果**:运行 pipeline,查看逐样本结果列与汇总统计文件 + +## 🧰 环境准备 + +在本地运行评测流水线前,安装 DataFlow: + +```bash +cd DataFlow +pip install -e . +``` + +如使用本地模型 serving(例如 vLLM),请确保运行环境具备对应 GPU/驱动与依赖。 + + +## 🧩 评测类型 + +`eval_type` 定义了每条样本必须包含的字段(keys),以及默认的评测指标/逻辑。 + +**字段约定:** +- keys 不包含 prompt 本身的字符串,仅包含需要嵌入 prompt 的变量字段(如 `question` / `choices` / `context`)。 +- `context` 为统一可选字段:存在即使用;不传或不存在即视为 `None`(无需再拆分“有/无上下文”的 bench)。 + +### 类型总览 + +| eval_type | 类型范式 | 必要 keys | 默认 metric/逻辑 | 示例 Bench | 脚本文件 | +|---|---|---|---|---|---| +| `key1_text_score` | 文本打分 | `text` | `ppl` | WikiText / PTB | `unified_bench_eval_type1.py` | +| `key2_qa` | 生成式:单参考答案 | `question`<br>`target` | `math_verify`(可选语义评测) | GSM8K / MATH | `unified_bench_eval_type2.py` | +| `key2_q_ma` | 生成式:多参考答案 | `question`<br>`targets[]` | `any_math_verify` | SQuAD(多 gold) | `unified_bench_eval_type3.py` | +| `key3_q_choices_a` | 选择题:单正确 | `question`<br>`choices[]`<br>`label` | `ll_choice_acc`(loglikelihood 选项打分) | PIQA / ARC / MMLU | `unified_bench_eval_type4.py` | +| `key3_q_choices_as` | 选择题:多正确 | `question`<br>`choices[]`<br>`labels[]` | `micro_f1` | 多选题 / 多标签 | `unified_bench_eval_type5.py` | +| `key3_q_a_rejected` | 偏好/排序:成对比较 | `question`<br>`better`<br>`rejected` | `pairwise_ll_winrate` | DPO/偏好数据 | `unified_bench_eval_type6.py` | + +### 语义评测开关(仅适用于 key2_qa) + +语义评测不是独立类型,而是 `key2_qa` 的评测开关: + +- `use_semantic_judge=False`:默认 `math_verify`(适合可验证答案) +- `use_semantic_judge=True`:使用 LLM judge 的 `semantic_judge`(适合开放式答案) + +对应示例脚本(同目录下):`unified_bench_eval_type_semantic.py` + +## 📦 数据准备 + +统一评测默认支持 `jsonl`或`json` 作为输入格式。你可以保留任意额外字段(如 `id`、`eval_type`),评测只依赖你在 pipeline 中显式填写的列名。 + +## 🧱 各类型说明与示例(可折叠) + +
<details> +<summary>Type1:key1_text_score(文本打分 / PPL)</summary> + +**必要 keys:** +- `text` + +**数据示例(jsonl):** + +```json +{"id":"t_0001","text":"The capital of France is Paris."} +{"id":"t_0002","text":"Perplexity is a common metric for language modeling."} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py` + +**说明:** +- 该类型不需要生成答案,评测直接对 `text` 计算 `ppl`。 + +</details>
+ +<details>
+<summary>Type2:key2_qa(生成式:单参考答案)</summary> + +**必要 keys:** +- `question` +- `target`(列名可自定义,只要在 pipeline 中用 `input_target_key` 映射即可) + +**数据示例(jsonl):** + +```json +{"id":"qa_0001","question":"Solve for x: 2x + 3 = 11.","target":"x = 4","context":null} +{"id":"qa_0002","question":"What is the capital of France?","target":"Paris","context":"Answer in one word."} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py` + +**说明:** +- 默认评测逻辑为 `math_verify`(偏严格、可验证)。 +- 如需语义评测,使用 `use_semantic_judge=True`,参考脚本 `.../unified_bench_eval_type_semantic.py`。 + +</details>
+ +<details>
+<summary>Type3:key2_q_ma(生成式:多参考答案)</summary> + +**必要 keys:** +- `question` +- `targets`(list;也可为 JSON 字符串形式的 list) + +**数据示例(jsonl):** + +```json +{"id":"ma_0001","question":"What is the chemical formula for water?","targets":["H2O","h2o"],"context":"Use chemical symbols."} +{"id":"ma_0002","question":"Who created Python?","targets":["Guido van Rossum","Guido"],"context":null} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py` + +**说明:** +- 默认评测逻辑为 `any_math_verify`:多参考答案中任意一个命中即视为正确。 + +</details>
+ +<details>
+<summary>Type4:key3_q_choices_a(选择题:单正确)</summary> + +**必要 keys:** +- `question` +- `choices`(list) +- `label`(0-based 下标) + +**数据示例(jsonl):** + +```json +{"id":"mc_0001","question":"What is the capital of France?","choices":["Paris","London","Berlin","Rome"],"label":0,"context":null} +{"id":"mc_0002","question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1,"context":"Choose exactly one option."} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py` + +**说明:** +- 默认使用 `ll_choice_acc`:对每个 choice 计算 loglikelihood,取 argmax 与 `label` 比较。 +- 该类型通常不需要生成 `generated_ans`(pipeline 中会默认跳过生成步骤)。 + +</details>
+ +<details>
+<summary>Type5:key3_q_choices_as(选择题:多正确)</summary> + +**必要 keys:** +- `question` +- `choices`(list) +- `labels`(0-based 下标列表) + +**数据示例(jsonl):** + +```json +{"id":"ms_0001","question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2],"context":null} +{"id":"ms_0002","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2],"context":"Select all that apply."} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py` + +**说明:** +- 默认 metric 为 `micro_f1`(将模型输出解析为多选集合后计算)。 +- 该类型默认需要生成 `generated_ans`(也可直接提供预测列并将 `input_pred_key` 指向该列)。 + +</details>
+ +<details>
+<summary>Type6:key3_q_a_rejected(偏好/排序:成对比较)</summary> + +**必要 keys:** +- `question` +- `better` +- `rejected` + +**数据示例(jsonl):** + +```json +{"id":"pw_0001","question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, and performs poorly on unseen data.","rejected":"Overfitting means the model is always perfect.","context":null} +{"id":"pw_0002","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x.","context":"Answer with a direct statement."} +``` + +**对应 pipeline:** +- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py` + +**说明:** +- 默认 metric 为 `pairwise_ll_winrate`:比较 `P(better|prompt)` 与 `P(rejected|prompt)` 的 loglikelihood,统计 win rate。 +- 该类型不需要生成答案(pipeline 中会默认跳过生成步骤)。 + +</details>
+ +## ⚙️ 参数配置(参考写法) + +Unified Bench Eval 的 pipeline 文件已经写好评测流程。对大多数用户来说,只需要在流水线代码里修改两处配置即可: + +1. **数据与缓存(FileStorage)** +2. **模型服务(Serving:本地模型 / API judge)** + +下面给出一个最小“需要改哪里”的示例(按注释替换为你的路径与模型参数即可): + +```python +from dataflow.utils.storage import FileStorage +from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request + +# 1) 数据与缓存(FileStorage):把你的评测数据路径与缓存目录填在这里 +storage = FileStorage( + first_entry_file_name="path/to/your_eval_data.jsonl", # TODO: 你的 jsonl/json 数据文件 + cache_path="./cache_local", # TODO: 缓存目录(会写入中间结果与评测结果) + file_name_prefix="your_bench_name", # TODO: 结果/缓存前缀(用于区分不同评测) + cache_type="jsonl", # TODO: 与输入文件一致(jsonl/json) +) + +# 2) 模型服务(Serving):按需求选择本地模型或 API +# 2.1 本地模型 serving(常用于生成、PPL/LL 等) +llm_serving_local = LocalModelLLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # TODO: 替换为你的模型路径或 HF 名称 + vllm_tensor_parallel_size=1, # TODO: 多卡并行配置 + vllm_max_tokens=2048, # TODO: 生成长度上限 +) + +# 2.2 API judge serving(仅在 key2_qa 语义评测 use_semantic_judge=True 时需要) +llm_serving_judge = APILLMServing_request( + api_url="https://api.openai.com/v1/chat/completions", # TODO: 替换为你的 API 地址 + model_name="gpt-4o", # TODO: 替换为你的 judge 模型 + max_workers=5, # TODO: 并发数 +) + +# 之后在对应 pipeline 里,把 FileStorage 与 Serving 对象替换为上面的配置即可 +# 例如: +# self.storage = storage +# self.llm_serving_generator = llm_serving_local +# self.llm_serving_judger = llm_serving_judge +``` + +## ▶️ 运行评测 + +```bash +python unified_bench_eval_type2.py +``` + +## 📊 结果产物与字段含义 + +评测结束后,结果会写回缓存 DataFrame,并包含以下列(可在 evaluator 参数中自定义列名): + +- `eval_score`:数值评分(accuracy 类任务为 0/1;PPL 为浮点数) +- `eval_pred`:解析后的预测信息(如选项解析、loglikelihood 等) +- `eval_valid`:该样本评测是否有效 +- `eval_error`:错误信息(例如 `ll_unavailable` / `ppl_unavailable`) + +此外,汇总统计会写入 `eval_result_path`(由各 pipeline 脚本配置),用于快速查看整体指标。 + +## 🔎 参考实现 + +如需深度自定义(prompt、metric、解析逻辑等),建议阅读算子实现: + +- `dataflow/operators/core_text/generate/bench_answer_generator.py` +- 
`dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py`