From 98946c326184945a1ad7bcac61e110eb7d51fcda Mon Sep 17 00:00:00 2001
From: scuuy <912074188@qq.com>
Date: Wed, 4 Mar 2026 22:24:50 +0800
Subject: [PATCH] new eval framework doc (zh&en)
---
docs/.vuepress/notes/en/guide.ts | 2 +
docs/.vuepress/notes/zh/guide.ts | 2 +
.../guide/model_evaluation/command_eval.md | 6 +-
.../guide/model_evaluation/easy_evaluation.md | 4 +-
.../guide/model_evaluation/overview_info.md | 24 ++
.../guide/model_evaluation/unified_eval.md | 274 ++++++++++++++++++
.../guide/model_evaluation/command_eval.md | 2 +-
.../guide/model_evaluation/easy_evaluation.md | 2 +-
.../guide/model_evaluation/overview_info.md | 24 ++
.../guide/model_evaluation/unified_eval.md | 274 ++++++++++++++++++
10 files changed, 607 insertions(+), 7 deletions(-)
create mode 100644 docs/en/notes/guide/model_evaluation/overview_info.md
create mode 100644 docs/en/notes/guide/model_evaluation/unified_eval.md
create mode 100644 docs/zh/notes/guide/model_evaluation/overview_info.md
create mode 100644 docs/zh/notes/guide/model_evaluation/unified_eval.md
diff --git a/docs/.vuepress/notes/en/guide.ts b/docs/.vuepress/notes/en/guide.ts
index 77e9ff50a5..66aa93bebc 100644
--- a/docs/.vuepress/notes/en/guide.ts
+++ b/docs/.vuepress/notes/en/guide.ts
@@ -92,8 +92,10 @@ export const Guide: ThemeNote = defineNoteConfig({
icon: 'carbon:flow',
prefix: 'model_evaluation',
items: [
+ "overview_info",
"command_eval",
"easy_evaluation",
+ "unified_eval"
]
},
{
diff --git a/docs/.vuepress/notes/zh/guide.ts b/docs/.vuepress/notes/zh/guide.ts
index 1f3992cf5c..ab6dd90425 100644
--- a/docs/.vuepress/notes/zh/guide.ts
+++ b/docs/.vuepress/notes/zh/guide.ts
@@ -90,8 +90,10 @@ export const Guide: ThemeNote = defineNoteConfig({
icon: 'carbon:flow',
prefix: 'model_evaluation',
items: [
+ "overview_info",
"command_eval",
"easy_evaluation",
+ "unified_eval"
]
},
{
diff --git a/docs/en/notes/guide/model_evaluation/command_eval.md b/docs/en/notes/guide/model_evaluation/command_eval.md
index 59c67ffd69..e5701b2bbd 100644
--- a/docs/en/notes/guide/model_evaluation/command_eval.md
+++ b/docs/en/notes/guide/model_evaluation/command_eval.md
@@ -1,11 +1,11 @@
---
-title: Model Capability Assessment Pipeline
+title: Model Evaluation (QA Quickstart)
createTime: 2025/08/30 14:27:02
icon: hugeicons:chart-evaluation
permalink: /en/guide/evaluation-pipeline/
---
-# Model Capability Assessment Pipeline
+# Model Evaluation (QA Quickstart)
⚠️Only supports QA pair format evaluation
@@ -163,4 +163,4 @@ dataflow eval local
Run API evaluation:
```bash
dataflow eval api
-```
\ No newline at end of file
+```
diff --git a/docs/en/notes/guide/model_evaluation/easy_evaluation.md b/docs/en/notes/guide/model_evaluation/easy_evaluation.md
index 79f622d3d8..8c35f56b42 100644
--- a/docs/en/notes/guide/model_evaluation/easy_evaluation.md
+++ b/docs/en/notes/guide/model_evaluation/easy_evaluation.md
@@ -1,11 +1,11 @@
---
-title: easy_evaluation
+title: Model Evaluation (Beginner Edition)
icon: hugeicons:chart-evaluation
createTime: 2025/10/17 15:20:10
permalink: /en/guide/97wq40d9/
---
-# 📊 Model Evaluation Pipeline Guide
+# 📊 Model Evaluation (Beginner Edition)
This guide explains how to use the **DataFlow** evaluation pipeline to assess model-generated answers against ground-truth answers using either **semantic** or **exact match** comparison.
Two evaluation modes are supported:
diff --git a/docs/en/notes/guide/model_evaluation/overview_info.md b/docs/en/notes/guide/model_evaluation/overview_info.md
new file mode 100644
index 0000000000..bfc137e6cf
--- /dev/null
+++ b/docs/en/notes/guide/model_evaluation/overview_info.md
@@ -0,0 +1,24 @@
+---
+title: Model Evaluation Overview
+icon: solar:flag-2-broken
+createTime: 2026/03/04 17:33:23
+permalink: /en/guide/0zegorzv/
+---
+
+# Model Evaluation Overview
+
+DataFlow provides three model evaluation options “from easy to advanced”, covering needs from quick start to research-grade benchmark evaluation. **You only need to choose and read ONE of the following documents to complete your evaluation** (these are different entry points; you do not need to learn all of them).
+
+## How to Choose
+
+| Which user are you? | What you want | Recommended Reading |
+|---|---|---|
+| 👶 Beginner - want to get started fast | Evaluate directly via CLI (for QA data, works out of the box) | [Model Evaluation (QA Quickstart)](/en/guide/evaluation-pipeline/) |
+| 🧑‍💻 Beginner+ - simple parameter tuning, model before/after comparison | Modify pipeline script parameters (more straightforward) | [Model Evaluation (Beginner Edition)](/en/guide/97wq40d9/) |
+| 🧪 Researcher - academic, standardized benchmark metrics | Unified benchmark evaluation framework (task types + full evaluation parameters) | [Model Evaluation (Research Edition)](/en/guide/41y6wer6/) |
+
+## Document Entries
+
+- [Model Evaluation (QA Quickstart)](/en/guide/evaluation-pipeline/): CLI-based, beginner-friendly, suitable for quick evaluation on **QA-style** datasets.
+- [Model Evaluation (Beginner Edition)](/en/guide/97wq40d9/): pipeline-code based, for beginner/intermediate users who want to adjust evaluation settings by **editing script parameters**.
+- [Model Evaluation (Research Edition)](/en/guide/41y6wer6/): research-grade evaluation, for users who need to pass full evaluation parameters to evaluate specific benchmarks.
diff --git a/docs/en/notes/guide/model_evaluation/unified_eval.md b/docs/en/notes/guide/model_evaluation/unified_eval.md
new file mode 100644
index 0000000000..3ad534df86
--- /dev/null
+++ b/docs/en/notes/guide/model_evaluation/unified_eval.md
@@ -0,0 +1,274 @@
+---
+title: Model Evaluation (Research Edition)
+icon: hugeicons:chart-evaluation
+createTime: 2026/03/04 16:41:11
+permalink: /en/guide/41y6wer6/
+---
+
+# Model Evaluation (Unified Bench Eval)
+
+DataFlow-Eval (Models) is DataFlow’s in-house model evaluation framework. It abstracts common benchmark evaluation paradigms into a set of mutually exclusive evaluation types (`eval_type`) and provides ready-to-run pipeline scripts, enabling users to evaluate with minimal configuration, write per-sample results back to the dataframe, and export aggregated statistics.
+
+Unified evaluation scripts directory: `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval`
+
+>*If you start DataFlow via the `init` workflow, you can directly locate the `gpu_pipelines` directory.*
+
+## ✅ Workflow
+
+To evaluate a specific model, follow these steps:
+
+1. **Choose the evaluation type**: select `eval_type` based on your dataset schema (enable semantic judging when needed)
+2. **Pick the pipeline script**: open the corresponding pipeline file under the scripts directory
+3. **Edit evaluation parameters**: configure data path, cache directory, model serving, and field mapping
+4. **Run and inspect results**: run the pipeline, then inspect per-sample result columns and aggregated statistics
+
+## 🧰 Environment Setup
+
+Before running the evaluation pipelines locally, install DataFlow:
+
+```bash
+cd DataFlow
+pip install -e .
+```
+
+If you use local model serving (e.g., vLLM), make sure your environment has the required GPU/driver and dependencies.
+
+
+## 🧩 Evaluation Types
+
+`eval_type` defines the required fields (keys) per sample, as well as the default metric/logic.
+
+**Field conventions:**
+- Keys do not include the prompt string itself; they only include variables that will be injected into the prompt (e.g., `question` / `choices` / `context`).
+- `context` is an optional field across all types: if present it will be used; otherwise it is treated as `None` (so you do not need separate benches for “with/without context”).
+
+### Type Overview
+
+| eval_type | Paradigm | Required keys | Default metric/logic | Example benches | Script |
+|---|---|---|---|---|---|
+| `key1_text_score` | Text scoring | `text` | `ppl` | WikiText / PTB | `unified_bench_eval_type1.py` |
+| `key2_qa` | Generative: single reference | `question`<br>`target` | `math_verify` (optional semantic judge) | GSM8K / MATH | `unified_bench_eval_type2.py` |
+| `key2_q_ma` | Generative: multiple references | `question`<br>`targets[]` | `any_math_verify` | SQuAD (multiple golds) | `unified_bench_eval_type3.py` |
+| `key3_q_choices_a` | Multiple choice: single correct | `question`<br>`choices[]`<br>`label` | `ll_choice_acc` (choice loglikelihood) | PIQA / ARC / MMLU | `unified_bench_eval_type4.py` |
+| `key3_q_choices_as` | Multiple choice: multiple correct | `question`<br>`choices[]`<br>`labels[]` | `micro_f1` | Multi-select / multi-label | `unified_bench_eval_type5.py` |
+| `key3_q_a_rejected` | Preference: pairwise comparison | `question`<br>`better`<br>`rejected` | `pairwise_ll_winrate` | DPO / preference data | `unified_bench_eval_type6.py` |
+
+### Semantic Judging Toggle (key2_qa only)
+
+Semantic judging is not a standalone type; it is a toggle for `key2_qa`:
+
+- `use_semantic_judge=False`: default `math_verify` (best for verifiable answers)
+- `use_semantic_judge=True`: LLM-based `semantic_judge` (best for open-ended answers)
+
+Reference script (in the same directory): `unified_bench_eval_type_semantic.py`
+
+## 📦 Data Preparation
+
+Unified evaluation supports `jsonl` or `json` as input formats by default. You may keep any additional fields (e.g., `id`, `eval_type`). The evaluation only depends on the column names you explicitly set in the pipeline.
+
+## 🧱 Type Details and Examples (Collapsible)
+
+<details>
+<summary>Type1: key1_text_score (Text scoring / PPL)</summary>
+
+**Required keys:**
+- `text`
+
+**Example data (jsonl):**
+
+```json
+{"id":"t_0001","text":"The capital of France is Paris."}
+{"id":"t_0002","text":"Perplexity is a common metric for language modeling."}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py`
+
+**Notes:**
+- This type does not require answer generation; it computes `ppl` directly from `text`.
+
+</details>
+
+<details>
+<summary>Type2: key2_qa (Generative: single reference)</summary>
+
+**Required keys:**
+- `question`
+- `target` (the column name can be customized; map it via `input_target_key` in the pipeline)
+
+**Example data (jsonl):**
+
+```json
+{"id":"qa_0001","question":"Solve for x: 2x + 3 = 11.","target":"x = 4","context":null}
+{"id":"qa_0002","question":"What is the capital of France?","target":"Paris","context":"Answer in one word."}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py`
+
+**Notes:**
+- The default evaluation logic is `math_verify` (strict and verifiable).
+- For semantic judging, set `use_semantic_judge=True` and refer to `.../unified_bench_eval_type_semantic.py`.
+
+</details>
+
+<details>
+<summary>Type3: key2_q_ma (Generative: multiple references)</summary>
+
+**Required keys:**
+- `question`
+- `targets` (list; JSON-stringified list is also supported)
+
+**Example data (jsonl):**
+
+```json
+{"id":"ma_0001","question":"What is the chemical formula for water?","targets":["H2O","h2o"],"context":"Use chemical symbols."}
+{"id":"ma_0002","question":"Who created Python?","targets":["Guido van Rossum","Guido"],"context":null}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py`
+
+**Notes:**
+- Default logic is `any_math_verify`: any match among references counts as correct.
+
+</details>
+
+<details>
+<summary>Type4: key3_q_choices_a (Multiple choice: single correct)</summary>
+
+**Required keys:**
+- `question`
+- `choices` (list)
+- `label` (0-based index)
+
+**Example data (jsonl):**
+
+```json
+{"id":"mc_0001","question":"What is the capital of France?","choices":["Paris","London","Berlin","Rome"],"label":0,"context":null}
+{"id":"mc_0002","question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1,"context":"Choose exactly one option."}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py`
+
+**Notes:**
+- Default metric is `ll_choice_acc`: compute loglikelihood for each choice, take argmax, compare with `label`.
+- This type usually does not need `generated_ans` (the pipeline typically skips generation by default).
+
+</details>
+
+<details>
+<summary>Type5: key3_q_choices_as (Multiple choice: multiple correct)</summary>
+
+**Required keys:**
+- `question`
+- `choices` (list)
+- `labels` (0-based index list)
+
+**Example data (jsonl):**
+
+```json
+{"id":"ms_0001","question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2],"context":null}
+{"id":"ms_0002","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2],"context":"Select all that apply."}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py`
+
+**Notes:**
+- Default metric is `micro_f1` (parse the model output as a multi-select set and compute micro-F1).
+- This type requires generating `generated_ans` by default (or provide your own prediction column and point `input_pred_key` to it).
+
+</details>
+
+<details>
+<summary>Type6: key3_q_a_rejected (Preference: pairwise comparison)</summary>
+
+**Required keys:**
+- `question`
+- `better`
+- `rejected`
+
+**Example data (jsonl):**
+
+```json
+{"id":"pw_0001","question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, and performs poorly on unseen data.","rejected":"Overfitting means the model is always perfect.","context":null}
+{"id":"pw_0002","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x.","context":"Answer with a direct statement."}
+```
+
+**Pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py`
+
+**Notes:**
+- Default metric is `pairwise_ll_winrate`: compare the loglikelihood of `P(better|prompt)` vs `P(rejected|prompt)` and compute win rate.
+- This type does not require answer generation (the pipeline typically skips generation by default).
+
+</details>
+
+## ⚙️ Parameter Configuration (What to Edit)
+
+Unified Bench Eval pipeline files already implement the evaluation flow. Most users only need to edit two parts of the pipeline code:
+
+1. **Data and cache (FileStorage)**
+2. **Model serving (Serving: local model / API judge)**
+
+Below is a minimal “what to change” example (replace paths and model parameters according to the comments):
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request
+
+# 1) Data & cache (FileStorage): set your evaluation data path and cache directory here
+storage = FileStorage(
+ first_entry_file_name="path/to/your_eval_data.jsonl", # TODO: your jsonl/json dataset file
+ cache_path="./cache_local", # TODO: cache directory (stores intermediate & final results)
+ file_name_prefix="your_bench_name", # TODO: cache/result prefix (to distinguish runs)
+ cache_type="jsonl", # TODO: match your input type (jsonl/json)
+)
+
+# 2) Model serving: choose local serving or API based on your needs
+# 2.1 Local model serving (commonly used for generation, PPL/LL, etc.)
+llm_serving_local = LocalModelLLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # TODO: replace with your local path or HF model id
+ vllm_tensor_parallel_size=1, # TODO: tensor parallel config (multi-GPU)
+ vllm_max_tokens=2048, # TODO: max generation tokens
+)
+
+# 2.2 API judge serving (only needed when key2_qa uses semantic judging: use_semantic_judge=True)
+llm_serving_judge = APILLMServing_request(
+ api_url="https://api.openai.com/v1/chat/completions", # TODO: replace with your API endpoint
+ model_name="gpt-4o", # TODO: replace with your judge model
+ max_workers=5, # TODO: concurrency
+)
+
+# Then, in the target pipeline, replace FileStorage and Serving objects with the configs above.
+# For example:
+# self.storage = storage
+# self.llm_serving_generator = llm_serving_local
+# self.llm_serving_judger = llm_serving_judge
+```
+
+## ▶️ Run Evaluation
+
+```bash
+python unified_bench_eval_type2.py
+```
+
+## 📊 Outputs and Field Meanings
+
+After evaluation, results are written back to the cached dataframe with the following columns (column names can be customized via evaluator parameters):
+
+- `eval_score`: numeric score (0/1 for accuracy-style tasks; float for PPL)
+- `eval_pred`: parsed prediction information (e.g., choice parsing, loglikelihood info)
+- `eval_valid`: whether the sample evaluation is valid
+- `eval_error`: error message (e.g., `ll_unavailable` / `ppl_unavailable`)
+
+Additionally, aggregated statistics are saved to `eval_result_path` (configured in the pipeline scripts) for quick overall inspection.
+
+## 🔎 Reference Implementation
+
+For deeper customization (prompts, metrics, parsing logic), refer to the operator implementations:
+
+- `dataflow/operators/core_text/generate/bench_answer_generator.py`
+- `dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py`
diff --git a/docs/zh/notes/guide/model_evaluation/command_eval.md b/docs/zh/notes/guide/model_evaluation/command_eval.md
index 32485167b1..4ce198f823 100644
--- a/docs/zh/notes/guide/model_evaluation/command_eval.md
+++ b/docs/zh/notes/guide/model_evaluation/command_eval.md
@@ -1,5 +1,5 @@
---
-title: EvalPipeline
+title: 模型评估(小白QA快速版)
createTime: 2025/10/20 11:30:42
icon: hugeicons:chart-evaluation
permalink: /zh/guide/cqro9oa8/
diff --git a/docs/zh/notes/guide/model_evaluation/easy_evaluation.md b/docs/zh/notes/guide/model_evaluation/easy_evaluation.md
index d452c5281c..545f29c43f 100644
--- a/docs/zh/notes/guide/model_evaluation/easy_evaluation.md
+++ b/docs/zh/notes/guide/model_evaluation/easy_evaluation.md
@@ -1,5 +1,5 @@
---
-title: 模型评估流水线
+title: 模型评估(小白简易版)
icon: hugeicons:chart-evaluation
createTime: 2025/10/17 15:00:50
permalink: /zh/guide/enty5ksn/
diff --git a/docs/zh/notes/guide/model_evaluation/overview_info.md b/docs/zh/notes/guide/model_evaluation/overview_info.md
new file mode 100644
index 0000000000..7251e21669
--- /dev/null
+++ b/docs/zh/notes/guide/model_evaluation/overview_info.md
@@ -0,0 +1,24 @@
+---
+title: 模型评估概述
+icon: solar:flag-2-broken
+createTime: 2026/03/04 17:33:23
+permalink: /zh/guide/0zegorzv/
+---
+
+# 模型评估概述
+
+DataFlow 提供三种“从易到难”的模型评估方式,覆盖从快速上手到科研级 bench 评测的不同需求。**你只需要选择并阅读其中一种文档即可完成评估**(三者是不同入口,不需要全部学习)。
+
+## 如何选择
+
+| 你更像哪类用户 | 你希望怎么用 | 推荐阅读 |
+|---|---|---|
+| 👶 新手 - 只想快速跑通 | 命令行直接评估(适合 QA 数据,开箱即用) | [模型评估(小白QA快速版)](/zh/guide/cqro9oa8/) |
+| 🧑‍💻 新手进阶 - 简单调整参数,只进行模型的前后对比 | 通过 pipeline 脚本修改函数传参(更直观) | [模型评估(小白简易版)](/zh/guide/enty5ksn/) |
+| 🧪 科研工作者 - bench 采用学术通用评测指标 | Benchmark 统一评测框架(划分任务类型,传递完整评测参数) | [模型评估(科研完整版)](/zh/guide/41y6wer6/) |
+
+## 文档入口说明
+
+- [模型评估(小白QA快速版)](/zh/guide/cqro9oa8/):命令行方式,面向小白,适合对 **QA 类型数据**做快速评测。
+- [模型评估(小白简易版)](/zh/guide/enty5ksn/):pipeline 代码方式,面向小白/进阶用户,通过 **修改脚本参数** 调整评测设置。
+- [模型评估(科研完整版)](/zh/guide/41y6wer6/):科研完整版评估,面向需要传递完整评测参数、评测指定 Benchmark 的用户。
diff --git a/docs/zh/notes/guide/model_evaluation/unified_eval.md b/docs/zh/notes/guide/model_evaluation/unified_eval.md
new file mode 100644
index 0000000000..65ee0c0ad9
--- /dev/null
+++ b/docs/zh/notes/guide/model_evaluation/unified_eval.md
@@ -0,0 +1,274 @@
+---
+title: 模型评估(科研完整版)
+icon: hugeicons:chart-evaluation
+createTime: 2026/03/04 16:41:11
+permalink: /zh/guide/41y6wer6/
+---
+
+# 模型评估(Unified Bench Eval)
+
+DataFlow-Eval (Models) 是 DataFlow 自研的模型评估框架:它将常见 benchmark 的评测范式抽象为若干互斥的评测类型(`eval_type`),并提供开箱即用的评测流水线脚本,帮助用户以最少配置完成评测、写回逐样本结果并输出汇总统计。
+
+统一评测脚本目录:`dataflow/statics/pipelines/gpu_pipelines/benchmark_eval`
+
+>*若通过 init 方式启动 DataFlow,直接找到 `gpu_pipelines` 目录即可。*
+
+## ✅ 使用流程
+
+对指定模型进行评测时,按以下顺序操作:
+
+1. **确定评测类型**:根据数据结构选择 `eval_type`(必要时开启语义评测开关)
+2. **选择评测流水线**:进入脚本目录,找到对应 type 的 pipeline 文件
+3. **填写评测参数**:配置数据路径、缓存目录、模型 serving、字段名映射
+4. **运行并查看结果**:运行 pipeline,查看逐样本结果列与汇总统计文件
+
+## 🧰 环境准备
+
+在本地运行评测流水线前,安装 DataFlow:
+
+```bash
+cd DataFlow
+pip install -e .
+```
+
+如使用本地模型 serving(例如 vLLM),请确保运行环境具备对应 GPU/驱动与依赖。
+
+
+## 🧩 评测类型
+
+`eval_type` 定义了每条样本必须包含的字段(keys),以及默认的评测指标/逻辑。
+
+**字段约定:**
+- keys 不包含 prompt 本身的字符串,仅包含需要嵌入 prompt 的变量字段(如 `question` / `choices` / `context`)。
+- `context` 为统一可选字段:存在即使用;不传或不存在即视为 `None`(无需再拆分“有/无上下文”的 bench)。
+
+### 类型总览
+
+| eval_type | 类型范式 | 必要 keys | 默认 metric/逻辑 | 示例 Bench | 脚本文件 |
+|---|---|---|---|---|---|
+| `key1_text_score` | 文本打分 | `text` | `ppl` | WikiText / PTB | `unified_bench_eval_type1.py` |
+| `key2_qa` | 生成式:单参考答案 | `question`<br>`target` | `math_verify`(可选语义评测) | GSM8K / MATH | `unified_bench_eval_type2.py` |
+| `key2_q_ma` | 生成式:多参考答案 | `question`<br>`targets[]` | `any_math_verify` | SQuAD(多 gold) | `unified_bench_eval_type3.py` |
+| `key3_q_choices_a` | 选择题:单正确 | `question`<br>`choices[]`<br>`label` | `ll_choice_acc`(loglikelihood 选项打分) | PIQA / ARC / MMLU | `unified_bench_eval_type4.py` |
+| `key3_q_choices_as` | 选择题:多正确 | `question`<br>`choices[]`<br>`labels[]` | `micro_f1` | 多选题 / 多标签 | `unified_bench_eval_type5.py` |
+| `key3_q_a_rejected` | 偏好/排序:成对比较 | `question`<br>`better`<br>`rejected` | `pairwise_ll_winrate` | DPO/偏好数据 | `unified_bench_eval_type6.py` |
+
+### 语义评测开关(仅适用于 key2_qa)
+
+语义评测不是独立类型,而是 `key2_qa` 的评测开关:
+
+- `use_semantic_judge=False`:默认 `math_verify`(适合可验证答案)
+- `use_semantic_judge=True`:使用 LLM judge 的 `semantic_judge`(适合开放式答案)
+
+对应示例脚本(同目录下):`unified_bench_eval_type_semantic.py`
+
+## 📦 数据准备
+
+统一评测默认支持 `jsonl` 或 `json` 作为输入格式。你可以保留任意额外字段(如 `id`、`eval_type`),评测只依赖你在 pipeline 中显式填写的列名。
+
+## 🧱 各类型说明与示例(可折叠)
+
+<details>
+<summary>Type1:key1_text_score(文本打分 / PPL)</summary>
+
+**必要 keys:**
+- `text`
+
+**数据示例(jsonl):**
+
+```json
+{"id":"t_0001","text":"The capital of France is Paris."}
+{"id":"t_0002","text":"Perplexity is a common metric for language modeling."}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type1.py`
+
+**说明:**
+- 该类型不需要生成答案,评测直接对 `text` 计算 `ppl`。
+
+</details>
+
+<details>
+<summary>Type2:key2_qa(生成式:单参考答案)</summary>
+
+**必要 keys:**
+- `question`
+- `target`(列名可自定义,只要在 pipeline 中用 `input_target_key` 映射即可)
+
+**数据示例(jsonl):**
+
+```json
+{"id":"qa_0001","question":"Solve for x: 2x + 3 = 11.","target":"x = 4","context":null}
+{"id":"qa_0002","question":"What is the capital of France?","target":"Paris","context":"Answer in one word."}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type2.py`
+
+**说明:**
+- 默认评测逻辑为 `math_verify`(偏严格、可验证)。
+- 如需语义评测,使用 `use_semantic_judge=True`,参考脚本 `.../unified_bench_eval_type_semantic.py`。
+
+</details>
+
+<details>
+<summary>Type3:key2_q_ma(生成式:多参考答案)</summary>
+
+**必要 keys:**
+- `question`
+- `targets`(list;也可为 JSON 字符串形式的 list)
+
+**数据示例(jsonl):**
+
+```json
+{"id":"ma_0001","question":"What is the chemical formula for water?","targets":["H2O","h2o"],"context":"Use chemical symbols."}
+{"id":"ma_0002","question":"Who created Python?","targets":["Guido van Rossum","Guido"],"context":null}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type3.py`
+
+**说明:**
+- 默认评测逻辑为 `any_math_verify`:多参考答案中任意一个命中即视为正确。
+
+</details>
+
+<details>
+<summary>Type4:key3_q_choices_a(选择题:单正确)</summary>
+
+**必要 keys:**
+- `question`
+- `choices`(list)
+- `label`(0-based 下标)
+
+**数据示例(jsonl):**
+
+```json
+{"id":"mc_0001","question":"What is the capital of France?","choices":["Paris","London","Berlin","Rome"],"label":0,"context":null}
+{"id":"mc_0002","question":"In Python, what does len([1, 2, 3]) return?","choices":["2","3","4","An error"],"label":1,"context":"Choose exactly one option."}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type4.py`
+
+**说明:**
+- 默认使用 `ll_choice_acc`:对每个 choice 计算 loglikelihood,取 argmax 与 `label` 比较。
+- 该类型通常不需要生成 `generated_ans`(pipeline 中会默认跳过生成步骤)。
+
+</details>
+
+<details>
+<summary>Type5:key3_q_choices_as(选择题:多正确)</summary>
+
+**必要 keys:**
+- `question`
+- `choices`(list)
+- `labels`(0-based 下标列表)
+
+**数据示例(jsonl):**
+
+```json
+{"id":"ms_0001","question":"Which of the following are prime numbers?","choices":["2","9","11","15"],"labels":[0,2],"context":null}
+{"id":"ms_0002","question":"Which of the following are HTTP methods?","choices":["GET","FETCH","POST","PUSH"],"labels":[0,2],"context":"Select all that apply."}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type5.py`
+
+**说明:**
+- 默认 metric 为 `micro_f1`(将模型输出解析为多选集合后计算)。
+- 该类型默认需要生成 `generated_ans`(也可直接提供预测列并将 `input_pred_key` 指向该列)。
+
+</details>
+
+<details>
+<summary>Type6:key3_q_a_rejected(偏好/排序:成对比较)</summary>
+
+**必要 keys:**
+- `question`
+- `better`
+- `rejected`
+
+**数据示例(jsonl):**
+
+```json
+{"id":"pw_0001","question":"Explain what overfitting is in machine learning.","better":"Overfitting is when a model learns the training data too closely, including noise, and performs poorly on unseen data.","rejected":"Overfitting means the model is always perfect.","context":null}
+{"id":"pw_0002","question":"What is the derivative of x^2?","better":"The derivative of x^2 with respect to x is 2x.","rejected":"The derivative of x^2 is x.","context":"Answer with a direct statement."}
+```
+
+**对应 pipeline:**
+- `dataflow/statics/pipelines/gpu_pipelines/benchmark_eval/unified_bench_eval_type6.py`
+
+**说明:**
+- 默认 metric 为 `pairwise_ll_winrate`:比较 `P(better|prompt)` 与 `P(rejected|prompt)` 的 loglikelihood,统计 win rate。
+- 该类型不需要生成答案(pipeline 中会默认跳过生成步骤)。
+
+</details>
+
+## ⚙️ 参数配置(参考写法)
+
+Unified Bench Eval 的 pipeline 文件已经写好评测流程。对大多数用户来说,只需要在流水线代码里修改两处配置即可:
+
+1. **数据与缓存(FileStorage)**
+2. **模型服务(Serving:本地模型 / API judge)**
+
+下面给出一个最小“需要改哪里”的示例(按注释替换为你的路径与模型参数即可):
+
+```python
+from dataflow.utils.storage import FileStorage
+from dataflow.serving import LocalModelLLMServing_vllm, APILLMServing_request
+
+# 1) 数据与缓存(FileStorage):把你的评测数据路径与缓存目录填在这里
+storage = FileStorage(
+ first_entry_file_name="path/to/your_eval_data.jsonl", # TODO: 你的 jsonl/json 数据文件
+ cache_path="./cache_local", # TODO: 缓存目录(会写入中间结果与评测结果)
+ file_name_prefix="your_bench_name", # TODO: 结果/缓存前缀(用于区分不同评测)
+ cache_type="jsonl", # TODO: 与输入文件一致(jsonl/json)
+)
+
+# 2) 模型服务(Serving):按需求选择本地模型或 API
+# 2.1 本地模型 serving(常用于生成、PPL/LL 等)
+llm_serving_local = LocalModelLLMServing_vllm(
+ hf_model_name_or_path="Qwen/Qwen2.5-7B-Instruct", # TODO: 替换为你的模型路径或 HF 名称
+ vllm_tensor_parallel_size=1, # TODO: 多卡并行配置
+ vllm_max_tokens=2048, # TODO: 生成长度上限
+)
+
+# 2.2 API judge serving(仅在 key2_qa 语义评测 use_semantic_judge=True 时需要)
+llm_serving_judge = APILLMServing_request(
+ api_url="https://api.openai.com/v1/chat/completions", # TODO: 替换为你的 API 地址
+ model_name="gpt-4o", # TODO: 替换为你的 judge 模型
+ max_workers=5, # TODO: 并发数
+)
+
+# 之后在对应 pipeline 里,把 FileStorage 与 Serving 对象替换为上面的配置即可
+# 例如:
+# self.storage = storage
+# self.llm_serving_generator = llm_serving_local
+# self.llm_serving_judger = llm_serving_judge
+```
+
+## ▶️ 运行评测
+
+```bash
+python unified_bench_eval_type2.py
+```
+
+## 📊 结果产物与字段含义
+
+评测结束后,结果会写回缓存 DataFrame,并包含以下列(可在 evaluator 参数中自定义列名):
+
+- `eval_score`:数值评分(accuracy 类任务为 0/1;PPL 为浮点数)
+- `eval_pred`:解析后的预测信息(如选项解析、loglikelihood 等)
+- `eval_valid`:该样本评测是否有效
+- `eval_error`:错误信息(例如 `ll_unavailable` / `ppl_unavailable`)
+
+此外,汇总统计会写入 `eval_result_path`(由各 pipeline 脚本配置),用于快速查看整体指标。
+
+## 🔎 参考实现
+
+如需深度自定义(prompt、metric、解析逻辑等),建议阅读算子实现:
+
+- `dataflow/operators/core_text/generate/bench_answer_generator.py`
+- `dataflow/operators/core_text/eval/unified_bench_dataset_evaluator.py`