diff --git a/docs/.vuepress/notes/en/mm_guide.ts b/docs/.vuepress/notes/en/mm_guide.ts index 527690ec..8a5469cc 100644 --- a/docs/.vuepress/notes/en/mm_guide.ts +++ b/docs/.vuepress/notes/en/mm_guide.ts @@ -30,6 +30,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'image_gcot', 'vision_mct_reasoning_pipeline', 'image_region_caption_pipeline', + 'image_region_caption_pipeline_api', 'image_scale_caption_pipeline', 'image_visual_only_mcq_pipeline', ], @@ -45,8 +46,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'video_clip_and_filter', 'video_qa', 'video_cotqa', - 'video_longvideo_cotqa_api', - 'multirole_videoqa_pipeline' + 'video_longvideo_cotqa_api' ], }, { diff --git a/docs/.vuepress/notes/en/mm_operators.ts b/docs/.vuepress/notes/en/mm_operators.ts index 1c7faadd..98f53d6a 100644 --- a/docs/.vuepress/notes/en/mm_operators.ts +++ b/docs/.vuepress/notes/en/mm_operators.ts @@ -26,8 +26,8 @@ export const MMOperators: ThemeNote = defineNoteConfig({ collapsed: false, prefix: 'generate/', items: [ - 'image_caption', - 'image_qa', + // 'image_caption', + // 'image_qa', 'image_pers_qa', 'multimodal_math', "prompt_templated_vqa_generator", @@ -41,8 +41,7 @@ export const MMOperators: ThemeNote = defineNoteConfig({ // 'image_region_caption', // 'image_scale_caption', // 'image_gcot', - // 'image_caprl', - // 'multirole_videoqa', + // 'image_caprl' ] }, { diff --git a/docs/.vuepress/notes/zh/mm_guide.ts b/docs/.vuepress/notes/zh/mm_guide.ts index d0e7f2c5..21bece4d 100644 --- a/docs/.vuepress/notes/zh/mm_guide.ts +++ b/docs/.vuepress/notes/zh/mm_guide.ts @@ -30,6 +30,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'image_gcot', 'vision_mct_reasoning_pipeline', 'image_region_caption_pipeline', + 'image_region_caption_pipeline_api', 'image_scale_caption_pipeline', 'image_visual_only_mcq_pipeline', ], @@ -45,8 +46,7 @@ export const MMGuide: ThemeNote = defineNoteConfig({ 'video_clip_and_filter', 'video_qa', 'video_cotqa', - 
'video_longvideo_cotqa_api', - 'multirole_videoqa_pipeline' + 'video_longvideo_cotqa_api' ], }, { diff --git a/docs/.vuepress/notes/zh/mm_operators.ts b/docs/.vuepress/notes/zh/mm_operators.ts index d8f05499..8451b255 100644 --- a/docs/.vuepress/notes/zh/mm_operators.ts +++ b/docs/.vuepress/notes/zh/mm_operators.ts @@ -27,8 +27,8 @@ export const MMOperators: ThemeNote = defineNoteConfig({ collapsed: false, prefix: 'generate/', items: [ - 'image_caption', - 'image_qa', + // 'image_caption', + // 'image_qa', 'image_pers_qa', 'multimodal_math', 'prompt_templated_vqa_generator', @@ -42,8 +42,7 @@ export const MMOperators: ThemeNote = defineNoteConfig({ // 'image_region_caption', // 'image_scale_caption', // 'image_gcot', - // 'image_caprl', - // 'multirole_videoqa', + // 'image_caprl' ] }, { diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa.md b/docs/en/notes/mm_guide/image_understanding/context_vqa.md index 52ef9148..52bbe3b3 100644 --- a/docs/en/notes/mm_guide/image_understanding/context_vqa.md +++ b/docs/en/notes/mm_guide/image_understanding/context_vqa.md @@ -7,27 +7,25 @@ permalink: /en/mm_guide/contextvqa_pipeline/ ## 1. Overview -The **ContextVQA Multimodal QA Data Generation Pipeline** is designed to automatically generate **Visual Question Answering (VQA) data equipped with external knowledge contexts** starting from images. This pipeline utilizes Vision-Language Models (VLM) to generate Wikipedia-style articles related to the image and corresponding QA pairs, which are then parsed into structured data. - - +The **ContextVQA Multimodal QA Data Generation Pipeline** is designed to automatically generate **visual question answering (Context-based VQA) data with external knowledge contexts** starting from images. This pipeline utilizes Vision-Language Models (VLM) to generate Wikipedia-style articles related to the images and corresponding QA pairs, which are then parsed into structured data. 
We support the following application scenarios: * **Knowledge-based VQA Data Synthesis**: Building QA datasets that require external knowledge reasoning. * **Multimodal RAG Data Construction**: Generating high-quality data for training Retrieval-Augmented Generation (RAG) systems. -* **Visual Reasoning Training**: Generating data where the question points to the image, but the answer must be reasoned from the accompanying text context. +* **Visual Reasoning Training**: Generating questions that point to the image, but require answers reasoned from the textual context. -The main stages of the pipeline include: +The main flow includes: -1. **Data Loading**: Reading data files containing image paths. -2. **Context and QA Generation**: Using a VLM to generate a Wikipedia-style article and raw QA pairs based on the image. -3. **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format. +1. **Data Loading**: Reading data files containing image paths. +2. **Context and QA Generation**: Utilizing a locally deployed VLM to generate Wikipedia-style articles and raw QA pairs based on the image. +3. **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format. --- ## 2. 
Quick Start -### Step 1: Create a New DataFlow Working Directory +### Step 1: Create a New DataFlow Work Folder ```bash mkdir run_dataflow_mm @@ -42,34 +40,45 @@ dataflow init ``` -After initialization, you will see the generated file structure, including: +You will now see: ```bash gpu_pipelines/context_vqa.py ``` -### Step 3: Configure Model and Data Paths +### Step 3: Download Example Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data + +``` + +### Step 4: Configure Model and Data Paths -Modify the VLM model path and dataset location in `context_vqa.py`: +Modify the class initialization parameters directly in `context_vqa.py` (no longer passed via command line arguments): ```python -parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") # Update to your local model path -parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") -parser.add_argument("--download_dir", default="./ckpt") -parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - -# Update the data path below. -# We provide example data at: run_dataflow_mm/example_data/image_to_text_pipeline/capsbench_captions.json -# Note: You can download the actual images using the "source" URLs provided within the JSON file. 
-parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.json") -parser.add_argument("--cache_path", default="./cache_local") -parser.add_argument("--file_name_prefix", default="context_vqa") -parser.add_argument("--cache_type", default="json") +# Model Serving Configuration +self.serving = LocalModelVLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + hf_local_dir="./ckpt", + vllm_tensor_parallel_size=1, + vllm_max_tokens=512, +) + +# Data Storage Configuration +self.storage = FileStorage( + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", +) ``` -### Step 4: Launch the Pipeline +### Step 5: One-Click Run ```bash python gpu_pipelines/context_vqa.py @@ -82,106 +91,74 @@ python gpu_pipelines/context_vqa.py ### 1. **Input Data** -The input data for this process primarily contains the following fields: - -* **image**: Path to the image file (local path or URL). -* **id** (optional): Unique identifier for the data. - -Data is managed via `FileStorage`, which supports breakpoint resumption (checkpointing). +Input data is managed through `FileStorage`, supporting breakpoint resumption. -**Input Data Example**: +**Input Data Example (`sample_data.json`)**: -```jsonl -{"id": 1, "image": "./images/landmark.jpg"} -{"id": 2, "image": "./images/animal.jpg"} +```json +[ + { + "image": ["./example_data/image_contextvqa/person.png"], + "conversation": [ + { + "from": "human", + "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs..." + } + ] + } +] ``` -Example images can be found at `https://huggingface.co/datasets/OpenDCAI/dataflow-demo-image/tree/main/capsbench_images`. 
Additionally, we have synthesized 200k high-quality context VQA data records for the community to experience at `https://huggingface.co/datasets/OpenDCAI/dataflow-mm-context_vqa`. - ### 2. **Core Operator Logic** -The pipeline completes its task by concatenating two core operators: - -#### A. **FixPromptedVQAGenerator (Context Generation)** - -This operator uses the VLM model to generate raw text according to a preset prompt template. - -**Functions:** - -* Generates a Wikipedia-style science article based on the image. -* Generates QA pairs based on the article. -* **Prompt Constraints**: The question points to the image but avoids direct mention of object names; answers must come from the article content and not be objects in the image; answers should be concise. +#### A. **PromptedVQAGenerator (Context Generation)** -**Model Serving Configuration**: - -```python -self.serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, # Maintain a level of creativity - vllm_top_p=0.9, - vllm_max_tokens=512, -) - -``` +This operator calls the local VLM model to generate raw text based on built-in Wikipedia-style prompt templates. **Operator Execution**: ```python self.vqa_generator.run( storage=self.storage.step(), - input_image_key="image", - output_answer_key="vqa" # Outputs the raw generated text + input_conversation_key="conversation", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) ``` -#### B. **WikiQARefiner (Result Refinement)** - -This operator is responsible for cleaning the unstructured text generated by the VLM and converting it into a standard format. - -**Functions:** +#### B. **WikiQARefiner (Result Parsing)** -* Cleans Markdown formatting and redundant white space. -* Separates article content (Context) from QA pairs (QAs). 
+This operator cleans the unstructured text generated by the VLM and converts it into a standard format, separating the article content (Context) from the question-answer pairs (QAs). **Operator Execution**: ```python self.refiner.run( storage=self.storage.step(), - input_key="vqa", # Inputs raw text from the previous step - output_key="context_vqa" # Outputs final structured data + input_key="vqa", # Raw text from the previous step + output_key="context_vqa" # Final structured data ) ``` ### 3. **Output Data** -Ultimately, the output generated by the pipeline will include: - -* **image**: Original image path. -* **vqa**: Raw text generated by the VLM (intermediate result). -* **context_vqa**: Final structured result containing `context` (article) and `qas` (QA list). +The final structured data includes `context` (article) and `qas` (list of questions and answers). **Output Data Example**: ```json { "id": 1, - "image": "./images/landmark.jpg", + "image": ["./example_data/image_contextvqa/person.png"], "context_vqa": { - "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...", + "context": "Nightmare Alley is a 2021 American psychological thriller film...", "qas": [ { - "question": "In which city is this structure located?", - "answer": "Paris" - }, - { - "question": "What material is the tower primarily constructed from?", - "answer": "wrought-iron" + "question": "What genre does this film belong to?", + "answer": "Psychological thriller" } ] } @@ -193,45 +170,35 @@ Ultimately, the output generated by the pipeline will include: ## 4. Pipeline Example -Below is the complete implementation of `ContextVQAPipeline`, supporting command-line arguments. +Below is the complete `ContextVQAPipeline` code implementation. 
```python import argparse from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import FixPromptedVQAGenerator -from dataflow.operators.core_vision import WikiQARefiner +from dataflow.operators.core_vision import PromptedVQAGenerator, WikiQARefiner + class ContextVQAPipeline: """ - Complete batch ContextVQA Caption generation for images with a single command. + Batch generate ContextVQA data for images with a single command. """ - def __init__( - self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt", - device: str = "cuda", - first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", - cache_path: str = "./cache_local_skvqa", - file_name_prefix: str = "skvqa_cache_step", - cache_type: str = "jsonl", - ): + def __init__(self, llm_serving: LLMServingABC = None): # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", ) # ---------- 2. Serving ---------- - self.serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, + self.vlm_serving = LocalModelVLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + hf_local_dir="./ckpt", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -239,72 +206,35 @@ class ContextVQAPipeline: ) # ---------- 3. 
Operator ---------- - # Generate Wiki-style articles and QA using a specific Prompt - self.vqa_generator = FixPromptedVQAGenerator( - serving=self.serving, - system_prompt="You are a helpful assistant.", - user_prompt= """ - Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria. - 1: The question should refer to the image. - 2: The question should avoid mentioning the name of the object in the image. - 3: The question should be answered by reasoning over the Wikipedia article. - 4: The question should sound natural and concise. - 5: The answer should be extracted from the Wikipedia article. - 6: The answer should not be any objects in the image. - 7: The answer should be a single word or phrase and list all correct answers separated by commas. - 8: The answer should not contain 'and', 'or', rather you can split them into multiple answers. - """ + self.vqa_generator = PromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt= "You are a helpful assistant." 
) - # Result cleaning and structuring self.refiner = WikiQARefiner() - + # ------------------------------------------------------------------ # def forward(self): input_image_key = "image" output_answer_key = "vqa" output_wiki_key = "context_vqa" - # Step 1: Generate raw text self.vqa_generator.run( storage=self.storage.step(), + input_conversation_key="conversation", input_image_key=input_image_key, output_answer_key=output_answer_key ) - # Step 2: Parse into structured data self.refiner.run( storage=self.storage.step(), input_key=output_answer_key, output_key=output_wiki_key ) -# ---------------------------- CLI Entry -------------------------------- # +# ---------------------------- CLI Entry ------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow") - - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt") - parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - - parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="context_vqa") - parser.add_argument("--cache_type", default="jsonl") - - args = parser.parse_args() - - pipe = ContextVQAPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = ContextVQAPipeline() pipe.forward() ``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md b/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md 
index 1e49a8e8..5fad70a6 100644 --- a/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md +++ b/docs/en/notes/mm_guide/image_understanding/context_vqa_api.md @@ -7,19 +7,19 @@ permalink: /en/mm_guide/contextvqa_api_pipeline/ ## 1. Overview -The **ContextVQA Multimodal QA Data Generation Pipeline (API Version)** is designed to automatically generate **Context-based Visual Question Answering (VQA) data** starting from images. This pipeline utilizes Vision-Language Models (VLM) via API to generate Wikipedia-style articles and QA pairs, which are then parsed into structured data. It is ideal for building knowledge-intensive VQA and multimodal RAG (Retrieval-Augmented Generation) datasets. +The **ContextVQA Multimodal QA Data Generation Pipeline (API Version)** is designed to automatically generate **visual question answering data with external knowledge context (Context-based VQA)** starting from an image. This pipeline uses a Vision-Language Model (VLM) via API to generate Wikipedia-style articles and QA pairs, which are then parsed into structured data. This is ideal for building knowledge-based VQA and multimodal RAG (Retrieval-Augmented Generation) datasets. -We support the following use cases: +We support the following application scenarios: -* **Knowledge-based VQA Data Synthesis**: Building QA datasets that require external knowledge reasoning. -* **Multimodal RAG Data Construction**: Generating high-quality data for training RAG systems. -* **Visual Reasoning Training**: Generating data where questions refer to an image, but answers must be reasoned from text context. +* **Knowledge-based VQA Data Synthesis**: Constructing QA datasets that require external knowledge reasoning. +* **Multimodal RAG Data Construction**: Generating high-quality data for training retrieval-augmented generation models. +* **Visual Reasoning Training**: Generating questions that point to an image but require answers derived from textual context reasoning. 
-The pipeline consists of three main stages: +The main flow of the pipeline includes: 1. **Data Loading**: Reading data files containing image paths. -2. **Context and QA Generation**: Using VLM APIs to generate Wikipedia-style articles and raw QA pairs based on images. -3. **Data Cleaning and Structuring**: Parsing raw text to extract structured `{context, qas}` formats. +2. **Context and QA Generation**: Using a VLM API to generate Wikipedia-style articles and raw QA pairs based on images. +3. **Data Cleaning and Structuring**: Parsing raw text to extract a structured `{context, qas}` format. --- @@ -27,15 +27,15 @@ The pipeline consists of three main stages: ### Step 1: Configure API Key -Set your API Key environment variable in your script: +Set the API Key environment variable in your script: ```python import os -os.environ["DF_API_KEY"] = "your_api_key" +os.environ["DF_API_KEY"] = "sk-xxx" ``` -### Step 2: Create a New DataFlow Working Directory +### Step 2: Create a New DataFlow Work Folder ```bash mkdir run_dataflow @@ -50,29 +50,31 @@ dataflowmm init ``` -You will then see: +You will see the following file created: ```bash api_pipelines/image_contextvqa.py ``` -### Step 4: Download Sample Data +### Step 4: Download Example Data ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` ### Step 5: Configure Parameters -Configure the API service and input data paths in `image_contextvqa.py`: +In `image_contextvqa.py`, configure the API service and input data paths (no `argparse` required, modify default paths directly in the code): ```python self.vlm_serving = APIVLMServing_openai( - api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API platform - key_name_of_api_key="DF_API_KEY", # API key set in Step 1 - model_name="qwen3-vl-8b-instruct", + 
api_url="http://172.96.141.132:3001/v1", # Any OpenAI-compatible API platform + key_name_of_api_key="DF_API_KEY", # Corresponding API key set in Step 1 + model_name="gpt-5-nano-2025-08-07", + image_io=None, + send_request_stream=False, max_workers=10, timeout=1800 ) @@ -80,14 +82,16 @@ self.vlm_serving = APIVLMServing_openai( ``` ```python -parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json") -parser.add_argument("--cache_path", default="./cache_local") -parser.add_argument("--file_name_prefix", default="context_vqa") -parser.add_argument("--cache_type", default="json") +self.storage = FileStorage( + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", +) ``` -### Step 6: Run with One Command +### Step 6: One-Click Run ```bash python api_pipelines/image_contextvqa.py @@ -100,24 +104,24 @@ python api_pipelines/image_contextvqa.py ### 1. **Input Data** -The input data for this process primarily includes the following fields: +The input data for this process mainly includes the following fields: -* **image**: Image file path (local path or URL). +* **image**: Path to the image file (local path or URL). * **id** (Optional): Unique identifier for the data. -* **conversation** (Optional): Conversation-formatted text used to guide context generation. +* **conversation** (Optional): Text in dialogue format used to supplement context generation. -Data is managed via `FileStorage`, supporting breakpoint resumption. +Data is managed through `FileStorage`, which supports breakpoint resumption. **Input Data Example**: ```json [ { - "image": ["./data/image_contextvqa/person.png"], + "image": ["./example_data/image_contextvqa/person.png"], "conversation": [ { "from": "human", - "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs..." 
+ "value": "Write a Wikipedia article related to this image without directly referring to the image..." } ] } @@ -127,17 +131,17 @@ Data is managed via `FileStorage`, supporting breakpoint resumption. ### 2. **Core Operator Logic** -The pipeline chains two core operators: +This pipeline completes the task by chaining two core operators: #### A. **PromptedVQAGenerator (Context Generation)** -This operator calls the VLM API to generate raw text based on the prompt template. +This operator is responsible for calling the VLM API to generate raw text based on a prompt template. **Features:** -* Generates a Wikipedia-style encyclopedia article based on the image. +* Generates a Wikipedia-style popular science article based on the image. * Generates QA pairs based on the article. -* **Prompt Constraints**: Questions point to the image but avoid direct object naming; answers must come from the article and not be objects in the image; answers must be concise. +* **Prompt Constraints**: Questions refer to the image but avoid mentioning object names; answers come from the article and are not objects in the image; answers are concise. **Operator Execution**: @@ -153,12 +157,12 @@ self.vqa_generator.run( #### B. **WikiQARefiner (Result Parsing)** -This operator cleans the unstructured text generated by the VLM and converts it into a standard format. +This operator cleans the raw text generated by the VLM and converts it into a standard format. **Features:** -* Cleans Markdown formatting and redundant whitespace. -* Separates article content (Context) and QA pairs (QAs). +* Cleans Markdown formatting and extra whitespace. +* Separates the article content (Context) from the QA pairs (QAs). **Operator Execution**: @@ -173,28 +177,24 @@ self.refiner.run( ### 3. **Output Data** -The final output contains: +The final output data generated by the pipeline will contain: * **image**: Original image path. -* **vqa**: Raw text generated by VLM (intermediate result). 
-* **context_vqa**: Structured final result containing `context` (article) and `qas` (QA list). +* **vqa**: Raw text generated by the VLM (intermediate result). +* **context_vqa**: Final structured result containing `context` (article) and `qas` (QA list). **Output Data Example**: ```json [ { - "image": ["./data/image_contextvqa/person.png"], + "image": ["./example_data/image_contextvqa/person.png"], "context_vqa": { - "context": "**Wikipedia Article:** Nightmare Alley is a 2021 American psychological thriller film...", + "context": "**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller...", "qas": [ { "question": "What genre does this film belong to?", "answer": "Psychological thriller" - }, - { - "question": "Who directed this film?", - "answer": "Guillermo del Toro" } ] } @@ -207,44 +207,40 @@ The final output contains: ## 4. Pipeline Example -The following is the complete `ContextVQAPipeline` implementation supporting CLI arguments. +Below is the complete `ContextVQAPipeline` implementation. ```python import os -import argparse + +# Set API Key environment variable +os.environ["DF_API_KEY"] = "sk-xxx" + from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator from dataflow.operators.core_vision import WikiQARefiner -# Set API Key environment variable -os.environ["DF_API_KEY"] = "sk-xxxx" class ContextVQAPipeline: """ - Generate batch ContextVQA captions with a single command. + Generate batch ContextVQA data for images with a single command. """ - def __init__( - self, - first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", - cache_path: str = "./cache_local_skvqa", - file_name_prefix: str = "skvqa_cache_step", - cache_type: str = "jsonl", - ): + def __init__(self, llm_serving: LLMServingABC = None): # ---------- 1. 
Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", ) # ---------- 2. Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_url="http://172.96.141.132:3001/v1", key_name_of_api_key="DF_API_KEY", - model_name="qwen3-vl-8b-instruct", + model_name="gpt-5-nano-2025-08-07", image_io=None, send_request_stream=False, max_workers=10, @@ -277,23 +273,8 @@ class ContextVQAPipeline: output_key=output_wiki_key ) -# ---------------------------- CLI Entry -------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch ContextVQA generation with DataFlow") - - parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="context_vqa") - parser.add_argument("--cache_type", default="json") - - args = parser.parse_args() - - pipe = ContextVQAPipeline( - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = ContextVQAPipeline() pipe.forward() ``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/image_understanding/image_caption_api.md b/docs/en/notes/mm_guide/image_understanding/image_caption_api.md index e2607f6b..ecb5153d 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_caption_api.md +++ b/docs/en/notes/mm_guide/image_understanding/image_caption_api.md @@ -3,17 +3,17 @@ title: Image Caption Generation Pipeline (API Version) icon: mdi:image-edit createTime: 2026/01/24 16:37:37 permalink: 
/en/mm_guide/image_caption_api_pipeline/ ---- +--- ## 1. Overview -The **Image Caption Generation Pipeline (API Version)** is designed to leverage advanced Vision-Language Models (VLM) to automatically generate high-quality, accurate, and informative text descriptions for large-scale image datasets. By calling APIs compatible with the OpenAI format, this pipeline rapidly processes images and generates structured annotation data. It is an ideal choice for building multimodal pre-training datasets, image retrieval systems, and accessibility features. +The **Image Caption Generation Pipeline (API Version)** is designed to leverage advanced Vision-Language Models (VLM) to automatically generate high-quality, accurate, and informative textual descriptions for large-scale image datasets. By calling APIs compatible with the OpenAI format, this pipeline can quickly process images and generate structured annotation data. It is an ideal choice for building multimodal pre-training datasets, image retrieval systems, and accessibility features. We support the following application scenarios: -* **Multimodal Dataset Annotation**: Batch generate precise text descriptions for massive image libraries. -* **Image Content Understanding**: Automatically extract key objects, scenes, and textual information from images. -* **Search & Retrieval Optimization**: Enhance image searchability through rich textual descriptions. +* **Multimodal Dataset Annotation**: Batch generate precise text descriptions for large-scale image libraries. +* **Image Content Understanding**: Automatically extract key objects, scenes, and text information from images. +* **Search and Retrieval Optimization**: Enhance image searchability through textual descriptions. 
--- @@ -29,9 +29,9 @@ os.environ["DF_API_KEY"] = "your_api_key_here" ``` -### Step 2: Prepare the Environment +### Step 2: Environment Preparation -Create a working directory and initialize: +Create a work directory and initialize: ```bash mkdir run_caption_pipeline @@ -40,20 +40,20 @@ dataflowmm init ``` -### Step 3: Download Sample Data +### Step 3: Download Example Data ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` -### Step 4: Configure Core Parameters +### Step 4: Core Parameter Configuration Configure the API information in the generated `api_pipelines/image_caption.py` script: ```python self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", # Replace with your API endpoint + api_url="http://172.96.141.132:3001/v1", # Replace with your API address key_name_of_api_key="DF_API_KEY", model_name="gpt-5-nano-2025-08-07", max_workers=10, @@ -65,22 +65,22 @@ self.vlm_serving = APIVLMServing_openai( ### Step 5: Run the Pipeline ```bash -python api_pipelines/image_caption.py --images_file data/image_caption/sample_data.json +python api_pipelines/image_caption.py ``` --- -## 3. Data Flow & Logic +## 3. Data Flow and Logic Description ### 1. **Input Data Structure** -The pipeline accepts standard JSON/JSONL formats containing image paths and prompts: +The pipeline receives standard JSON/JSONL formats containing image paths and prompts: ```json [ { - "image": ["./data/image_caption/person.png"], + "image": ["./example_data/image_caption/person.png"], "conversation": [ { "from": "human", @@ -94,20 +94,20 @@ The pipeline accepts standard JSON/JSONL formats containing image paths and prom ### 2. **Core Operator: PromptedVQAGenerator** -In this workflow, we use `PromptedVQAGenerator` as the core operator. It transforms the VLM into a specialized image captioning engine via a system prompt. 
+In this process, we use `PromptedVQAGenerator` as the core operator. It transforms the VLM into a specialized image caption generator via a System Prompt. -* **System Prompt**: "You are an image caption generator. Your task is to generate a concise and informative caption for the given image content." -* **Concurrency Control**: Supports multi-threaded concurrent requests via the `max_workers` parameter, significantly improving processing efficiency for large datasets. -* **Fault Tolerance**: Built-in timeout and retry mechanisms ensure stability of API calls under high load. +* **System Prompt**: "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content." +* **Concurrency Control**: Supports multi-threaded concurrent requests via the `max_workers` parameter, significantly improving processing efficiency for large-scale data. +* **Error Handling**: Built-in timeout and retry mechanisms ensure API call stability under high loads. ### 3. **Output Data Example** -Once processing is complete, the `caption` field is appended directly to the data object: +After processing, the `caption` field is added directly to the data object: ```json [ { - "image": ["./data/image_caption/person.png"], + "image": ["./example_data/image_caption/person.png"], "conversation": [...], "caption": "Promotional poster for Nightmare Alley in grayscale, showing a man in a formal tuxedo with a white bow tie. The cast names run down the left side (Bradley Cooper, Cate Blanchett, Toni Collette, Willem Dafoe, and more), and the gold title Nightmare Alley appears near the bottom left with release text and Regal branding." } @@ -117,79 +117,69 @@ Once processing is complete, the `caption` field is appended directly to the dat --- -## 4. Full Pipeline Code +## 4. Complete Pipeline Code -You can directly use or modify the following Python code to implement your custom image captioning task. 
+You can directly use or modify the following Python code to implement custom image captioning tasks. ```python import os -import argparse + +# Set API Key environment variable +os.environ["DF_API_KEY"] = "sk-xxx" + from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator -# Set API Key environment variable -os.environ["DF_API_KEY"] = "sk-xxx" class ImageCaptionPipeline: """ - Batch image caption generation with a single command. + Complete batch image caption generation with a single command. """ - def __init__( - self, - first_entry_file: str, - cache_path: str = "./cache_local", - file_name_prefix: str = "caption", - cache_type: str = "json", - ): - # ---------- 1. Storage: Manage data reading and checkpoints ---------- + def __init__(self, llm_serving: LLMServingABC = None): + + # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_caption/sample_data.json", + cache_path="./cache_local", + file_name_prefix="caption", + cache_type="json", ) - # ---------- 2. Serving: Configure API Service ---------- + # ---------- 2. Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", - key_name_of_api_key="DF_API_KEY", + api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4 model_name="gpt-5-nano-2025-08-07", + image_io=None, + send_request_stream=False, max_workers=10, timeout=1800 ) - # ---------- 3. Operator: Define Generation Logic ---------- + # ---------- 3. 
Operator ---------- self.vqa_generator = PromptedVQAGenerator( serving=self.vlm_serving, - system_prompt="You are an image caption generator. Your task is to generate a concise and informative caption for the given image content." + system_prompt= "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content." ) + # ------------------------------------------------------------------ # def forward(self): - # Run the pipeline + input_image_key = "image" + output_answer_key = "caption" + self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", - input_image_key="image", - output_answer_key="caption", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) +# ---------------------------- CLI Entry ------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch image caption generation with DataFlow") - parser.add_argument("--images_file", default="data/image_caption/sample_data.json") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="caption") - parser.add_argument("--cache_type", default="json") - - args = parser.parse_args() - - pipe = ImageCaptionPipeline( - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = ImageCaptionPipeline() pipe.forward() -``` +``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md index a033dcf1..4013ee2b 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md +++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline.md @@ -1,12 +1,12 @@ --- -title: Image Region Captioning Pipeline +title: Image Region Caption Pipeline 
createTime: 2026/01/11 22:04:27 icon: mdi:image-text permalink: /en/mm_guide/image_region_caption_pipeline/ --- ## 1. Overview -The **Image Region Captioning Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them. +The **Image Region Caption Pipeline** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them. This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation. @@ -26,37 +26,100 @@ The main process of the pipeline includes: ## 2. Quick Start -### Step 1: Create a Working Directory +### Step 1: Create a New DataFlow Working Directory ```bash -mkdir run_region_caption -cd run_region_caption +mkdir run_dataflow +cd run_dataflow ``` -### Step 2: Prepare the Script +### Step 2: Initialize DataFlow-MM -Save the code in the "Pipeline Example" section below as `region_caption_pipeline.py`. +```bash +dataflowmm init -### Step 3: Configure Parameters +``` -Ensure the input file (jsonl) contains `image` and `bbox` fields. 
+You will then see: ```bash -# Install dependencies -pip install open-dataflow vllm - +gpu_pipelines/image_region_caption_pipeline.py ``` -### Step 4: Run +### Step 3: Download Sample Data ```bash -python region_caption_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --first_entry_file "data/region_captions.jsonl" \ - --output_jsonl_path "data/results.jsonl" +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` + +### Step 4: Configure Parameters +```python + def __init__( + self, + model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir: str = "~/.cache/huggingface", + download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct", + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): +``` +> **⚠️ Important Note on Model Path Configuration (Taking `Qwen2.5-VL-3B-Instruct` as an example):** +> +> * **If you have already downloaded the model files:** Please change `model_path` to your local model path. **Crucially**, ensure that the model folder is named exactly `Qwen2.5-VL-3B-Instruct`; otherwise, the framework will fail to recognize it. +> * **If you haven't downloaded the model yet:** You must specify a `download_dir` parameter that ends with `Qwen2.5-VL-3B-Instruct` (as shown in the default parameters). Failure to do so will also result in the model not being recognized after downloading. 
+ + +### Step 5: Run +```bash +cd gpu_pipelines +python image_region_caption_pipeline.py ``` +> **🛠️ Troubleshooting** +> +> **Issue 1:** If you encounter a CUDA library conflict error similar to the following: +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> +> **Solution:** This is usually caused by conflicting environment variables. Run the script with an empty `LD_LIBRARY_PATH`: +> ```bash +> LD_LIBRARY_PATH="" python image_region_caption_pipeline.py +> ``` +> +> **Issue 2:** If you are using **Qwen series models** and encounter the following error: +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> +> **Solution:** Open the `config.json` file located in your model folder, find the `rope_scaling` section, and change the key `"type"` to `"rope_type"`. +> +> **Before modification:** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` +> +> **After modification:** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` --- @@ -64,16 +127,16 @@ python region_caption_pipeline.py \ ### 1. **Input Data** -The input data typically contains the image path and a list of corresponding bounding boxes: +The input data typically contains the image path and a list of corresponding bounding boxes (optional): * **image**: Path to the image file. -* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` or `[[x1, y1, x2, y2], ...]` format (depending on configuration). +* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` format. 
**Input Data Example**: ```json { - "image": "./images/kitchen.jpg", + "image": "../example_data/image_region_caption/20.jpg", "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]] } @@ -89,46 +152,54 @@ This operator handles the vision-level tasks. * **Input**: Raw image + `bbox` data. * **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration. -* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls parameters like `max_boxes` and visualization options (`draw_visualization`). -* **Output**: Generates a new image path containing visual markers (`image_with_bbox`). +* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls the maximum number of bounding boxes and the input/output paths. +* **Output**: JSON output path for the new image with visual markers. #### B. **PromptedVQAGenerator** This operator is responsible for generating text using the VLM. -* **Input**: The `image_with_bbox` generated in the previous step. +* **Input**: The result generated in the previous step. * **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts. * **Output**: Region description text. ### 3. **Output Data** -The final output data will contain the processed image path and the generated descriptions: +The final generated output data includes the processed image path and the generated descriptions: -* **image_with_bbox**: Path to the image with drawn boxes. -* **mdvp_record**: List of generated region descriptions. +* **image**: The input image path. +* **type**: Indicates whether a bounding box is provided. +* **bbox**: Bounding box parameters. +* **normalized_bbox**: Normalized bounding box parameters. +* **result_file**: The output path for the results. +* **image_with_bbox**: Path to the image with drawn bounding boxes. +* **valid_bboxes_num**: The number of valid bounding boxes. +* **prompt**: The prompt received by the VLM. 
+* **answer**: The region description text generated by the VLM. **Output Data Example**: ```json { - "image": "./images/kitchen.jpg", - "image_with_bbox": "./images/kitchen_visualized.jpg", - "mdvp_record": [ - "A wooden chair located near the table.", - "A white refrigerator in the background." - ] + "image":"..\/example_data\/image_region_caption\/20.png", + "type":"with_bbox", + "bbox":[[196,104,310,495]], + "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]], + "result_file":"..\/cache\/image_region_caption", + "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num":1, + "prompt":"Describe the content of each marked region in the image. There are 1 regions: to .", + "answer":"In , the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere." } - ``` --- ## 4. Pipeline Example -Below is the complete `ImageRegionCaptioningPipeline` code implementation. +Below is the complete `ImageRegionCaptionPipeline` code implementation. 
```python -import argparse from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm from dataflow.operators.core_vision.generate.image_bbox_generator import ( ImageBboxGenerator, @@ -140,57 +211,40 @@ from dataflow.operators.core_vision.generate.prompted_vqa_generator import ( from dataflow.utils.storage import FileStorage -class ImageRegionCaptioningPipeline: +class ImageRegionCaptionPipeline: def __init__( self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt/models", - device: str = "cuda", - # Storage & Paths - first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl", - cache_path: str = "./dataflow/example/cache", + model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir: str = "~/.cache/huggingface", + download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct", + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", file_name_prefix: str = "region_caption", cache_type: str = "jsonl", - # Keys input_image_key: str = "image", input_bbox_key: str = "bbox", - image_with_bbox_path: str = 'image_with_bbox', # Key for intermediate image - output_key: str = "mdvp_record", - # BBox Config max_boxes: int = 10, - input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl", - output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl", - output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl", - draw_visualization: bool = True + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", ): - # 1. 鍒濆鍖栧瓨鍌 (Storage) - # 鐢ㄤ簬 BBox 鐢熸垚闃舵鐨勫瓨鍌 self.bbox_storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, file_name_prefix=file_name_prefix, cache_type=cache_type ) - - # 2. 
閰嶇疆 BBox 鐢熸垚鍣 + self.cfg = ExistingBBoxDataGenConfig( max_boxes=max_boxes, - input_jsonl_path=input_jsonl_path, + input_jsonl_path=first_entry_file, output_jsonl_path=output_image_with_bbox_path, ) - # 3. 鍒濆鍖 Caption 闃舵鐨勫瓨鍌 - # 娉ㄦ剰锛氳繖閲屾帴缁簡涓婁竴姝ョ殑杈撳嚭璺緞 self.caption_storage = FileStorage( first_entry_file_name=output_image_with_bbox_path, cache_path=cache_path, file_name_prefix=file_name_prefix, cache_type=cache_type ) - - # 4. 鍒濆鍖 VLM 鏈嶅姟 self.serving = LocalModelVLMServing_vllm( hf_model_name_or_path=model_path, hf_cache_dir=hf_cache_dir, @@ -200,76 +254,28 @@ class ImageRegionCaptioningPipeline: vllm_top_p=0.9, vllm_max_tokens=512, ) - - # 5. 鍒濆鍖栨牳蹇冪畻瀛 self.bbox_generator = ImageBboxGenerator(config=self.cfg) - self.caption_generator = PromptedVQAGenerator(serving=self.serving) - + self.caption_generator = PromptedVQAGenerator(serving=self.serving,) self.input_image_key = input_image_key self.input_bbox_key = input_bbox_key - self.output_key = output_key - self.image_with_bbox_path = image_with_bbox_path + self.bbox_record=None def forward(self): - # 姝ラ 1: 鐢熸垚甯 BBox 鍙鍖栫殑鍥惧儚 - print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...") self.bbox_generator.run( storage=self.bbox_storage.step(), input_image_key=self.input_image_key, input_bbox_key=self.input_bbox_key, - output_key=self.image_with_bbox_path, ) - # 姝ラ 2: 鍩轰簬鍙鍖栧浘鍍忕敓鎴愭弿杩 - print(">>> [Pipeline] Step 2: Generating Region Captions...") self.caption_generator.run( storage=self.caption_storage.step(), - input_image_key='image_with_bbox' # 浣跨敤涓婁竴姝ョ敓鎴愮殑甯︽鍥惧儚 + input_image_key='image_with_bbox', + input_prompt_key='prompt' ) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Image region captioning with DataFlow") - - parser.add_argument("--model_path", default="/data0/happykeyan/Models/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt/models") - 
parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - - parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl") - parser.add_argument("--cache_path", default="./dataflow/example/cache") - parser.add_argument("--file_name_prefix", default="region_caption") - parser.add_argument("--cache_type", default="jsonl") - - parser.add_argument("--input_image_key", default="image") - parser.add_argument("--input_bbox_key", default="bbox") - parser.add_argument("--output_key", default="mdvp_record") - - parser.add_argument("--max_boxes", type=int, default=10) - parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl") - parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl") - parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl") - parser.add_argument("--draw_visualization", type=bool, default=True) - - args = parser.parse_args() - - pipe = ImageRegionCaptioningPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.first_entry_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - input_image_key=args.input_image_key, - input_bbox_key=args.input_bbox_key, - output_key=args.output_key, - max_boxes=args.max_boxes, - input_jsonl_path=args.input_jsonl_path, - output_image_with_bbox_path=args.output_image_with_bbox_path, - draw_visualization=args.draw_visualization - ) + pipe = ImageRegionCaptionPipeline() pipe.forward() ``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md new file mode 100644 index 
00000000..b146935c --- /dev/null +++ b/docs/en/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md @@ -0,0 +1,258 @@ +--- +title: Image Region Caption Pipeline (API version) +createTime: 2026/01/11 22:04:27 +icon: mdi:image-text +permalink: /en/mm_guide/image_region_caption_pipeline_api/ +--- +## 1. Overview + +The **Image Region Caption Pipeline (API version)** is designed to generate detailed text descriptions for specific regions within an image. Combining the localization capabilities of Computer Vision with the understanding of Multimodal Large Models (VLMs), this pipeline identifies Regions of Interest (ROI) and generates precise natural language annotations for them. + +This pipeline supports processing **pre-defined Bounding Box** data, visualizing these boxes, and then feeding them into a VLM for caption generation. + +We support the following application scenarios: + +* **Dense Captioning**: Generating descriptions for multiple objects within a single image. +* **Fine-grained Image Understanding**: Focusing on local details rather than global descriptions. +* **Dataset Augmentation**: Constructing image-text pair datasets that include localization information. + +The main process of the pipeline includes: + +1. **Data Loading**: Reading source data containing image paths and bounding box information. +2. **BBox Processing & Visualization**: Processing input bounding boxes and generating a version of the image with visual markers (e.g., drawn boxes). +3. **Region Caption Generation**: Using a VLM to generate text descriptions based on the marked images or specific regions. + +--- + +## 2. 
Quick Start + +### Step 1: Create a New DataFlow Working Directory + +```bash +mkdir run_dataflow +cd run_dataflow + +``` + +### Step 2: Initialize DataFlow-MM + +```bash +dataflowmm init + +``` + +You will then see: + +```bash +api_pipelines/image_region_caption_api_pipeline.py +``` + +### Step 3: Download Sample Data + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data + +``` + +### Step 4: Configure API Key + +Set your API Key environment variable in `api_pipelines/image_region_caption_api_pipeline.py`: + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" + +``` + +### Step 5: Configure Parameters + +Configure the API service and input data paths in `api_pipelines/image_region_caption_api_pipeline.py`: + +```python + def __init__( + self, + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): +``` + +```python +self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any OpenAI-compatible API platform + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 +) + +``` +### Step 6: Run with One Command + +```bash +cd api_pipelines +python image_region_caption_api_pipeline.py + +``` +--- + +## 3. Data Flow & Logic + +### 1. **Input Data** + +The input data typically contains the image path and a list of corresponding bounding boxes (optional): + +* **image**: Path to the image file. +* **bbox**: List of bounding box coordinates, typically in `[[x, y, w, h], ...]` format. 
+ +**Input Data Example**: + +```json +{ + "image": "../example_data/image_region_caption/20.jpg", + "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]] +} + +``` + +### 2. **Core Operator Logic** + +This pipeline chains two core operators to complete the task: + +#### A. **ImageBboxGenerator** + +This operator handles the vision-level tasks. + +* **Input**: Raw image + `bbox` data. +* **Functionality**: Reads bounding boxes and draws them onto the image (visualization) or preprocesses them according to configuration. +* **Configuration (`ExistingBBoxDataGenConfig`)**: Controls the maximum number of bounding boxes and the input/output paths. +* **Output**: JSON output path for the new image with visual markers. + +#### B. **PromptedVQAGenerator** + +This operator is responsible for generating text using the VLM. + +* **Input**: The result generated in the previous step. +* **Functionality**: The VLM receives the marked image and generates descriptions for the corresponding regions based on prompts. +* **Output**: Region description text. + +### 3. **Output Data** + +The final generated output data includes the processed image path and the generated descriptions: + +* **image**: The input image path. +* **type**: Indicates whether a bounding box is provided. +* **bbox**: Bounding box parameters. +* **normalized_bbox**: Normalized bounding box parameters. +* **result_file**: The output path for the results. +* **image_with_bbox**: Path to the image with drawn bounding boxes. +* **valid_bboxes_num**: The number of valid bounding boxes. +* **prompt**: The prompt received by the VLM. +* **answer**: The region description text generated by the VLM. 
+ +**Output Data Example**: + +```json +{ + "image":"..\/example_data\/image_region_caption\/20.png", + "type":"with_bbox", + "bbox":[[196,104,310,495]], + "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]], + "result_file":"..\/cache\/image_region_caption", + "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num":1, + "prompt":"Describe the content of each marked region in the image. There are 1 regions: to .", + "answer":"In , the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere." +} +``` + +--- + +## 4. Pipeline Example + +Below is the complete `ImageRegionCaptionPipeline` code implementation (the API version, as generated in `api_pipelines/image_region_caption_api_pipeline.py`). 
+ +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + +from dataflow.operators.core_vision.generate.image_bbox_generator import ( + ImageBboxGenerator, + ExistingBBoxDataGenConfig +) +from dataflow.operators.core_vision.generate.prompted_vqa_generator import ( + PromptedVQAGenerator +) +from dataflow.utils.storage import FileStorage + +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +class ImageRegionCaptionPipeline: + def __init__( + self, + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): + self.bbox_storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type + ) + + self.cfg = ExistingBBoxDataGenConfig( + max_boxes=max_boxes, + input_jsonl_path=first_entry_file, + output_jsonl_path=output_image_with_bbox_path, + ) + + self.caption_storage = FileStorage( + first_entry_file_name=output_image_with_bbox_path, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type + ) + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + self.bbox_generator = ImageBboxGenerator(config=self.cfg) + self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.") + self.input_image_key = input_image_key + self.input_bbox_key = input_bbox_key + self.bbox_record=None + + def forward(self): + 
self.bbox_generator.run( + storage=self.bbox_storage.step(), + input_image_key=self.input_image_key, + input_bbox_key=self.input_bbox_key + ) + + self.caption_generator.run( + storage=self.caption_storage.step(), + input_image_key='image_with_bbox', + input_prompt_key='prompt' + ) + + +if __name__ == "__main__": + pipe = ImageRegionCaptionPipeline() + pipe.forward() + +``` diff --git a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md index 062ba560..a7f230f0 100644 --- a/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md +++ b/docs/en/notes/mm_guide/image_understanding/image_vqa_api.md @@ -7,13 +7,13 @@ permalink: /en/mm_guide/image_vqa_api_pipeline/ ## 1. Overview -The **Image VQA Generation Pipeline (API Version)** focuses on automatically constructing high-quality **Question-Answer Pairs** directly from image content. By leveraging high-performance VLM APIs, the pipeline generates questions and accurate answers that align with human logic based on visual features. This is highly valuable for training multimodal dialogue models, evaluating visual understanding capabilities, and building domain-specific VQA datasets (e.g., medical, security, e-commerce). +**Image VQA Generation Pipeline (API Version)** focuses on automatically constructing high-quality **Question-Answer (QA) Pairs** directly from image content. Leveraging high-performance VLM APIs, this pipeline generates human-like questions and accurate answers based on the visual features of an image. This is highly valuable for training multimodal dialogue models, evaluating visual understanding capabilities, and building industry-specific VQA datasets (e.g., medical, security, e-commerce). We support the following application scenarios: -* **Instruction Tuning Data Synthesis**: Generate diverse questioning styles to enhance model interaction capabilities. 
-* **Visual Understanding Evaluation**: Create judgment, descriptive, or reasoning-based Q&A focused on image details. -* **Automated Annotation**: Replace manual labor for large-scale image Q&A labeling, reducing data production costs. +* **Instruction Fine-tuning Data Synthesis**: Generate diverse questioning styles to enhance model interaction capabilities. +* **Visual Understanding Evaluation**: Produce judgment, descriptive, or reasoning-based QAs targeting specific image details. +* **Automated Annotation**: Replace manual labor for large-scale image QA annotation, reducing data production costs. --- @@ -21,7 +21,7 @@ We support the following application scenarios: ### Step 1: Configure API Key -Ensure your environment variables are set with API access permissions: +Ensure your environment variables include the API access rights: ```python import os @@ -32,7 +32,7 @@ os.environ["DF_API_KEY"] = "sk-your-key-here" ### Step 2: Initialize Environment ```bash -# Create and enter the working directory +# Create and enter the workspace mkdir run_vqa_dataflow cd run_vqa_dataflow @@ -41,16 +41,16 @@ dataflowmm init ``` -### Step 3: Download Sample Data +### Step 3: Download Example Data ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` -### Step 4: Configure the Script +### Step 4: Configure Running Script -In the generated `api_pipelines/image_vqa.py`, you can customize the VLM model name and API information: +In `api_pipelines/image_vqa.py`, you can customize the VLM model name and API information: ```python self.vlm_serving = APIVLMServing_openai( @@ -65,22 +65,22 @@ self.vlm_serving = APIVLMServing_openai( ### Step 5: Execute the Pipeline ```bash -python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json +python api_pipelines/image_vqa.py ``` --- -## 3. Data Flow & Logic +## 3. 
Data Flow and Logic Description ### 1. **Input Data Format** -The input file must contain the image path and a prompt to trigger VQA generation: +The input file must contain the image path and a prompt to guide the VQA generation: ```json [ { - "image": ["./data/image_vqa/person.png"], + "image": ["./example_data/image_vqa/person.png"], "conversation": [ { "from": "human", @@ -94,20 +94,20 @@ The input file must contain the image path and a prompt to trigger VQA generatio ### 2. **Core Operator: PromptedVQAGenerator** -This operator is the core engine for generating Q&A pairs: +This operator serves as the engine for generating QA pairs: -* **Role Definition**: Through the `system_prompt` set as "image question-answer generator", the model is guided to output standard Q&A formats. -* **Multi-turn Support**: Capable of combining historical context or specific instructions in the `conversation` field to optimize the focus of generated questions. -* **High-Throughput Processing**: Utilizes `max_workers` for parallel calls, suitable for processing image datasets at scales of entries. +* **Role Definition**: Through the `system_prompt`, the model is set as an "image question-answer generator," guiding it to output standard QA formats. +* **Multi-turn Support**: It can combine historical context or specific instructions in the `conversation` field to refine the focus of question generation. +* **High Throughput Processing**: Utilizes `max_workers` to implement parallel calls, suitable for processing data at a scale of tens of thousands of images or more. -### 3. **Output Example** +### 3. 
**Output Result Example** -Generated VQA results are stored as text in the `vqa` field, typically containing multiple Q&A sets: +The generated VQA results are stored as text in the `vqa` field, typically containing multiple QA pairs: ```json [ { - "image": ["./data/image_vqa/person.png"], + "image": ["./example_data/image_vqa/person.png"], "vqa": "- Q: What is the title of the movie shown on the poster?\n A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n A: Gold" } ] @@ -116,67 +116,67 @@ Generated VQA results are stored as text in the `vqa` field, typically containin --- -## 4. Full Pipeline Code +## 4. Complete Pipeline Code ```python import os -import argparse + +# Set API Key environment variable +os.environ["DF_API_KEY"] = "sk-xxx" + from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator -# Configure API Environment -os.environ["DF_API_KEY"] = "sk-xxx" class ImageVQAPipeline: """ - One-click batch image VQA generation pipeline + Generate batch VQA for images with a single command. """ - def __init__( - self, - first_entry_file: str, - cache_path: str = "./cache_local_vqa", - file_name_prefix: str = "vqa_task", - cache_type: str = "json", - ): - # 1. Initialize Storage: Supports checkpoints and multi-format export + def __init__(self, llm_serving: LLMServingABC = None): + + # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_vqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="qa", + cache_type="json", ) - # 2. Configure VLM API Service + # ---------- 2. 
Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", - key_name_of_api_key="DF_API_KEY", + api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key in environment variable or line 4 model_name="gpt-5-nano-2025-08-07", - max_workers=10 + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 ) - # 3. Initialize VQA Operator + # ---------- 3. Operator ---------- self.vqa_generator = PromptedVQAGenerator( serving=self.vlm_serving, - system_prompt="You are an image question-answer generator. Your task is to generate a question-answer pair for the given image content." + system_prompt="You are an image question-answer generator. Your task is to generate a question-answer pair for the given image content." ) + # ------------------------------------------------------------------ # def forward(self): - # Execute inference task + input_image_key = "image" + output_answer_key = "vqa" + self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", - input_image_key="image", - output_answer_key="vqa", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) +# ---------------------------- CLI Entry ------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch VQA generation") - parser.add_argument("--images_file", default="data/image_vqa/sample_data.json") - args = parser.parse_args() - - pipe = ImageVQAPipeline(first_entry_file=args.images_file) + pipe = ImageVQAPipeline() pipe.forward() ``` \ No newline at end of file diff --git a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md deleted file mode 100644 index bbe78d17..00000000 --- a/docs/en/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md +++ /dev/null @@ 
-1,288 +0,0 @@ ---- -title: Multi-Role Video QA Pipeline -createTime: 2026/01/11 22:15:28 -icon: mdi:image-text -permalink: /en/mm_guide/multirole_videoqa_pipeline/ ---- -## 1. Overview - -The **Multi-Role Video QA Pipeline** leverages Multimodal Large Models (VLMs) and a Multi-Agent collaboration mechanism to automatically generate high-quality, deep Question-Answer (QA) pairs from long videos or advertising footage. - -Unlike standard single-pass generation, this pipeline introduces a **Multi-Agent Iterative Refinement** phase. It first generates initial QAs, then refines them through multiple rounds of interaction simulating different agent roles (e.g., Questioner, Checker, Polisher), finally outputting logical and accurate QA data. - -We support the following application scenarios: - -* **Ad Video Understanding**: Extracting key selling points, emotional tone, and narrative logic from ads. -* **Complex Video Reasoning**: Constructing deep QA datasets requiring reasoning across different time segments. -* **Long Video Summarization & QA**: Handling video data containing rich Metadata (`Meta`) and multiple Clips (`Clips`). - -The main process of the pipeline includes: - -1. **Initial Generation**: Generates baseline QA pairs based on video metadata and clips. -2. **Multi-Agent Refinement**: Critiques, corrects, and optimizes QA pairs through multiple iterations (default 3 rounds). -3. **Final Generation**: Cleans the data and outputs the final QA set in a standard format. - ---- - -## 2. Quick Start - -### Step 1: Create a Working Directory - -```bash -mkdir run_video_qa -cd run_video_qa - -``` - -### Step 2: Prepare the Script - -Save the code in the "Pipeline Example" section below as `multirole_videoqa_pipeline.py`. - -### Step 3: Configure Parameters - -Ensure the input data contains `Meta` and `Clips` fields. 
- -```bash -# Install dependencies -pip install open-dataflow vllm - -``` - -### Step 4: Run - -```bash -python multirole_videoqa_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \ - --images_file "data/adsQA.jsonl" \ - --card_id "0" - -``` - ---- - -## 3. Data Flow & Logic - -### 1. **Input Data** - -Input data is typically pre-processed video data containing global metadata and segment information: - -* **Meta**: Global description, title, or background info of the video. -* **Clips**: List of video clips, where each clip contains audio text, frame image paths, and clip descriptions. - -**Input Data Example**: - -```json -{ - "Meta": "A commercial for a new sports car featuring dynamic driving scenes.", - "Clips": [ - { - "Audio_Text": "Experience the speed.", - "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"], - "Description": "Car accelerating on a highway." - }, - { - "Audio_Text": "Safety meets luxury.", - "Frames_Images": ["./frames/003.jpg"], - "Description": "Interior shot showing leather seats." - } - ] -} - -``` - -### 2. **Core Operator Logic** - -This pipeline executes through a chain of three specialized operators: - -#### A. **MultiroleVideoQAInitialGenerator** - -* **Function**: Acts as the "Draft Author", reading `Meta` and `Clips` to generate the first version of QA pairs using the VLM. -* **Output**: A DataFrame containing preliminary QAs. - -#### B. **MultiroleVideoQAMultiAgentGenerator** - -* **Function**: Acts as the "Editorial Team", polishing the draft. -* **Mechanism**: Configured with `max_iterations` (e.g., 3 rounds). During these rounds, the model may simulate different roles (e.g., a reviewer pointing out errors, a polisher improving wording) to progressively enhance QA quality. -* **Input**: Initial DataFrame. -* **Output**: Intermediate DataFrame after multiple rounds of correction. - -#### C. 
**MultiroleVideoQAFinalGenerator** - -* **Function**: Acts as the "Publisher", responsible for final formatting and cleaning. -* **Output**: Standardized `QA` list. - -### 3. **Output Data** - -The output data adds a high-quality QA list to the original fields: - -* **QA**: List of generated QA pairs, including labels (e.g., question type), question text, and answer text. - -**Output Data Example**: - -```json -{ - "Meta": "...", - "Clips": [...], - "QA": [ - { - "Label": "Feature Extraction", - "Question": "What specific features of the car are highlighted in the interior shots?", - "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface." - }, - { - "Label": "Narrative Analysis", - "Question": "How does the audio complement the visual transition?", - "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual." - } - ] -} - -``` - ---- - -## 4. Pipeline Example - -Below is the complete `MultiRoleVideoQAPipeline` code implementation. 
- -```python -import argparse -import os -from dataflow.serving import LocalModelVLMServing_vllm -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ( - MultiroleVideoQAInitialGenerator, - MultiroleVideoQAMultiAgentGenerator, - MultiroleVideoQAFinalGenerator -) - -try: - import torch - # 澶氳繘绋嬪惎鍔ㄦ柟寮忚缃负 spawn锛岄伩鍏 CUDA 鍒濆鍖栧啿绐 - if 'spawn' not in torch.multiprocessing.get_all_start_methods(): - torch.multiprocessing.set_start_method('spawn', force=True) -except ImportError: - pass - - -class MultiRoleVideoQAPipeline(): - def __init__( - self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt", - first_entry_file: str = "/dataflow/example/ads_QA/adsQA.jsonl", - cache_path: str = "./cache_local", - file_name_prefix: str = "dataflow_cache_step", - cache_type: str = "jsonl", - # Keys Configuration - Meta_key: str = "Meta", - clips_key: str = "Clips", - output_key: str = "QA" - ): - # 1. 瀛樺偍鍒濆鍖 - self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, - ) - - # 寮哄埗璁剧疆 vLLM 鐨勫杩涚▼鏂规硶 - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn" - - # 2. VLM 鏈嶅姟鍒濆鍖 - self.llm_serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=6000, # 瑙嗛闂瓟閫氬父闇瑕佽緝闀跨殑 Context - ) - - # 3. 
绠楀瓙閾惧垵濮嬪寲 - # 闃舵涓锛氬垵濮嬬敓鎴 - self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving) - - # 闃舵浜岋細澶氭櫤鑳戒綋杩唬浼樺寲 (鏍稿績宸紓鐐) - self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator( - llm_serving = self.llm_serving, - max_iterations = 3 - ) - - # 闃舵涓夛細鏈缁堟牸寮忓寲 - self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving) - - self.input_meta_key = Meta_key - self.input_clips_key = clips_key - self.output_key = output_key - - def forward(self): - print(">>> [Pipeline] Step 1: Initial QA Generation...") - init_df = self.initial_QA_generation.run( - storage = self.storage.step(), - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - - print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...") - # 娉ㄦ剰锛氭绠楀瓙鎺ユ敹涓婁竴闃舵鐨 DataFrame (init_df) 浣滀负杈撳叆 - middle_df = self.multiAgent_QA_generation.run( - df = init_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - - print(">>> [Pipeline] Step 3: Finalizing QA Pairs...") - self.final_QA_generation.run( - storage = self.storage, - df = middle_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - print(">>> [Pipeline] Done.") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)") - - parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct", - help="Path to the local model or HuggingFace repo ID.") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface", - help="HuggingFace cache directory.") - parser.add_argument("--download_dir", default="./ckpt", - help="Local directory for downloading models.") - - parser.add_argument("--card_id", type=str, default="0", - help="The single CUDA device ID to use (e.g., '0' or '1').") - - 
parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl", - help="Path to the first entry file for DataFlow.") - parser.add_argument("--cache_path", default="./cache_local", - help="Directory for caching DataFlow steps.") - parser.add_argument("--file_name_prefix", default="caption", - help="Prefix for cache file names.") - parser.add_argument("--cache_type", default="jsonl", - help="Type of cache file (e.g., jsonl).") - - args = parser.parse_args() - - os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '') - - pipe = MultiRoleVideoQAPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) - pipe.forward() - -``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md index e930c005..b5f579be 100644 --- a/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md +++ b/docs/en/notes/mm_operators/image_understanding/generate/image_bbox_generator.md @@ -49,8 +49,7 @@ def run( self, storage: DataFlowStorage, input_image_key: str = "image", - input_bbox_key: str = "bbox", - output_key: str = "mdvp_record" + input_bbox_key: str = "bbox" ): ... @@ -90,7 +89,6 @@ Reads raw data from `config.input_jsonl_path`. | `storage` | `DataFlowStorage` | N/A | Storage object, mainly used to provide the `cache_path`. | | `input_image_key` | `str` | `"image"` | Field name for image paths in the input JSONL. | | `input_bbox_key` | `str` | `"bbox"` | Field name for BBox data in the input JSONL. | -| `output_key` | `str` | `"mdvp_record"` | (Reserved) Key name for the output record. | ## 馃З Example Usage @@ -98,50 +96,41 @@ Reads raw data from `config.input_jsonl_path`. 
from dataflow.utils.storage import FileStorage from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig -# 1) Configure Parameters -config = ExistingBBoxDataGenConfig( - max_boxes=5, - input_jsonl_path="./data/raw_images.jsonl", - output_jsonl_path="./data/processed_with_prompts.jsonl" +config = ExistingBBoxDataGenConfig( + max_boxes=10, + input_jsonl_path="../example_data/image_region_caption/image_region_caption_demo.jsonl", + output_jsonl_path="../cache/image_region_caption/image_with_bbox_result.jsonl", ) - -# 2) Initialize Operator -# Note: This operator is for data prep and does not require a Serving instance generator = ImageBboxGenerator(config=config) -# 3) Prepare Storage (Only for providing cache path) storage = FileStorage( - cache_path="./cache_vis_images", - file_name_prefix="bbox_gen" + first_entry_file_name="../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path="../cache/image_region_caption", + file_name_prefix="region_caption", + cache_type="jsonl" ) -# 4) Execute Processing -# Automatically reads from config input, writes to config output generator.run( storage=storage, - input_image_key="image_path", - input_bbox_key="ground_truth_bbox" # Will auto-extract if this column is missing + input_image_key="image", + input_bbox_key="bbox" ) - ``` ### 馃Ь Output Data Format (Output JSONL) -Each line in the `output_jsonl_path` file contains: +Each line in the `image_with_bbox_result.jsonl` file contains: ```json { - "image": "/data/raw/cat.jpg", - "type": "without_bbox", // or "with_bbox" - "bbox": [[100, 200, 50, 60], ...], // Raw pixel coords [x, y, w, h] - "normalized_bbox": [ - [0.1, 0.2, 0.15, 0.26], - [0.0, 0.0, 0.0, 0.0] // Zero-padded - ], - "result_file": "./cache_vis_images", - "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // Path to visualized image - "valid_bboxes_num": 1, - "prompt": "Describe the content of each marked region in the image. There are 1 regions: \ to \." 
+ "image": "../example_data/image_region_caption/20.png", + "type": "with_bbox", + "bbox": [[196, 104, 310, 495]], + "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], + "result_file": "../cache/image_region_caption", + "image_with_bbox": "../cache/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num": 1, + "prompt": "Describe the content of each marked region in the image. There are 1 regions: to ." } ``` diff --git a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md deleted file mode 100644 index 2f45e04c..00000000 --- a/docs/en/notes/mm_operators/image_understanding/generate/multirole_videoqa.md +++ /dev/null @@ -1,140 +0,0 @@ ---- -title: MultiRole Video QA Generation -createTime: 2025/12/2 20:00:00 -icon: material-symbols-light:video -permalink: /en/mm_operators/generate/multirole_videoqa/ ---- - -## 馃摌 Overview - -`MultiroleVideoQAGenerate` is a data generation operator for **automatically creating Question-Answer (QA) pairs based on the preprocessed video data**. -Given input preprocessed video data, it constructs several QA pairs relative to the video. This is suitable for Advertisement video annotation, dataset construction, and video understanding tasks. - -**Features:** -* Supports batch processing of multiple preprocessed video data. -* Generates high-quality QA pairs using VLMs like Qwen2.5-VL. -* Automatically handles video input and using prompt to generate data. - ---- - -## 馃彈锔 `__init__` Function - -```python -def __init__( - self, - llm_serving: VLMServingABC -): - ... 
-``` -## 馃Ь `__init__` Parameters - -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | -| `llm_serving` | `VLMServingABC` | - | **Model Serving Object** used to call the VLM for QA pairs generation | - ------ - -## 鈿 `run` Function - -```python -def run( - self, - storage: DataFlowStorage, - input_meta_key: str = "Meta", - input_clips_key: str = "Clips", - output_key: str = "QA" -): - ... -``` - -The `run` function executes the main QA pairs generation workflow: -read data paths 鈫 **validate DataFrame** 鈫 construct prompts 鈫 call the model 鈫 generate QA pairs captions 鈫 write results to output. - -## 馃Ь `run` Parameters - -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :---------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `input_mets_key` | `str` | `"Meta"` | **Multimodal Input Field Name** | -| `input_clips_key` | `str` | `"Clips"` | **Multimodal Input Field Name** | -| `output_key` | `str` | `"QA"` | **Model Output Field Name** (the generated QA pairs) | - ------ - -## 馃 Example Usage - -```python -import os -import argparse -from dataflow.serving import LocalModelVLMServing_vllm -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator - -# Step 1: Launch local model service -llm_serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=6000, - ) - -# Step 2: Prepare input data -storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, - ) - -# 
Step 3: Initialize and run the operator -initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving) -multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving = self.llm_serving, max_iterations = 3) -final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving) - -init_df = initial_QA_generation.run( - storage = self.storage.step(), - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -middle_df = multiAgent_QA_generation.run( - df = init_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -final_QA_generation.run( - storage = self.storage, - df = middle_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -``` - ------ - -## 馃Ь Default Output Format - -| Field | Type | Description | -| :-------- | :----------- | :------------------------------- | -| `Meta` | `str` | Meta information for video | -| `Clips` | `List[Dict]` | Interleaved modality video Clips | -| `QA` | `List[Dict]` | QA pairs | - ------ - -### 馃摜 Example Input - -```jsonl -{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]} -``` - -### 馃摛 Example Output - -```jsonl -{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]} -``` \ No newline at end of file diff --git 
a/docs/zh/notes/mm_guide/image_understanding/context_vqa.md b/docs/zh/notes/mm_guide/image_understanding/context_vqa.md index d9775d1d..cd9ac1da 100644 --- a/docs/zh/notes/mm_guide/image_understanding/context_vqa.md +++ b/docs/zh/notes/mm_guide/image_understanding/context_vqa.md @@ -1,8 +1,9 @@ --- -title: ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎 -icon: mdi:image-text -createTime: 2026/01/24 16:37:37 +title: ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎 +icon: mdi:image-text +createTime: 2026/01/24 16:37:37 permalink: /zh/mm_guide/contextvqa_pipeline/ + --- ## 1. 姒傝堪 @@ -17,7 +18,7 @@ permalink: /zh/mm_guide/contextvqa_pipeline/ 娴佹按绾跨殑涓昏娴佺▼鍖呮嫭锛 1. **鏁版嵁鍔犺浇**锛氳鍙栧寘鍚浘鍍忚矾寰勭殑鏁版嵁鏂囦欢銆 -2. **涓婁笅鏂囦笌闂瓟鐢熸垚**锛氬埄鐢 VLM 鍩轰簬鍥惧儚鐢熸垚 Wikipedia 椋庢牸鏂囩珷鍙婂師濮嬮棶绛斿銆 +2. **涓婁笅鏂囦笌闂瓟鐢熸垚**锛氬埄鐢ㄦ湰鍦伴儴缃茬殑 VLM 鍩轰簬鍥惧儚鐢熸垚 Wikipedia 椋庢牸鏂囩珷鍙婂師濮嬮棶绛斿銆 3. **鏁版嵁娓呮礂涓庣粨鏋勫寲**锛氳В鏋愬師濮嬫枃鏈紝鎻愬彇缁撴瀯鍖栫殑 `{context, qas}` 鏍煎紡銆 --- @@ -25,40 +26,63 @@ permalink: /zh/mm_guide/contextvqa_pipeline/ ## 2. 蹇熷紑濮 ### 绗竴姝ワ細鍒涘缓鏂扮殑 DataFlow 宸ヤ綔鏂囦欢澶 + ```bash mkdir run_dataflow_mm cd run_dataflow_mm + ``` ### 绗簩姝ワ細鍒濆鍖 DataFlow-MM + ```bash dataflow init + ``` + 杩欐椂浣犱細鐪嬪埌锛 + ```bash gpu_pipelines/context_vqa.py + +``` + +### 绗笁姝ワ細涓嬭浇绀轰緥鏁版嵁 + +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data + ``` -### 绗笁姝ワ細閰嶇疆妯″瀷璺緞 +### 绗洓姝ワ細閰嶇疆妯″瀷涓庢暟鎹矾寰 -鍦 `context_vqa.py` 涓厤缃 VLM 妯″瀷璺緞鍜岀ず渚嬫暟鎹 +鍦 `context_vqa.py` 涓洿鎺ヤ慨鏀圭被鍒濆鍖栧弬鏁帮紙涓嶅啀閫氳繃鍛戒护琛屽弬鏁颁紶閫掞級锛 ```python -parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") # 淇敼涓轰綘鐨勬ā鍨嬭矾寰 -parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") -parser.add_argument("--download_dir", default="./ckpt") -parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - -parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.json") # 
淇敼涓轰綘鐨勬暟鎹湴鍧锛屾垜浠彁渚涚ず渚嬫暟鎹湪鍦╮un_dataflow_mm/example_data/image_to_text_pipeline/capsbench_captions.json锛屽叿浣撻噷闈㈢殑鍥剧墖鍙互浠巎son涓"source"鏉ユ簮涓嬭浇 -parser.add_argument("--cache_path", default="./cache_local") -parser.add_argument("--file_name_prefix", default="context_vqa") -parser.add_argument("--cache_type", default="json") +# 妯″瀷鏈嶅姟閰嶇疆 +self.serving = LocalModelVLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + hf_local_dir="./ckpt", + vllm_tensor_parallel_size=1, + vllm_max_tokens=512, +) + +# 鏁版嵁瀛樺偍閰嶇疆 +self.storage = FileStorage( + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", +) + ``` -### 绗洓姝ワ細涓閿繍琛 +### 绗簲姝ワ細涓閿繍琛 ```bash python gpu_pipelines/context_vqa.py + ``` --- @@ -67,67 +91,46 @@ python gpu_pipelines/context_vqa.py ### 1. **杈撳叆鏁版嵁** -璇ユ祦绋嬬殑杈撳叆鏁版嵁涓昏鍖呭惈浠ヤ笅瀛楁锛 +璇ユ祦绋嬬殑杈撳叆鏁版嵁閫氳繃 `FileStorage` 杩涜绠$悊锛屾敮鎸佹柇鐐圭画浼犮 -* **image**锛氬浘鍍忔枃浠惰矾寰勶紙鏈湴璺緞鎴 URL锛夈 -* **id**锛堝彲閫夛級锛氭暟鎹殑鍞竴鏍囪瘑绗︺ +**杈撳叆鏁版嵁绀轰緥 (`sample_data.json`)**锛 -鏁版嵁閫氳繃 `FileStorage` 杩涜绠$悊锛屾敮鎸佹柇鐐圭画浼犮 - -**杈撳叆鏁版嵁绀轰緥**锛 +```json +[ + { + "image": ["./example_data/image_contextvqa/person.png"], + "conversation": [ + { + "from": "human", + "value": "Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. 
The question answer pairs should satisfy the following criteria.\n1: The question should refer to the image.\n2: The question should avoid mentioning the name of the object in the image.\n3: The question should be answered by reasoning over the Wikipedia article.\n4: The question should sound natural and concise.\n5: The answer should be extracted from the Wikipedia article.\n6: The answer should not be any objects in the image.\n7: The answer should be a single word or phrase and list all correct answers separated by commas.\n8: The answer should not contain 'and', 'or', rather you can split them into multiple answers." + } + ] + } +] -```jsonl -{"id": 1, "image": "./images/landmark.jpg"} -{"id": 2, "image": "./images/animal.jpg"} ``` -绀轰緥鍥剧墖鍙互鍦╜https://huggingface.co/datasets/OpenDCAI/dataflow-demo-image/tree/main/capsbench_images`涓壘鍒帮紱姝ゅ鎴戜滑宸茬粡鍚堟垚浜20w楂樿川閲廲ontext vqa鏁版嵁渚涚ぞ鍖轰娇鐢ㄤ綋楠岋紝鍦╤ttps://huggingface.co/datasets/OpenDCAI/dataflow-mm-context_vqa涓 -### 2. **鏍稿績绠楀瓙閫昏緫** -璇ユ祦姘寸嚎閫氳繃涓茶仈涓や釜鏍稿績绠楀瓙鏉ュ畬鎴愪换鍔★細 +### 2. **鏍稿績绠楀瓙閫昏緫** #### A. **FixPromptedVQAGenerator锛堜笂涓嬫枃鐢熸垚锛** -璇ョ畻瀛愯礋璐e埄鐢 VLM 妯″瀷锛屾牴鎹璁剧殑 Prompt 妯℃澘鐢熸垚鍘熷鏂囨湰銆 - -**鍔熻兘锛** - -* 鍩轰簬鍥惧儚鐢熸垚涓娈 Wikipedia 椋庢牸鐨勭鏅枃绔犮 -* 鍩轰簬鏂囩珷鐢熸垚闂瓟瀵广 -* **Prompt 绾︽潫**锛氶棶棰樻寚鍚戝浘鍍忎絾閬垮厤鐩存帴鎻愬強鐗╀綋鍚嶇О锛涚瓟妗堝繀椤绘潵鑷枃绔犲唴瀹逛笖闈炲浘鍍忎腑鐨勭墿浣擄紱绛旀绠缁冦 - -**妯″瀷鏈嶅姟閰嶇疆**锛 - -```python -self.serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, # 淇濇寔涓瀹氱殑鍒涢犳 - vllm_top_p=0.9, - vllm_max_tokens=512, -) - -``` +璇ョ畻瀛愯礋璐h皟鐢ㄦ湰鍦 VLM 妯″瀷锛屾牴鎹唴缃殑 Wikipedia 椋庢牸 Prompt 妯℃澘鐢熸垚鍘熷鏂囨湰銆 **绠楀瓙杩愯**锛 ```python self.vqa_generator.run( storage=self.storage.step(), - input_image_key="image", - output_answer_key="vqa" # 杈撳嚭鍘熷鐢熸垚鐨勬枃鏈 + input_conversation_key="conversation", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) ``` #### B. 
**WikiQARefiner锛堢粨鏋滆В鏋愶級** -璇ョ畻瀛愯礋璐e皢 VLM 鐢熸垚鐨勯潪缁撴瀯鍖栨枃鏈竻娲楀苟杞崲涓烘爣鍑嗘牸寮忋 - -**鍔熻兘锛** - -* 娓呮礂 Markdown 鏍煎紡鍜屽浣欑殑绌虹櫧瀛楃銆 -* 鍒嗙鏂囩珷鍐呭锛圕ontext锛夊拰闂瓟瀵癸紙QAs锛夈 +璇ョ畻瀛愯礋璐e皢 VLM 鐢熸垚鐨勯潪缁撴瀯鍖栨枃鏈竻娲楀苟杞崲涓烘爣鍑嗘牸寮忥紝鍒嗙鏂囩珷鍐呭锛圕ontext锛夊拰闂瓟瀵癸紙QAs锛夈 **绠楀瓙杩愯**锛 @@ -142,28 +145,20 @@ self.refiner.run( ### 3. **杈撳嚭鏁版嵁** -鏈缁堬紝娴佹按绾跨敓鎴愮殑杈撳嚭鏁版嵁灏嗗寘鍚互涓嬪唴瀹癸細 - -* **image**锛氬師濮嬪浘鍍忚矾寰勩 -* **vqa**锛歏LM 鐢熸垚鐨勫師濮嬫枃鏈紙涓棿缁撴灉锛夈 -* **context_vqa**锛氱粨鏋勫寲鐨勬渶缁堢粨鏋滐紝鍖呭惈 `context`锛堟枃绔狅級鍜 `qas`锛堥棶绛斿垪琛級銆 +鏈缁堢敓鎴愮殑缁撴瀯鍖栨暟鎹寘鍚 `context`锛堟枃绔狅級鍜 `qas`锛堥棶绛斿垪琛級銆 **杈撳嚭鏁版嵁绀轰緥**锛 ```json { "id": 1, - "image": "./images/landmark.jpg", + "image": ["./example_data/image_contextvqa/person.png"], "context_vqa": { - "context": "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris, France...", + "context": "Nightmare Alley is a 2021 American psychological thriller film...", "qas": [ { - "question": "In which city is this structure located?", - "answer": "Paris" - }, - { - "question": "What material is the tower primarily constructed from?", - "answer": "wrought-iron" + "question": "What genre does this film belong to?", + "answer": "Psychological thriller" } ] } @@ -175,45 +170,35 @@ self.refiner.run( ## 4. 
娴佹按绾跨ず渚 -浠ヤ笅鏄畬鏁寸殑 `ContextVQAPipeline` 浠g爜瀹炵幇锛屾敮鎸佸懡浠よ鍙傛暟璋冪敤銆 +浠ヤ笅鏄畬鏁寸殑 `ContextVQAPipeline` 浠g爜瀹炵幇銆 ```python import argparse from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm -from dataflow.operators.core_vision import FixPromptedVQAGenerator -from dataflow.operators.core_vision import WikiQARefiner +from dataflow.operators.core_vision import PromptedVQAGenerator, WikiQARefiner + class ContextVQAPipeline: """ 涓琛屽懡浠ゅ嵆鍙畬鎴愬浘鐗囨壒閲 ContextVQA Caption 鐢熸垚銆 """ - def __init__( - self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt", - device: str = "cuda", - first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", - cache_path: str = "./cache_local_skvqa", - file_name_prefix: str = "skvqa_cache_step", - cache_type: str = "jsonl", - ): + def __init__(self, llm_serving: LLMServingABC = None): # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", ) # ---------- 2. Serving ---------- - self.serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, + self.vlm_serving = LocalModelVLMServing_vllm( + hf_model_name_or_path="Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir="~/.cache/huggingface", + hf_local_dir="./ckpt", vllm_tensor_parallel_size=1, vllm_temperature=0.7, vllm_top_p=0.9, @@ -221,40 +206,25 @@ class ContextVQAPipeline: ) # ---------- 3. 
Operator ---------- - # 浣跨敤鐗瑰畾 Prompt 鐢熸垚 Wiki 椋庢牸鏂囩珷涓庨棶绛 - self.vqa_generator = FixPromptedVQAGenerator( - serving=self.serving, - system_prompt="You are a helpful assistant.", - user_prompt= """ - Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. The question answer pairs should satisfy the following criteria. - 1: The question should refer to the image. - 2: The question should avoid mentioning the name of the object in the image. - 3: The question should be answered by reasoning over the Wikipedia article. - 4: The question should sound natural and concise. - 5: The answer should be extracted from the Wikipedia article. - 6: The answer should not be any objects in the image. - 7: The answer should be a single word or phrase and list all correct answers separated by commas. - 8: The answer should not contain 'and', 'or', rather you can split them into multiple answers. - """ + self.vqa_generator = PromptedVQAGenerator( + serving=self.vlm_serving, + system_prompt= "You are a helpful assistant." 
) - # 缁撴灉娓呮礂涓庣粨鏋勫寲 self.refiner = WikiQARefiner() - # ------------------------------------------------------------------ # def forward(self): input_image_key = "image" output_answer_key = "vqa" output_wiki_key = "context_vqa" - # 姝ラ 1: 鐢熸垚鍘熷鏂囨湰 self.vqa_generator.run( storage=self.storage.step(), + input_conversation_key="conversation", input_image_key=input_image_key, output_answer_key=output_answer_key ) - # 姝ラ 2: 瑙f瀽涓虹粨鏋勫寲鏁版嵁 self.refiner.run( storage=self.storage.step(), input_key=output_answer_key, @@ -263,30 +233,7 @@ class ContextVQAPipeline: # ---------------------------- CLI 鍏ュ彛 -------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow") - - parser.add_argument("--model_path", default="Qwen/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt") - parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - - parser.add_argument("--images_file", default="dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="context_vqa") - parser.add_argument("--cache_type", default="jsonl") - - args = parser.parse_args() - - pipe = ContextVQAPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = ContextVQAPipeline() pipe.forward() -``` +``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md b/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md index d21ecff8..3ecb894a 100644 --- a/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md +++ 
b/docs/zh/notes/mm_guide/image_understanding/context_vqa_api.md @@ -1,13 +1,12 @@ --- -title: ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎锛圓PI鐗堬級 -icon: mdi:image-text -createTime: 2026/01/24 16:37:37 +title: ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎锛圓PI鐗堬級 +icon: mdi:image-text +createTime: 2026/01/24 16:37:37 permalink: /zh/mm_guide/contextvqa_api_pipeline/ --- - ## 1. 姒傝堪 -**ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎锛圓PI鐗堬級**鏃ㄥ湪浠庡浘鍍忓嚭鍙戯紝鑷姩鐢熸垚**鍏峰澶栭儴鐭ヨ瘑涓婁笅鏂囩殑瑙嗚闂瓟锛圕ontext-based VQA锛夋暟鎹**銆傝娴佹按绾块氳繃 API 褰㈠紡鐨勮瑙夎瑷妯″瀷锛圴LM锛夌敓鎴 Wikipedia 椋庢牸鏂囩珷鍙婇棶绛斿锛屽苟灏嗗叾瑙f瀽涓虹粨鏋勫寲鏁版嵁锛屼究浜庢瀯寤虹煡璇嗗瀷 VQA 涓庡妯℃ RAG 鏁版嵁闆嗐 +**ContextVQA 澶氭ā鎬侀棶绛旀暟鎹敓鎴愭祦姘寸嚎锛圓PI鐗堬級鏃ㄥ湪浠庡浘鍍忓嚭鍙戯紝鑷姩鐢熸垚鍏峰澶栭儴鐭ヨ瘑涓婁笅鏂囩殑瑙嗚闂瓟锛圕ontext-based VQA锛夋暟鎹**銆傝娴佹按绾块氳繃 API 褰㈠紡鐨勮瑙夎瑷妯″瀷锛圴LM锛夌敓鎴 Wikipedia 椋庢牸鏂囩珷鍙婇棶绛斿锛屽苟灏嗗叾瑙f瀽涓虹粨鏋勫寲鏁版嵁锛屼究浜庢瀯寤虹煡璇嗗瀷 VQA 涓庡妯℃ RAG 鏁版嵁闆嗐 鎴戜滑鏀寔浠ヤ笅搴旂敤鍦烘櫙锛 @@ -31,53 +30,71 @@ permalink: /zh/mm_guide/contextvqa_api_pipeline/ ```python import os -os.environ["DF_API_KEY"] = "your_api_key" +os.environ["DF_API_KEY"] = "sk-xxx" + ``` ### 绗簩姝ワ細鍒涘缓鏂扮殑 DataFlow 宸ヤ綔鏂囦欢澶 + ```bash mkdir run_dataflow cd run_dataflow + ``` ### 绗笁姝ワ細鍒濆鍖 DataFlow-MM + ```bash dataflowmm init + ``` + 杩欐椂浣犱細鐪嬪埌锛 + ```bash api_pipelines/image_contextvqa.py + ``` ### 绗洓姝ワ細涓嬭浇绀轰緥鏁版嵁 + ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data + ``` ### 绗簲姝ワ細閰嶇疆鍙傛暟 -鍦 `image_contextvqa.py` 涓厤缃 API 鏈嶅姟鍜岃緭鍏ユ暟鎹矾寰勶細 +鍦 `image_contextvqa.py` 涓厤缃 API 鏈嶅姟鍜岃緭鍏ユ暟鎹矾寰勶紙鏃犻渶 `argparse`锛岀洿鎺ュ湪浠g爜涓慨鏀归粯璁よ矾寰勶級锛 ```python self.vlm_serving = APIVLMServing_openai( - api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # 浠绘剰鍏煎openai 鏍煎紡鐨刟pi骞冲彴 + api_url="http://172.96.141.132:3001/v1", # 浠绘剰鍏煎openai 鏍煎紡鐨刟pi骞冲彴 key_name_of_api_key="DF_API_KEY", # 瀵瑰簲鐨刟pi key锛屽湪绗竴姝ヤ腑璁剧疆 - model_name="qwen3-vl-8b-instruct", + model_name="gpt-5-nano-2025-08-07", + image_io=None, + send_request_stream=False, max_workers=10, timeout=1800 ) + ``` ```python 
-parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json") -parser.add_argument("--cache_path", default="./cache_local") -parser.add_argument("--file_name_prefix", default="context_vqa") -parser.add_argument("--cache_type", default="json") +self.storage = FileStorage( + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", +) + ``` -### 绗簲姝ワ細涓閿繍琛 +### 绗叚姝ワ細涓閿繍琛 + ```bash python api_pipelines/image_contextvqa.py + ``` --- @@ -99,7 +116,7 @@ python api_pipelines/image_contextvqa.py ```json [ { - "image": ["./data/image_contextvqa/person.png"], + "image": ["./example_data/image_contextvqa/person.png"], "conversation": [ { "from": "human", @@ -128,12 +145,13 @@ python api_pipelines/image_contextvqa.py ```python self.vlm_serving = APIVLMServing_openai( - api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + api_url="http://172.96.141.132:3001/v1", key_name_of_api_key="DF_API_KEY", - model_name="qwen3-vl-8b-instruct", + model_name="gpt-5-nano-2025-08-07", max_workers=10, timeout=1800 ) + ``` **绠楀瓙杩愯**锛 @@ -145,6 +163,7 @@ self.vqa_generator.run( input_image_key="image", output_answer_key="vqa" ) + ``` #### B. **WikiQARefiner锛堢粨鏋滆В鏋愶級** @@ -164,6 +183,7 @@ self.refiner.run( input_key="vqa", output_key="context_vqa" ) + ``` ### 3. **杈撳嚭鏁版嵁** @@ -180,105 +200,66 @@ self.refiner.run( [ { "image":[ - ".\/data\/image_contextvqa\/person.png" + "./example_data/image_contextvqa/person.png" ], "conversation":[ { "from":"human", - "value":"Write a Wikipedia article related to this image without directly referring to the image. Then write question answer pairs. 
The question answer pairs should satisfy the following criteria.\n1: The question should refer to the image.\n2: The question should avoid mentioning the name of the object in the image.\n3: The question should be answered by reasoning over the Wikipedia article.\n4: The question should sound natural and concise.\n5: The answer should be extracted from the Wikipedia article.\n6: The answer should not be any objects in the image.\n7: The answer should be a single word or phrase and list all correct answers separated by commas.\n8: The answer should not contain 'and', 'or', rather you can split them into multiple answers." + "value":"Write a Wikipedia article related to this image..." } ], "context_vqa":{ - "context":"**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller film directed by Guillermo del Toro and written by del Toro and Kim Morgan. The film is based on the 1946 novel of the same name by William Lindsay Gresham. It follows the rise and fall of a street-smart con man who becomes involved with a carnival showman and his wife, eventually becoming embroiled in a dangerous world of deception and manipulation. The film stars Bradley Cooper as Stanton 鈥淪tan鈥 Carlisle, Cate Blanchett as Pearl Holland, Toni Collette as Molly, Willem Dafoe as Dr. John L. Thorne, Richard Jenkins as Mr. O鈥橫alley, Rooney Mara as Vera, Ron Perlman as The Duke, Mary Steenburgen as Mrs. Hargrove, and David Strathairn as Mr. Hargrove. The screenplay was adapted from the original novel by William Lindsay Gresham, which had previously been adapted into a 1947 film starring Tyrone Power. The film premiered at the Venice International Film Festival on September 1, 2021, and was released in the United States on December 17, 2021. It received critical acclaim for its direction, performances, and cinematography. The film鈥檚 score was composed by Benjamin Wallfisch, and it features a haunting atmosphere that complements its dark themes. 
*Nightmare Alley* explores themes of ambition, morality, and the corrupting nature of power. It was nominated for several awards, including Best Picture at the Academy Awards, and won Best Supporting Actor for Willem Dafoe. The film's production design and visual style were praised for their evocative portrayal of 1940s America.", + "context":"**Wikipedia Article:** *Nightmare Alley* is a 2021 American psychological thriller film directed by Guillermo del Toro...", "qas":[ { "question":"What genre does this film belong to?", "answer":"Psychological thriller" - }, - { - "question":"Who directed this film?", - "answer":"Guillermo del Toro" - }, - { - "question":"What year was this film released?", - "answer":"2021" - }, - { - "question":"Which actor plays the main character?", - "answer":"Bradley Cooper" - }, - { - "question":"What is the original source material for this film?", - "answer":"Novel" - }, - { - "question":"What festival did this film premiere at?", - "answer":"Venice International Film Festival" - }, - { - "question":"What award nomination did this film receive?", - "answer":"Best Picture" - }, - { - "question":"What theme does this film explore?", - "answer":"Ambition" - }, - { - "question":"What decade does the setting primarily reflect?", - "answer":"1940s" - }, - { - "question":"What is the title of the film鈥檚 score composer?", - "answer":"Benjamin Wallfisch" } ] } } ] + ``` --- ## 4. 
娴佹按绾跨ず渚 -浠ヤ笅鏄畬鏁寸殑 `ContextVQAPipeline` 浠g爜瀹炵幇锛屾敮鎸佸懡浠よ鍙傛暟璋冪敤銆 +浠ヤ笅鏄畬鏁寸殑 `ContextVQAPipeline` 浠g爜瀹炵幇銆 ```python import os -import argparse + +# 璁剧疆 API Key 鐜鍙橀噺 +os.environ["DF_API_KEY"] = "sk-xxx" + from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator from dataflow.operators.core_vision import WikiQARefiner -# 璁剧疆 API Key 鐜鍙橀噺 -os.environ["DF_API_KEY"] = "sk-xxxx" class ContextVQAPipeline: """ 涓琛屽懡浠ゅ嵆鍙畬鎴愬浘鐗囨壒閲 ContextVQA Caption 鐢熸垚銆 """ - def __init__( - self, - first_entry_file: str = "dataflow/example/image_to_text_pipeline/capsbench_captions.jsonl", - cache_path: str = "./cache_local_skvqa", - file_name_prefix: str = "skvqa_cache_step", - cache_type: str = "jsonl", - ): + def __init__(self, llm_serving: LLMServingABC = None): # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_contextvqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="context_vqa", + cache_type="json", ) # ---------- 2. Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", - key_name_of_api_key="DF_API_KEY", - model_name="qwen3-vl-8b-instruct", + api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key in environment variable + model_name="gpt-5-nano-2025-08-07", image_io=None, send_request_stream=False, max_workers=10, @@ -288,7 +269,7 @@ class ContextVQAPipeline: # ---------- 3. Operator ---------- self.vqa_generator = PromptedVQAGenerator( serving=self.vlm_serving, - system_prompt="You are a helpful assistant." + system_prompt= "You are a helpful assistant." 
) self.refiner = WikiQARefiner() @@ -314,20 +295,7 @@ class ContextVQAPipeline: # ---------------------------- CLI 鍏ュ彛 -------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch SKVQA caption generation with DataFlow") - - parser.add_argument("--images_file", default="data/image_contextvqa/sample_data.json") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="context_vqa") - parser.add_argument("--cache_type", default="json") - - args = parser.parse_args() - - pipe = ContextVQAPipeline( - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = ContextVQAPipeline() pipe.forward() -``` + +``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md b/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md index 9b8291b2..15a86c2e 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_caption_api.md @@ -43,7 +43,7 @@ dataflowmm init ### 绗笁姝ワ細涓嬭浇绀轰緥鏁版嵁 ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` @@ -65,7 +65,7 @@ self.vlm_serving = APIVLMServing_openai( ### 绗簲姝ワ細杩愯娴佹按绾 ```bash -python api_pipelines/image_caption.py --images_file data/image_caption/sample_data.json +python api_pipelines/image_caption.py ``` @@ -80,7 +80,7 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da ```json [ { - "image": ["./data/image_caption/person.png"], + "image": ["./example_data/image_caption/person.png"], "conversation": [ { "from": "human", @@ -107,7 +107,7 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da ```json [ { - 
"image": ["./data/image_caption/person.png"], + "image": ["./example_data/image_caption/person.png"], "conversation": [...], "caption": "Promotional poster for Nightmare Alley in grayscale, showing a man in a formal tuxedo with a white bow tie. The cast names run down the left side (Bradley Cooper, Cate Blanchett, Toni Collette, Willem Dafoe, and more), and the gold title Nightmare Alley appears near the bottom left with release text and Regal branding." } @@ -123,73 +123,63 @@ python api_pipelines/image_caption.py --images_file data/image_caption/sample_da ```python import os -import argparse + +# 璁剧疆 API Key 鐜鍙橀噺 +os.environ["DF_API_KEY"] = "sk-xxx" + from dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator -# 璁剧疆 API Key 鐜鍙橀噺 -os.environ["DF_API_KEY"] = "sk-xxx" class ImageCaptionPipeline: """ 涓琛屽懡浠ゅ嵆鍙畬鎴愬浘鐗囨壒閲 Caption 鐢熸垚銆 """ - def __init__( - self, - first_entry_file: str, - cache_path: str = "./cache_local", - file_name_prefix: str = "caption", - cache_type: str = "json", - ): - # ---------- 1. Storage: 绠$悊鏁版嵁璇诲彇涓庢柇鐐圭画浼 ---------- + def __init__(self, llm_serving: LLMServingABC = None): + + # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_caption/sample_data.json", + cache_path="./cache_local", + file_name_prefix="caption", + cache_type="json", ) - # ---------- 2. Serving: 閰嶇疆 API 鏈嶅姟 ---------- + # ---------- 2. 
Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", - key_name_of_api_key="DF_API_KEY", + api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4 model_name="gpt-5-nano-2025-08-07", + image_io=None, + send_request_stream=False, max_workers=10, timeout=1800 ) - # ---------- 3. Operator: 瀹氫箟鐢熸垚閫昏緫 ---------- + # ---------- 3. Operator ---------- self.vqa_generator = PromptedVQAGenerator( serving=self.vlm_serving, - system_prompt="You are a image caption generator. Your task is to generate a concise and informative caption for the given image content." + system_prompt= "You are a image caption generator. Your task is to generate a concise and informative caption for the given image content." ) + # ------------------------------------------------------------------ # def forward(self): - # 杩愯娴佹按绾 + input_image_key = "image" + output_answer_key = "caption" + self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", - input_image_key="image", - output_answer_key="caption", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) +# ---------------------------- CLI 鍏ュ彛 -------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch image caption generation with DataFlow") - parser.add_argument("--images_file", default="data/image_caption/sample_data.json") - parser.add_argument("--cache_path", default="./cache_local") - parser.add_argument("--file_name_prefix", default="caption") - parser.add_argument("--cache_type", default="json") - - args = parser.parse_args() - - pipe = ImageCaptionPipeline( - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) + pipe = 
ImageCaptionPipeline() pipe.forward() ``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md index 245a43db..2fe90ca0 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline.md @@ -1,12 +1,12 @@ --- -title: 鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾 +title: 鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾縍egionCap createTime: 2026/01/11 22:04:27 icon: mdi:image-text permalink: /zh/mm_guide/image_region_caption_pipeline/ --- ## 1. 姒傝堪 -**鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾 (Image Region Captioning Pipeline)** 鏃ㄥ湪涓哄浘鍍忎腑鐨勭壒瀹氬尯鍩熺敓鎴愯缁嗙殑鏂囨湰鎻忚堪銆傝娴佹按绾跨粨鍚堜簡璁$畻鏈鸿瑙夌殑瀹氫綅鑳藉姏涓庡妯℃佸ぇ妯″瀷鐨勭悊瑙h兘鍔涳紝鑳藉璇嗗埆鍥惧儚涓殑鎰熷叴瓒e尯鍩燂紙ROI锛夛紝骞朵负鍏剁敓鎴愮簿纭殑鑷劧璇█鏍囨敞銆 +**鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾 (Image Region Caption Pipeline)** 鏃ㄥ湪涓哄浘鍍忎腑鐨勭壒瀹氬尯鍩熺敓鎴愯缁嗙殑鏂囨湰鎻忚堪銆傝娴佹按绾跨粨鍚堜簡璁$畻鏈鸿瑙夌殑瀹氫綅鑳藉姏涓庡妯℃佸ぇ妯″瀷鐨勭悊瑙h兘鍔涳紝鑳藉璇嗗埆鍥惧儚涓殑鎰熷叴瓒e尯鍩燂紙ROI锛夛紝骞朵负鍏剁敓鎴愮簿纭殑鑷劧璇█鏍囨敞銆 璇ユ祦姘寸嚎鏀寔澶勭悊**棰勫畾涔夎竟鐣屾 (Bounding Box)** 鏁版嵁锛屽苟灏嗗叾鍙鍖栧悗杈撳叆 VLM 杩涜鎻忚堪鐢熸垚銆 @@ -26,37 +26,92 @@ permalink: /zh/mm_guide/image_region_caption_pipeline/ ## 2. 
蹇熷紑濮 -### 绗竴姝ワ細鍑嗗宸ヤ綔鐩綍 - +### 绗竴姝ワ細鍒涘缓鏂扮殑 DataFlow 宸ヤ綔鏂囦欢澶 ```bash -mkdir run_region_caption -cd run_region_caption - +mkdir run_dataflow +cd run_dataflow ``` -### 绗簩姝ワ細鍑嗗鑴氭湰 - -灏嗕笅鏂団滄祦姘寸嚎绀轰緥鈥濅腑鐨勪唬鐮佷繚瀛樹负 `region_caption_pipeline.py`銆 - -### 绗笁姝ワ細閰嶇疆杩愯鍙傛暟 - -纭繚杈撳叆鏂囦欢锛坖sonl锛夊寘鍚 `image` 鍜 `bbox` 瀛楁銆 +### 绗簩姝ワ細鍒濆鍖 DataFlow-MM +```bash +dataflowmm init +``` +杩欐椂浣犱細鐪嬪埌锛 +```bash +gpu_pipelines/image_region_caption_pipeline.py +``` +### 绗笁姝ワ細涓嬭浇绀轰緥鏁版嵁 ```bash -# 瀹夎渚濊禆 -pip install open-dataflow vllm +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` +### 绗洓姝ワ細閰嶇疆鍙傛暟 +```python + def __init__( + self, + model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir: str = "~/.cache/huggingface", + download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct", + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): ``` +> **鈿狅笍 妯″瀷璺緞閰嶇疆鐨勯噸瑕佹彁绀猴紙浠 `Qwen2.5-VL-3B-Instruct` 涓轰緥锛夛細** +> +> * **濡傛灉鎮ㄥ凡缁忎笅杞藉ソ浜嗘ā鍨嬫枃浠**锛氳灏 `model_path` 淇敼涓烘偍鐨勬湰鍦版ā鍨嬭矾寰勩**鍔″繀淇濊瘉**妯″瀷瀛樻斁鐨勬渶缁堟枃浠跺す鍚嶇О绮剧‘涓 `Qwen2.5-VL-3B-Instruct`锛屽惁鍒欏簳灞傝В鏋愭椂灏嗘棤娉曟纭尮閰嶅拰璇嗗埆璇ユā鍨嬨 +> * **濡傛灉鎮ㄨ繕鏈笅杞芥ā鍨嬶紙闇瑕佽嚜鍔ㄤ笅杞斤級**锛氳涓瀹氳鎸囧畾 `download_dir` 鍙傛暟锛屽苟涓旇鐩綍璺緞**蹇呴』浠** `Qwen2.5-VL-3B-Instruct` **缁撳熬**锛堟濡傞粯璁ゅ弬鏁版墍绀猴級锛屽惁鍒欎笅杞藉畬鎴愬悗鍚屾牱浼氬鑷存鏋舵棤娉曡瘑鍒ā鍨嬨 -### 绗洓姝ワ細涓閿繍琛 +### 绗簲姝ワ細涓閿繍琛 ```bash -python region_caption_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-3B-Instruct" \ - --first_entry_file "data/region_captions.jsonl" \ - --output_jsonl_path "data/results.jsonl" - +cd gpu_pipelines +python image_region_caption_pipeline.py ``` +> **馃洜锔 甯歌闂鎺掓煡 (Troubleshooting)** +> +> **闂 1锛** 
濡傛灉閬囧埌绫讳技濡備笅鐨勫姩鎬侀摼鎺ュ簱鍐茬獊鎶ラ敊锛 +> `ImportError: .../miniconda3/envs/Dataflow-MM/lib/python3.12/site-packages/torch/lib/../../nvidia/cusparse/lib/libcusparse.so.12: undefined symbol: __nvJitLinkComplete_12_4, version libnvJitLink.so.12` +> +> **瑙e喅鏂规硶锛** 杩欓氬父鏄幆澧冨彉閲忓共鎵板鑷寸殑銆傝鍦ㄨ繍琛屽懡浠ゅ墠娓呯┖ `LD_LIBRARY_PATH`锛 +> ```bash +> LD_LIBRARY_PATH="" python image_region_caption_pipeline.py +> ``` +> +> **闂 2锛** 濡傛灉鎮ㄤ娇鐢ㄧ殑鏄 **Qwen 绯诲垪妯″瀷**锛屽苟涓旈亣鍒颁互涓嬫姤閿欙細 +> `KeyError: "Missing required keys in rope_scaling for 'rope_type'='None': {'rope_type'}"` +> +> **瑙e喅鏂规硶锛** 鎵撳紑妯″瀷鏂囦欢澶逛笅鐨 `config.json` 鏂囦欢锛屾壘鍒 `rope_scaling` 閰嶇疆鍧楋紝灏 `"type"` 瀛楁淇敼涓 `"rope_type"` 鍗冲彲銆 +> +> **淇敼鍓嶏細** +> ```json +> "rope_scaling": { +> "type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` +> +> **淇敼鍚庯細** +> ```json +> "rope_scaling": { +> "rope_type": "mrope", +> "mrope_section": [ +> 16, +> 24, +> 24 +> ] +> } +> ``` --- @@ -64,16 +119,16 @@ python region_caption_pipeline.py \ ### 1. **杈撳叆鏁版嵁** -杈撳叆鏁版嵁閫氬父鍖呭惈鍥惧儚璺緞鍜屽搴旂殑杈圭晫妗嗗垪琛細 +杈撳叆鏁版嵁閫氬父鍖呭惈鍥惧儚璺緞鍜屽搴旂殑杈圭晫妗嗗垪琛紙鍙夛級锛 * **image**锛氬浘鍍忔枃浠惰矾寰勩 -* **bbox**锛氳竟鐣屾鍧愭爣鍒楄〃锛岄氬父鏍煎紡涓 `[[x, y, w, h], ...]` 鎴 `[[x1, y1, x2, y2], ...]`锛堝彇鍐充簬鍏蜂綋閰嶇疆锛夈 +* **bbox**锛氳竟鐣屾鍧愭爣鍒楄〃锛岄氬父鏍煎紡涓 `[[x, y, w, h], ...]`銆 **杈撳叆鏁版嵁绀轰緥**锛 ```json { - "image": "./images/kitchen.jpg", + "image": "../example_data/image_region_caption/20.jpg", "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]] } @@ -89,34 +144,43 @@ python region_caption_pipeline.py \ * **杈撳叆**锛氬師濮嬪浘鍍 + `bbox` 鏁版嵁銆 * **鍔熻兘**锛氳鍙栬竟鐣屾锛屽皢鍏剁粯鍒跺湪鍥惧儚涓婏紙鍙鍖栵級锛屾垨鑰呮牴鎹厤缃繘琛岄澶勭悊銆 -* **閰嶇疆 (`ExistingBBoxDataGenConfig`)**锛氭帶鍒舵渶澶ф鏁伴噺 (`max_boxes`) 鍜屽彲瑙嗗寲閫夐」 (`draw_visualization`)銆 -* **杈撳嚭**锛氱敓鎴愬甫鏈夎瑙夋爣璁扮殑鏂板浘鍍忚矾寰勶紙`image_with_bbox`锛夈 +* **閰嶇疆 (`ExistingBBoxDataGenConfig`)**锛氭帶鍒舵渶澶ф鏁伴噺 (`max_boxes`)鍜岃緭鍏ヨ緭鍑鸿矾寰勩 +* **杈撳嚭**锛氬甫鏈夎瑙夋爣璁扮殑鏂板浘鍍忕殑json鏂囦欢杈撳嚭璺緞銆 #### B. 
**PromptedVQAGenerator锛圴QA 鐢熸垚鍣級** 璇ョ畻瀛愯礋璐e埄鐢 VLM 鐢熸垚鏂囨湰銆 -* **杈撳叆**锛氫笂涓姝ョ敓鎴愮殑 `image_with_bbox`銆 +* **杈撳叆**锛氫笂涓姝ョ殑杈撳嚭銆 * **鍔熻兘**锛歏LM 鎺ユ敹甯︽湁鏍囪鐨勫浘鍍忥紝鏍规嵁鎻愮ず鐢熸垚瀵瑰簲鍖哄煙鐨勬弿杩般 * **杈撳嚭**锛氬尯鍩熸弿杩版枃鏈 ### 3. **杈撳嚭鏁版嵁** 鏈缁堢敓鎴愮殑杈撳嚭鏁版嵁灏嗗寘鍚鐞嗗悗鐨勫浘鍍忚矾寰勫拰鐢熸垚鐨勬弿杩帮細 - +* **image**锛氳緭鍏ョ殑鍥剧墖璺緞銆 +* **type**锛氭槸鍚︾粰瀹氳竟鐣屾銆 +* **bbox**锛氳竟鐣屾鍙傛暟銆 +* **normalized_bbox**锛氭爣鍑嗗寲鍚庣殑杈圭晫妗嗗弬鏁般 +* **result_file**锛氱粨鏋滆緭鍑鸿矾寰勩 * **image_with_bbox**锛氱敾浜嗘鐨勫浘鍍忚矾寰勩 -* **mdvp_record**锛氱敓鎴愮殑鍖哄煙鎻忚堪鍒楄〃銆 +* **valid_bboxes_num**锛氭湁鏁堣竟鐣屾鏁伴噺銆 +* **prompt**锛歏LM鎺ユ敹鐨勬彁绀鸿瘝銆 +* **answer**锛氱敓鎴愮殑鍖哄煙鎻忚堪鍒楄〃銆 **杈撳嚭鏁版嵁绀轰緥**锛 ```json { - "image": "./images/kitchen.jpg", - "image_with_bbox": "./images/kitchen_visualized.jpg", - "mdvp_record": [ - "A wooden chair located near the table.", - "A white refrigerator in the background." - ] + "image":"..\/example_data\/image_region_caption\/20.png", + "type":"with_bbox", + "bbox":[[196,104,310,495]], + "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]], + "result_file":"..\/cache\/image_region_caption", + "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num":1, + "prompt":"Describe the content of each marked region in the image. There are 1 regions: to .", + "answer":"In , the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere." } ``` @@ -125,10 +189,9 @@ python region_caption_pipeline.py \ ## 4. 
娴佹按绾跨ず渚 -浠ヤ笅鏄畬鏁寸殑 `ImageRegionCaptioningPipeline` 浠g爜瀹炵幇銆 +浠ヤ笅鏄畬鏁寸殑 `ImageRegionCaptionPipeline` 浠g爜瀹炵幇銆 ```python -import argparse from dataflow.serving.local_model_vlm_serving import LocalModelVLMServing_vllm from dataflow.operators.core_vision.generate.image_bbox_generator import ( ImageBboxGenerator, @@ -140,57 +203,40 @@ from dataflow.operators.core_vision.generate.prompted_vqa_generator import ( from dataflow.utils.storage import FileStorage -class ImageRegionCaptioningPipeline: +class ImageRegionCaptionPipeline: def __init__( self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt/models", - device: str = "cuda", - # Storage & Paths - first_entry_file: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl", - cache_path: str = "./dataflow/example/cache", + model_path: str = "Qwen/Qwen2.5-VL-3B-Instruct", + hf_cache_dir: str = "~/.cache/huggingface", + download_dir: str = "../ckpt/models/Qwen2.5-VL-3B-Instruct", + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", file_name_prefix: str = "region_caption", cache_type: str = "jsonl", - # Keys input_image_key: str = "image", input_bbox_key: str = "bbox", - image_with_bbox_path: str = 'image_with_bbox', # Key for intermediate image - output_key: str = "mdvp_record", - # BBox Config max_boxes: int = 10, - input_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions.jsonl", - output_jsonl_path: str = "./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl", - output_image_with_bbox_path: str = "./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl", - draw_visualization: bool = True + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", ): - # 1. 
鍒濆鍖栧瓨鍌 (Storage) - # 鐢ㄤ簬 BBox 鐢熸垚闃舵鐨勫瓨鍌 self.bbox_storage = FileStorage( first_entry_file_name=first_entry_file, cache_path=cache_path, file_name_prefix=file_name_prefix, cache_type=cache_type ) - - # 2. 閰嶇疆 BBox 鐢熸垚鍣 + self.cfg = ExistingBBoxDataGenConfig( max_boxes=max_boxes, - input_jsonl_path=input_jsonl_path, + input_jsonl_path=first_entry_file, output_jsonl_path=output_image_with_bbox_path, ) - # 3. 鍒濆鍖 Caption 闃舵鐨勫瓨鍌 - # 娉ㄦ剰锛氳繖閲屾帴缁簡涓婁竴姝ョ殑杈撳嚭璺緞 self.caption_storage = FileStorage( first_entry_file_name=output_image_with_bbox_path, cache_path=cache_path, file_name_prefix=file_name_prefix, cache_type=cache_type ) - - # 4. 鍒濆鍖 VLM 鏈嶅姟 self.serving = LocalModelVLMServing_vllm( hf_model_name_or_path=model_path, hf_cache_dir=hf_cache_dir, @@ -200,76 +246,28 @@ class ImageRegionCaptioningPipeline: vllm_top_p=0.9, vllm_max_tokens=512, ) - - # 5. 鍒濆鍖栨牳蹇冪畻瀛 self.bbox_generator = ImageBboxGenerator(config=self.cfg) - self.caption_generator = PromptedVQAGenerator(serving=self.serving) - + self.caption_generator = PromptedVQAGenerator(serving=self.serving,) self.input_image_key = input_image_key self.input_bbox_key = input_bbox_key - self.output_key = output_key - self.image_with_bbox_path = image_with_bbox_path + self.bbox_record=None def forward(self): - # 姝ラ 1: 鐢熸垚甯 BBox 鍙鍖栫殑鍥惧儚 - print(">>> [Pipeline] Step 1: Processing BBoxes and Visualizing...") self.bbox_generator.run( storage=self.bbox_storage.step(), input_image_key=self.input_image_key, input_bbox_key=self.input_bbox_key, - output_key=self.image_with_bbox_path, ) - # 姝ラ 2: 鍩轰簬鍙鍖栧浘鍍忕敓鎴愭弿杩 - print(">>> [Pipeline] Step 2: Generating Region Captions...") self.caption_generator.run( storage=self.caption_storage.step(), - input_image_key='image_with_bbox' # 浣跨敤涓婁竴姝ョ敓鎴愮殑甯︽鍥惧儚 + input_image_key='image_with_bbox', + input_prompt_key='prompt' ) if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Image region captioning with DataFlow") - - parser.add_argument("--model_path", 
default="/data0/happykeyan/Models/Qwen2.5-VL-3B-Instruct") - parser.add_argument("--hf_cache_dir", default="~/.cache/huggingface") - parser.add_argument("--download_dir", default="./ckpt/models") - parser.add_argument("--device", choices=["cuda", "cpu", "mps"], default="cuda") - - parser.add_argument("--first_entry_file", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl") - parser.add_argument("--cache_path", default="./dataflow/example/cache") - parser.add_argument("--file_name_prefix", default="region_caption") - parser.add_argument("--cache_type", default="jsonl") - - parser.add_argument("--input_image_key", default="image") - parser.add_argument("--input_bbox_key", default="bbox") - parser.add_argument("--output_key", default="mdvp_record") - - parser.add_argument("--max_boxes", type=int, default=10) - parser.add_argument("--input_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions.jsonl") - parser.add_argument("--output_jsonl_path", default="./dataflow/example/image_to_text_pipeline/region_captions_results_v1.jsonl") - parser.add_argument("--output_image_with_bbox_path", default="./dataflow/example/image_to_text_pipeline/image_with_bbox_results_v1.jsonl") - parser.add_argument("--draw_visualization", type=bool, default=True) - - args = parser.parse_args() - - pipe = ImageRegionCaptioningPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - device=args.device, - first_entry_file=args.first_entry_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - input_image_key=args.input_image_key, - input_bbox_key=args.input_bbox_key, - output_key=args.output_key, - max_boxes=args.max_boxes, - input_jsonl_path=args.input_jsonl_path, - output_image_with_bbox_path=args.output_image_with_bbox_path, - draw_visualization=args.draw_visualization - ) + pipe = ImageRegionCaptionPipeline() pipe.forward() ``` diff --git 
a/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md new file mode 100644 index 00000000..450a8c12 --- /dev/null +++ b/docs/zh/notes/mm_guide/image_understanding/image_region_caption_pipeline_api.md @@ -0,0 +1,249 @@ +--- +title: 鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾縍egionCap锛圓PI鐗堬級 +createTime: 2026/01/11 22:04:27 +icon: mdi:image-text +permalink: /zh/mm_guide/image_region_caption_pipeline_api/ +--- +## 1. 姒傝堪 + +**鍥惧儚鍖哄煙鎻忚堪鐢熸垚娴佹按绾匡紙API鐗堬級** 鏃ㄥ湪涓哄浘鍍忎腑鐨勭壒瀹氬尯鍩熺敓鎴愯缁嗙殑鏂囨湰鎻忚堪銆傝娴佹按绾跨粨鍚堜簡璁$畻鏈鸿瑙夌殑瀹氫綅鑳藉姏涓庡妯℃佸ぇ妯″瀷鐨勭悊瑙h兘鍔涳紝鑳藉璇嗗埆鍥惧儚涓殑鎰熷叴瓒e尯鍩燂紙ROI锛夛紝骞朵负鍏剁敓鎴愮簿纭殑鑷劧璇█鏍囨敞銆 + +璇ユ祦姘寸嚎鏀寔澶勭悊**棰勫畾涔夎竟鐣屾 (Bounding Box)** 鏁版嵁锛屽苟灏嗗叾鍙鍖栧悗杈撳叆 VLM 杩涜鎻忚堪鐢熸垚銆 + +鎴戜滑鏀寔浠ヤ笅搴旂敤鍦烘櫙锛 + +* **瀵嗛泦鎻忚堪鐢熸垚 (Dense Captioning)**锛氫负鍥惧儚涓殑澶氫釜鐗╀綋鍒嗗埆鐢熸垚鎻忚堪銆 +* **缁嗙矑搴﹀浘鍍忕悊瑙**锛氬叧娉ㄥ浘鍍忕殑灞閮ㄧ粏鑺傝岄潪鍏ㄥ眬鎻忚堪銆 +* **鏁版嵁闆嗗寮**锛氭瀯寤哄甫瀹氫綅淇℃伅鐨勫浘鏂囧鏁版嵁闆嗐 + +娴佹按绾跨殑涓昏娴佺▼鍖呮嫭锛 + +1. **鏁版嵁鍔犺浇**锛氳鍙栧寘鍚浘鍍忓拰杈圭晫妗嗕俊鎭殑婧愭暟鎹 +2. **杈圭晫妗嗗鐞嗕笌鍙鍖**锛氬鐞嗚緭鍏ョ殑杈圭晫妗嗭紝鐢熸垚甯︽湁鍙鍖栨爣璁帮紙濡傜敾妗嗭級鐨勫浘鍍忕増鏈 +3. **鍖哄煙鎻忚堪鐢熸垚**锛氬埄鐢 VLM 閽堝鏍囪鍚庣殑鍥惧儚鎴栫壒瀹氬尯鍩熺敓鎴愭枃鏈弿杩般 + +--- + +## 2. 
蹇熷紑濮 + +### 绗竴姝ワ細鍒涘缓鏂扮殑 DataFlow 宸ヤ綔鏂囦欢澶 +```bash +mkdir run_dataflow +cd run_dataflow +``` + +### 绗簩姝ワ細鍒濆鍖 DataFlow-MM +```bash +dataflowmm init +``` +杩欐椂浣犱細鐪嬪埌锛 +```bash +api_pipelines/image_region_caption_api_pipeline.py +``` + +### 绗笁姝ワ細涓嬭浇绀轰緥鏁版嵁 +```bash +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir ./example_data +``` + +### 绗洓姝ワ細閰嶇疆 API Key + +鍦 `api_pipelines/image_region_caption_api_pipeline.py` 涓缃 API Key 鐜鍙橀噺锛 + +```python +import os +os.environ["DF_API_KEY"] = "your_api_key" +``` + +### 绗簲姝ワ細閰嶇疆鍙傛暟 + +鍦 `api_pipelines/image_region_caption_api_pipeline.py` 涓厤缃 API 鏈嶅姟鍜岃緭鍏ユ暟鎹矾寰勶細 + +```python + def __init__( + self, + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): +``` + +```python +self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) +``` + +### 绗叚姝ワ細涓閿繍琛 +```bash +cd api_pipelines +python image_region_caption_api_pipeline.py +``` + +--- + +## 3. 鏁版嵁娴佷笌娴佹按绾块昏緫 + +### 1. **杈撳叆鏁版嵁** + +杈撳叆鏁版嵁閫氬父鍖呭惈鍥惧儚璺緞鍜屽搴旂殑杈圭晫妗嗗垪琛紙鍙夛級锛 + +* **image**锛氬浘鍍忔枃浠惰矾寰勩 +* **bbox**锛氳竟鐣屾鍧愭爣鍒楄〃锛岄氬父鏍煎紡涓 `[[x, y, w, h], ...]` 銆 + +**杈撳叆鏁版嵁绀轰緥**锛 + +```json +{ + "image": "../example_data/image_region_caption/20.jpg", + "bbox": [[196, 104, 310, 495], [50, 60, 100, 200]] +} + +``` + +### 2. **鏍稿績绠楀瓙閫昏緫** + +璇ユ祦姘寸嚎閫氳繃涓茶仈涓や釜鏍稿績绠楀瓙鏉ュ畬鎴愪换鍔★細 + +#### A. 
**ImageBboxGenerator锛堣竟鐣屾澶勭悊鍣級** + +璇ョ畻瀛愯礋璐e鐞嗚瑙夊眰闈㈢殑浠诲姟銆 + +* **杈撳叆**锛氬師濮嬪浘鍍 + `bbox` 鏁版嵁銆 +* **鍔熻兘**锛氳鍙栬竟鐣屾锛屽皢鍏剁粯鍒跺湪鍥惧儚涓婏紙鍙鍖栵級锛屾垨鑰呮牴鎹厤缃繘琛岄澶勭悊銆 +* **閰嶇疆 (`ExistingBBoxDataGenConfig`)**锛氭帶鍒舵渶澶ф鏁伴噺 (`max_boxes`)鍜岃緭鍏ヨ緭鍑鸿矾寰勩 +* **杈撳嚭**锛氬甫鏈夎瑙夋爣璁扮殑鏂板浘鍍忕殑json鏂囦欢杈撳嚭璺緞銆 + +#### B. **PromptedVQAGenerator锛圴QA 鐢熸垚鍣級** + +璇ョ畻瀛愯礋璐e埄鐢 VLM 鐢熸垚鏂囨湰銆 + +* **杈撳叆**锛氫笂涓姝ョ殑杈撳嚭銆 +* **鍔熻兘**锛歏LM 鎺ユ敹甯︽湁鏍囪鐨勫浘鍍忥紝鏍规嵁鎻愮ず鐢熸垚瀵瑰簲鍖哄煙鐨勬弿杩般 +* **杈撳嚭**锛氬尯鍩熸弿杩版枃鏈 + +### 3. **杈撳嚭鏁版嵁** + +鏈缁堢敓鎴愮殑杈撳嚭鏁版嵁灏嗗寘鍚鐞嗗悗鐨勫浘鍍忚矾寰勫拰鐢熸垚鐨勬弿杩帮細 +* **image**锛氳緭鍏ョ殑鍥剧墖璺緞銆 +* **type**锛氭槸鍚︾粰瀹氳竟鐣屾銆 +* **bbox**锛氳竟鐣屾鍙傛暟銆 +* **normalized_bbox**锛氭爣鍑嗗寲鍚庣殑杈圭晫妗嗗弬鏁般 +* **result_file**锛氱粨鏋滆緭鍑鸿矾寰勩 +* **image_with_bbox**锛氱敾浜嗘鐨勫浘鍍忚矾寰勩 +* **valid_bboxes_num**锛氭湁鏁堣竟鐣屾鏁伴噺銆 +* **prompt**锛歏LM鎺ユ敹鐨勬彁绀鸿瘝銆 +* **answer**锛氱敓鎴愮殑鍖哄煙鎻忚堪鍒楄〃銆 + +**杈撳嚭鏁版嵁绀轰緥**锛 + +```json +{ + "image":"..\/example_data\/image_region_caption\/20.png", + "type":"with_bbox", + "bbox":[[196,104,310,495]], + "normalized_bbox":[[0.128,0.125,0.329,0.72],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0],[0.0,0.0,0.0,0.0]], + "result_file":"..\/cache\/image_region_caption", + "image_with_bbox":"..\/cache\/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num":1, + "prompt":"Describe the content of each marked region in the image. There are 1 regions: to .", + "answer":"In , the focus is on the lower half of a person wearing high-heeled shoes with an ornate design. The setting appears to be a kitchen, with items such as a table with floral tablecloth, a broom, and various kitchen utensils visible in the background. The legs of another person can also be seen, indicating there may be interaction happening in this domestic space. The overall scene captures a domestic and casual atmosphere." +} + +``` + +--- + +## 4. 
流水线示例 + +以下是完整的 `ImageRegionCaptionPipeline` 代码实现。 + +```python +import os +os.environ["DF_API_KEY"] = "sk-xxxx" + +from dataflow.operators.core_vision.generate.image_bbox_generator import ( + ImageBboxGenerator, + ExistingBBoxDataGenConfig +) +from dataflow.operators.core_vision.generate.prompted_vqa_generator import ( + PromptedVQAGenerator +) +from dataflow.utils.storage import FileStorage + +from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai +class ImageRegionCaptionPipeline: + def __init__( + self, + first_entry_file: str = "../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path: str = "../cache/image_region_caption", + file_name_prefix: str = "region_caption", + cache_type: str = "jsonl", + input_image_key: str = "image", + input_bbox_key: str = "bbox", + max_boxes: int = 10, + output_image_with_bbox_path: str = "../cache/image_region_caption/image_with_bbox_result.jsonl", + ): + self.bbox_storage = FileStorage( + first_entry_file_name=first_entry_file, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type + ) + + self.cfg = ExistingBBoxDataGenConfig( + max_boxes=max_boxes, + input_jsonl_path=first_entry_file, + output_jsonl_path=output_image_with_bbox_path, + ) + + self.caption_storage = FileStorage( + first_entry_file_name=output_image_with_bbox_path, + cache_path=cache_path, + file_name_prefix=file_name_prefix, + cache_type=cache_type + ) + self.vlm_serving = APIVLMServing_openai( + api_url="https://dashscope.aliyuncs.com/compatible-mode/v1", # Any API platform compatible with OpenAI format + model_name="gpt-4o-mini", + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 + ) + self.bbox_generator = ImageBboxGenerator(config=self.cfg) + self.caption_generator = PromptedVQAGenerator(serving=self.vlm_serving,system_prompt="You are a helpful assistant.") + self.input_image_key = input_image_key + self.input_bbox_key = input_bbox_key +
self.bbox_record=None + + def forward(self): + self.bbox_generator.run( + storage=self.bbox_storage.step(), + input_image_key=self.input_image_key, + input_bbox_key=self.input_bbox_key + ) + + self.caption_generator.run( + storage=self.caption_storage.step(), + input_image_key='image_with_bbox', + input_prompt_key='prompt' + ) + + +if __name__ == "__main__": + pipe = ImageRegionCaptionPipeline() + pipe.forward() + +``` + diff --git a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md index 49ac7714..67120d2a 100644 --- a/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md +++ b/docs/zh/notes/mm_guide/image_understanding/image_vqa_api.md @@ -44,7 +44,7 @@ dataflowmm init ### 第三步：下载示例数据 ```bash -huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir data +huggingface-cli download --repo-type dataset OpenDCAI/dataflow-demo-image --local-dir example_data ``` @@ -65,7 +65,7 @@ self.vlm_serving = APIVLMServing_openai( ### 第五步：执行流水线 ```bash -python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json +python api_pipelines/image_vqa.py ``` @@ -80,7 +80,7 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json ```json [ { - "image": ["./data/image_vqa/person.png"], + "image": ["./example_data/image_vqa/person.png"], "conversation": [ { "from": "human", @@ -107,7 +107,7 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json ```json [ { - "image": ["./data/image_vqa/person.png"], + "image": ["./example_data/image_vqa/person.png"], "vqa": "- Q: What is the title of the movie shown on the poster?\n A: Nightmare Alley\n\n- Q: What color is the film’s title text?\n A: Gold" } ] @@ -120,63 +120,63 @@ python api_pipelines/image_vqa.py --images_file data/image_vqa/sample_data.json ```python import os -import argparse + +# 设置 API Key 环境变量 +os.environ["DF_API_KEY"] = "sk-xxx" + from
dataflow.utils.storage import FileStorage +from dataflow.core import LLMServingABC from dataflow.serving.api_vlm_serving_openai import APIVLMServing_openai from dataflow.operators.core_vision import PromptedVQAGenerator -# 閰嶇疆 API 鐜 -os.environ["DF_API_KEY"] = "sk-xxx" class ImageVQAPipeline: """ - 涓閿紡鍥剧墖鎵归噺 VQA 鐢熸垚娴佹按绾 + 涓琛屽懡浠ゅ嵆鍙畬鎴愬浘鐗囨壒閲 VQA 鐢熸垚銆 """ - def __init__( - self, - first_entry_file: str, - cache_path: str = "./cache_local_vqa", - file_name_prefix: str = "vqa_task", - cache_type: str = "json", - ): - # 1. 鍒濆鍖栧瓨鍌細鏀寔鏂偣缁紶涓庡鏍煎紡瀵煎嚭 + def __init__(self, llm_serving: LLMServingABC = None): + + # ---------- 1. Storage ---------- self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, + first_entry_file_name="./example_data/image_vqa/sample_data.json", + cache_path="./cache_local", + file_name_prefix="qa", + cache_type="json", ) - # 2. 閰嶇疆 VLM API 鏈嶅姟 + # ---------- 2. Serving ---------- self.vlm_serving = APIVLMServing_openai( - api_url="http://172.96.141.132:3001/v1", - key_name_of_api_key="DF_API_KEY", + api_url="http://172.96.141.132:3001/v1", # Any API platform compatible with OpenAI format + key_name_of_api_key="DF_API_KEY", # Set the API key for the corresponding platform in the environment variable or line 4 model_name="gpt-5-nano-2025-08-07", - max_workers=10 + image_io=None, + send_request_stream=False, + max_workers=10, + timeout=1800 ) - # 3. 鍒濆鍖 VQA 绠楀瓙 + # ---------- 3. Operator ---------- self.vqa_generator = PromptedVQAGenerator( serving=self.vlm_serving, - system_prompt="You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content." + system_prompt= "You are a image question-answer generator. Your task is to generate a question-answer pair for the given image content." 
) + # ------------------------------------------------------------------ # def forward(self): - # 鎵ц鎺ㄧ悊浠诲姟 + input_image_key = "image" + output_answer_key = "vqa" + self.vqa_generator.run( storage=self.storage.step(), input_conversation_key="conversation", - input_image_key="image", - output_answer_key="vqa", + input_image_key=input_image_key, + output_answer_key=output_answer_key, ) +# ---------------------------- CLI 鍏ュ彛 -------------------------------- # if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch VQA generation") - parser.add_argument("--images_file", default="data/image_vqa/sample_data.json") - args = parser.parse_args() - - pipe = ImageVQAPipeline(first_entry_file=args.images_file) + pipe = ImageVQAPipeline() pipe.forward() ``` \ No newline at end of file diff --git a/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md b/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md deleted file mode 100644 index 1d423513..00000000 --- a/docs/zh/notes/mm_guide/video_understanding/multirole_videoqa_pipeline.md +++ /dev/null @@ -1,288 +0,0 @@ ---- -title: 澶氳鑹茶棰戦棶绛旂敓鎴愭祦姘寸嚎 -createTime: 2026/01/11 22:15:28 -icon: mdi:image-text -permalink: /zh/mm_guide/multirole_videoqa_pipeline/ ---- -## 1. 姒傝堪 - -**澶氳鑹茶棰戦棶绛旂敓鎴愭祦姘寸嚎 (MultiRole Video QA Pipeline)** 鏃ㄥ湪鍒╃敤澶氭ā鎬佸ぇ妯″瀷锛圴LM锛夊拰澶氭櫤鑳戒綋锛圡ulti-Agent锛夊崗浣滄満鍒讹紝鑷姩浠庨暱瑙嗛鎴栧箍鍛婅棰戜腑鐢熸垚楂樿川閲忋佹繁搴︾殑闂瓟瀵癸紙QA Pairs锛夈 - -涓庢櫘閫氱殑鍗曟鐢熸垚涓嶅悓锛岃娴佹按绾垮紩鍏ヤ簡**澶氭櫤鑳戒綋杩唬浼樺寲**鐜妭銆傚畠棣栧厛鐢熸垚鍒濆闂瓟锛岀劧鍚庨氳繃妯℃嫙涓嶅悓瑙掕壊鐨勬櫤鑳戒綋锛堝鎻愰棶鑰呫佹鏌ヨ呫佹鼎鑹茶咃級杩涜澶氳疆浜や簰鍜屼慨姝o紝鏈缁堣緭鍑洪昏緫涓ュ瘑銆佷俊鎭噯纭殑闂瓟鏁版嵁銆 - -鎴戜滑鏀寔浠ヤ笅搴旂敤鍦烘櫙锛 - -* **骞垮憡瑙嗛鐞嗚В**锛氭彁鍙栧箍鍛婁腑鐨勫叧閿崠鐐广佹儏鎰熷惧悜鍜屽彊浜嬮昏緫銆 -* **澶嶆潅瑙嗛鎺ㄧ悊**锛氭瀯寤洪渶瑕佽法鏃堕棿娈垫帹鐞嗙殑娣卞害闂瓟鏁版嵁闆嗐 -* **闀胯棰戞憳瑕佷笌闂瓟**锛氬鐞嗗寘鍚赴瀵屽厓鏁版嵁锛圡eta锛夊拰澶氫釜鐗囨锛圕lips锛夌殑瑙嗛鏁版嵁銆 - -娴佹按绾跨殑涓昏娴佺▼鍖呮嫭锛 - -1. **鍒濆鐢熸垚 (Initial Generation)**锛氬熀浜庤棰戝厓鏁版嵁鍜岀墖娈电敓鎴愬熀纭闂瓟瀵广 -2. **澶氭櫤鑳戒綋鍗忎綔 (Multi-Agent Refinement)**锛氶氳繃澶氳疆杩唬锛堥粯璁 3 杞級锛屽闂瓟瀵硅繘琛屾壒鍒ゃ佷慨姝e拰浼樺寲銆 -3. 
**鏈缁堟暣鍚 (Final Generation)**锛氭竻娲楁暟鎹紝杈撳嚭鏍囧噯鏍煎紡鐨勬渶缁堥棶绛旈泦銆 - ---- - -## 2. 蹇熷紑濮 - -### 绗竴姝ワ細鍑嗗宸ヤ綔鐩綍 - -```bash -mkdir run_video_qa -cd run_video_qa - -``` - -### 绗簩姝ワ細鍑嗗鑴氭湰 - -灏嗕笅鏂団滄祦姘寸嚎绀轰緥鈥濅腑鐨勪唬鐮佷繚瀛樹负 `multirole_videoqa_pipeline.py`銆 - -### 绗笁姝ワ細閰嶇疆杩愯鍙傛暟 - -纭繚杈撳叆鏁版嵁鍖呭惈 `Meta` 鍜 `Clips` 瀛楁銆 - -```bash -# 瀹夎渚濊禆 -pip install open-dataflow vllm - -``` - -### 绗洓姝ワ細涓閿繍琛 - -```bash -python multirole_videoqa_pipeline.py \ - --model_path "/path/to/Qwen2.5-VL-7B-Instruct" \ - --images_file "data/adsQA.jsonl" \ - --card_id "0" - -``` - ---- - -## 3. 鏁版嵁娴佷笌娴佹按绾块昏緫 - -### 1. **杈撳叆鏁版嵁** - -杈撳叆鏁版嵁閫氬父鏄粡杩囬澶勭悊鐨勮棰戞暟鎹紝鍖呭惈鍏ㄥ眬鍏冩暟鎹拰鍒嗘淇℃伅锛 - -* **Meta**锛氳棰戠殑鍏ㄥ眬鎻忚堪銆佹爣棰樻垨鑳屾櫙淇℃伅銆 -* **Clips**锛氳棰戠墖娈靛垪琛紝姣忎釜鐗囨鍖呭惈闊抽鏂囨湰銆佸抚鍥惧儚璺緞鍜岀墖娈垫弿杩般 - -**杈撳叆鏁版嵁绀轰緥**锛 - -```json -{ - "Meta": "A commercial for a new sports car featuring dynamic driving scenes.", - "Clips": [ - { - "Audio_Text": "Experience the speed.", - "Frames_Images": ["./frames/001.jpg", "./frames/002.jpg"], - "Description": "Car accelerating on a highway." - }, - { - "Audio_Text": "Safety meets luxury.", - "Frames_Images": ["./frames/003.jpg"], - "Description": "Interior shot showing leather seats." - } - ] -} - -``` - -### 2. **鏍稿績绠楀瓙閫昏緫** - -璇ユ祦姘寸嚎閫氳繃涓変釜涓撻棬鐨勭畻瀛愪覆鑱旀墽琛岋細 - -#### A. **MultiroleVideoQAInitialGenerator锛堝垵濮嬬敓鎴愬櫒锛** - -* **鍔熻兘**锛氫綔涓衡滃垵绋夸綔鑰呪濓紝瀹冭鍙 `Meta` 鍜 `Clips`锛屽埄鐢 VLM 鐢熸垚绗竴鐗堥棶绛斿銆 -* **杈撳嚭**锛氬寘鍚垵姝 QA 鐨 DataFrame銆 - -#### B. **MultiroleVideoQAMultiAgentGenerator锛堝鏅鸿兘浣撲紭鍖栧櫒锛** - -* **鍔熻兘**锛氫綔涓衡滅紪杈戝洟闃熲濓紝瀹冨鍒濈ǹ杩涜鎵撶(銆 -* **鏈哄埗**锛氳缃 `max_iterations`锛堝 3 娆★級锛屽湪澶氳疆娆′腑锛屾ā鍨嬪彲鑳芥壆婕斾笉鍚岃鑹诧紙濡傚鏍稿憳鎸囧嚭閿欒銆佹鼎鑹插憳浼樺寲鎺緸锛夛紝閫愭鎻愬崌 QA 璐ㄩ噺銆 -* **杈撳叆**锛氬垵濮 DataFrame銆 -* **杈撳嚭**锛氱粡杩囧杞慨姝e悗鐨勪腑闂存 DataFrame銆 - -#### C. **MultiroleVideoQAFinalGenerator锛堟渶缁堢敓鎴愬櫒锛** - -* **鍔熻兘**锛氫綔涓衡滃嚭鐗堝晢鈥濓紝瀹冭礋璐f渶缁堢殑鏍煎紡鍖栧拰娓呮礂銆 -* **杈撳嚭**锛氭爣鍑嗗寲鐨 `QA` 鍒楄〃銆 - -### 3. 
**杈撳嚭鏁版嵁** - -杈撳嚭鏁版嵁鍦ㄥ師鏈夊瓧娈靛熀纭涓婂鍔犱簡楂樿川閲忕殑闂瓟鍒楄〃锛 - -* **QA**锛氱敓鎴愮殑闂瓟瀵瑰垪琛紝鍖呭惈鏍囩锛堝闂绫诲瀷锛夈侀棶棰樻枃鏈拰绛旀鏂囨湰銆 - -**杈撳嚭鏁版嵁绀轰緥**锛 - -```json -{ - "Meta": "...", - "Clips": [...], - "QA": [ - { - "Label": "Feature Extraction", - "Question": "What specific features of the car are highlighted in the interior shots?", - "Answer": "The video highlights the luxury leather seats and the advanced dashboard interface." - }, - { - "Label": "Narrative Analysis", - "Question": "How does the audio complement the visual transition?", - "Answer": "The narration 'Experience speed' coincides with the acceleration scene, reinforcing the dynamic visual." - } - ] -} - -``` - ---- - -## 4. 娴佹按绾跨ず渚 - -浠ヤ笅鏄畬鏁寸殑 `MultiRoleVideoQAPipeline` 浠g爜瀹炵幇銆 - -```python -import argparse -import os -from dataflow.serving import LocalModelVLMServing_vllm -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import ( - MultiroleVideoQAInitialGenerator, - MultiroleVideoQAMultiAgentGenerator, - MultiroleVideoQAFinalGenerator -) - -try: - import torch - # 澶氳繘绋嬪惎鍔ㄦ柟寮忚缃负 spawn锛岄伩鍏 CUDA 鍒濆鍖栧啿绐 - if 'spawn' not in torch.multiprocessing.get_all_start_methods(): - torch.multiprocessing.set_start_method('spawn', force=True) -except ImportError: - pass - - -class MultiRoleVideoQAPipeline(): - def __init__( - self, - model_path: str, - *, - hf_cache_dir: str | None = None, - download_dir: str = "./ckpt", - first_entry_file: str = "/dataflow/example/ads_QA/adsQA.jsonl", - cache_path: str = "./cache_local", - file_name_prefix: str = "dataflow_cache_step", - cache_type: str = "jsonl", - # Keys Configuration - Meta_key: str = "Meta", - clips_key: str = "Clips", - output_key: str = "QA" - ): - # 1. 瀛樺偍鍒濆鍖 - self.storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, - ) - - # 寮哄埗璁剧疆 vLLM 鐨勫杩涚▼鏂规硶 - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = "spawn" - - # 2. 
VLM 鏈嶅姟鍒濆鍖 - self.llm_serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=6000, # 瑙嗛闂瓟閫氬父闇瑕佽緝闀跨殑 Context - ) - - # 3. 绠楀瓙閾惧垵濮嬪寲 - # 闃舵涓锛氬垵濮嬬敓鎴 - self.initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving) - - # 闃舵浜岋細澶氭櫤鑳戒綋杩唬浼樺寲 (鏍稿績宸紓鐐) - self.multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator( - llm_serving = self.llm_serving, - max_iterations = 3 - ) - - # 闃舵涓夛細鏈缁堟牸寮忓寲 - self.final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving) - - self.input_meta_key = Meta_key - self.input_clips_key = clips_key - self.output_key = output_key - - def forward(self): - print(">>> [Pipeline] Step 1: Initial QA Generation...") - init_df = self.initial_QA_generation.run( - storage = self.storage.step(), - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - - print(">>> [Pipeline] Step 2: Multi-Agent Refinement (3 iterations)...") - # 娉ㄦ剰锛氭绠楀瓙鎺ユ敹涓婁竴闃舵鐨 DataFrame (init_df) 浣滀负杈撳叆 - middle_df = self.multiAgent_QA_generation.run( - df = init_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - - print(">>> [Pipeline] Step 3: Finalizing QA Pairs...") - self.final_QA_generation.run( - storage = self.storage, - df = middle_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) - print(">>> [Pipeline] Done.") - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Batch video QA generation with DataFlow (Single GPU)") - - parser.add_argument("--model_path", default="../../Models/Qwen2.5-VL-7B-Instruct", - help="Path to the local model or HuggingFace repo ID.") - parser.add_argument("--hf_cache_dir", 
default="~/.cache/huggingface", - help="HuggingFace cache directory.") - parser.add_argument("--download_dir", default="./ckpt", - help="Local directory for downloading models.") - - parser.add_argument("--card_id", type=str, default="0", - help="The single CUDA device ID to use (e.g., '0' or '1').") - - parser.add_argument("--images_file", default="./dataflow/example/ads_QA/adsQA.jsonl", - help="Path to the first entry file for DataFlow.") - parser.add_argument("--cache_path", default="./cache_local", - help="Directory for caching DataFlow steps.") - parser.add_argument("--file_name_prefix", default="caption", - help="Prefix for cache file names.") - parser.add_argument("--cache_type", default="jsonl", - help="Type of cache file (e.g., jsonl).") - - args = parser.parse_args() - - os.environ['CUDA_VISIBLE_DEVICES'] = args.card_id.replace(' ', '') - - pipe = MultiRoleVideoQAPipeline( - model_path=args.model_path, - hf_cache_dir=args.hf_cache_dir, - download_dir=args.download_dir, - first_entry_file=args.images_file, - cache_path=args.cache_path, - file_name_prefix=args.file_name_prefix, - cache_type=args.cache_type, - ) - pipe.forward() - -``` diff --git a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md index 661a1562..dd87c0e6 100644 --- a/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md +++ b/docs/zh/notes/mm_operators/image_understanding/generate/image_bbox_generator.md @@ -49,8 +49,7 @@ def run( self, storage: DataFlowStorage, input_image_key: str = "image", - input_bbox_key: str = "bbox", - output_key: str = "mdvp_record" + input_bbox_key: str = "bbox" ): ... @@ -69,7 +68,7 @@ def run( 3. 
**标准化与可视化 (Normalize & Visualize)** * * **标准化**：将 `[x, y, w, h]` 转换为归一化的 `[x1, y1, x2, y2]` 格式，并根据 `max_boxes` 进行截断或补零 (`0.0, 0.0, 0.0, 0.0`)。 -* **可视化**：在原图上绘制绿色矩形框和数字标签，保存至 `storage.cache_path`。 +* **可视化**：在原图上绘制矩形框和数字标签，保存至 `storage.cache_path`。 4. **Prompt 生成** @@ -89,7 +88,6 @@ def run( | `storage` | `DataFlowStorage` | 无 | DataFlow 存储对象，主要用于获取缓存路径 (`cache_path`)。 | | `input_image_key` | `str` | `"image"` | 输入 JSONL 中图像路径的字段名。 | | `input_bbox_key` | `str` | `"bbox"` | 输入 JSONL 中 BBox 数据的字段名。 | -| `output_key` | `str` | `"mdvp_record"` | (保留字段) 用于标识输出记录的键名。 | ## 🧩 示例用法 @@ -97,49 +95,41 @@ def run( from dataflow.utils.storage import FileStorage from dataflow.operators.cv import ImageBboxGenerator, ExistingBBoxDataGenConfig -# 1) 配置参数 -config = ExistingBBoxDataGenConfig( - max_boxes=5, - input_jsonl_path="./data/raw_images.jsonl", - output_jsonl_path="./data/processed_with_prompts.jsonl" +config = ExistingBBoxDataGenConfig( + max_boxes=10, + input_jsonl_path="../example_data/image_region_caption/image_region_caption_demo.jsonl", + output_jsonl_path="../cache/image_region_caption/image_with_bbox_result.jsonl", ) - -# 2) 初始化算子 -# 注意：此算子主要用于数据准备，不依赖 Serving 实例 generator = ImageBboxGenerator(config=config) -# 3) 准备 Storage (仅用于提供缓存路径) storage = FileStorage( - cache_path="./cache_vis_images", - file_name_prefix="bbox_gen" + first_entry_file_name="../example_data/image_region_caption/image_region_caption_demo.jsonl", + cache_path="../cache/image_region_caption", + file_name_prefix="region_caption", + cache_type="jsonl" ) -# 4) 执行处理 -# 自动读取 config 中的 input_jsonl_path，结果写入 output_jsonl_path generator.run( storage=storage, - input_image_key="image_path", - input_bbox_key="ground_truth_bbox" # 若文件中无此列，将自动提取 BBox + input_image_key="image", + input_bbox_key="bbox" ) ``` ### 🧾 输出数据格式 (Output JSONL) -生成的 `output_jsonl_path` 文件中，每一行包含以下结构： +生成的 `image_with_bbox_result.jsonl`
鏂囦欢涓紝姣忎竴琛屽寘鍚互涓嬬粨鏋勶細 ```json { - "image": "/data/raw/cat.jpg", - "type": "without_bbox", // 鎴 "with_bbox" - "bbox": [[100, 200, 50, 60], ...], // 鍘熷鍍忕礌鍧愭爣 [x, y, w, h] - "normalized_bbox": [ - [0.1, 0.2, 0.15, 0.26], - [0.0, 0.0, 0.0, 0.0] // 琛ラ浂濉厖 - ], - "result_file": "./cache_vis_images", - "image_with_bbox": "./cache_vis_images/1_bbox_vis.jpg", // 鍙鍖栧浘鐗囪矾寰 - "valid_bboxes_num": 1, - "prompt": "Describe the content of each marked region in the image. There are 1 regions: \ to \." + "image": "../example_data/image_region_caption/20.png", + "type": "with_bbox", + "bbox": [[196, 104, 310, 495]], + "normalized_bbox": [[0.128, 0.125, 0.329, 0.72], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]], + "result_file": "../cache/image_region_caption", + "image_with_bbox": "../cache/image_region_caption\\2_bbox_vis.jpg", + "valid_bboxes_num": 1, + "prompt": "Describe the content of each marked region in the image. There are 1 regions: to ." 
} ``` diff --git a/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md b/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md deleted file mode 100644 index e33cc836..00000000 --- a/docs/zh/notes/mm_operators/image_understanding/generate/multirole_videoqa.md +++ /dev/null @@ -1,140 +0,0 @@ ---- -title: 澶氳鑹茶棰戦棶绛旂敓鎴(MultiRole Video QA Generation) -createTime: 2025/12/2 20:00:00 -icon: material-symbols-light:video -permalink: /zh/mm_operators/generate/multirole_videoqa/ ---- - -## 馃摌 姒傝堪 - -`MultiroleVideoQAGenerate` 鏄竴涓暟鎹敓鎴愮畻瀛愶紝鐢ㄤ簬**鍩轰簬棰勫鐞嗚棰戞暟鎹嚜鍔ㄥ垱寤洪棶绛斿锛圦A Pairs锛**銆 -缁欏畾杈撳叆鐨勯澶勭悊瑙嗛鏁版嵁锛屽畠浼氭瀯寤哄涓笌璇ヨ棰戠浉鍏崇殑闂瓟瀵广傝绠楀瓙閫傜敤浜**骞垮憡瑙嗛鏍囨敞**銆**鏁版嵁闆嗘瀯寤**鍜**瑙嗛鐞嗚В**浠诲姟銆 - -**鍔熻兘鐗规э細** -* 鏀寔**鎵归噺澶勭悊**澶氫釜棰勫鐞嗚棰戞暟鎹 -* 浣跨敤 **VLM锛堝 Qwen2.5-VL锛**鐢熸垚**楂樿川閲**鐨勯棶绛斿銆 -* 鑷姩澶勭悊瑙嗛杈撳叆骞朵娇鐢 Prompt 鐢熸垚鏁版嵁銆 - ---- - -## 馃彈锔 `__init__` 鍑芥暟 - -```python -def __init__( - self, - llm_serving: VLMServingABC -): - ... -``` -## 馃Ь `__init__` 鍙傛暟 - -| Parameter | Type | Default | Description | -| :------------ | :-------------- | :------ | :-------------------------------------------------------------- | -| `llm_serving` | `VLMServingABC` | - | **Model Serving Object** used to call the VLM for QA pairs generation | - ------ - -## 鈿 `run` 鍑芥暟 - -```python -def run( - self, - storage: DataFlowStorage, - input_meta_key: str = "Meta", - input_clips_key: str = "Clips", - output_key: str = "QA" -): - ... -``` - -The `run` function executes the main QA pairs generation workflow: -read data paths 鈫 **validate DataFrame** 鈫 construct prompts 鈫 call the model 鈫 generate QA pairs captions 鈫 write results to output. 
- -## 馃Ь `run` 鍙傛暟 - -| Parameter | Type | Default | Description | -| :---------------- | :---------------- | :---------- | :---------------------------------------------------- | -| `storage` | `DataFlowStorage` | - | Dataflow storage object | -| `input_mets_key` | `str` | `"Meta"` | **Multimodal Input Field Name** | -| `input_clips_key` | `str` | `"Clips"` | **Multimodal Input Field Name** | -| `output_key` | `str` | `"QA"` | **Model Output Field Name** (the generated QA pairs) | - ------ - -## 馃 绀轰緥鐢ㄦ硶 - -```python -import os -import argparse -from dataflow.serving import LocalModelVLMServing_vllm -from dataflow.utils.storage import FileStorage -from dataflow.operators.core_vision import MultiroleVideoQAInitialGenerator, MultiroleVideoQAMultiAgentGenerator, MultiroleVideoQAFinalGenerator - -# Step 1: Launch local model service -llm_serving = LocalModelVLMServing_vllm( - hf_model_name_or_path=model_path, - hf_cache_dir=hf_cache_dir, - hf_local_dir=download_dir, - vllm_tensor_parallel_size=1, - vllm_temperature=0.7, - vllm_top_p=0.9, - vllm_max_tokens=6000, - ) - -# Step 2: Prepare input data -storage = FileStorage( - first_entry_file_name=first_entry_file, - cache_path=cache_path, - file_name_prefix=file_name_prefix, - cache_type=cache_type, - ) - -# Step 3: Initialize and run the operator -initial_QA_generation = MultiroleVideoQAInitialGenerator(llm_serving = self.llm_serving) -multiAgent_QA_generation = MultiroleVideoQAMultiAgentGenerator(llm_serving = self.llm_serving, max_iterations = 3) -final_QA_generation = MultiroleVideoQAFinalGenerator(llm_serving = self.llm_serving) - -init_df = initial_QA_generation.run( - storage = self.storage.step(), - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -middle_df = multiAgent_QA_generation.run( - df = init_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -final_QA_generation.run( - 
storage = self.storage, - df = middle_df, - input_meta_key = self.input_meta_key, - input_clips_key = self.input_clips_key, - output_key = self.output_key - ) -``` - ------ - -## 馃Ь 榛樿杈撳嚭鏍煎紡 - -| Field | Type | Description | -| :-------- | :----------- | :------------------------------- | -| `Meta` | `str` | Meta information for video | -| `Clips` | `List[Dict]` | Interleaved modality video Clips | -| `QA` | `List[Dict]` | QA pairs | - ------ - -### 馃摜 绀轰緥杈撳叆 - -```jsonl -{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}]} -``` - -### 馃摛 绀轰緥杈撳嚭 - -```jsonl -{"Meta": "Meta Information", "Clips": [{"Audio_Text": "Audio_Text1", "Frames_Images": ["image_path1","image_path2"], "Description": "Description1"}, {"Audio_Text": "Audio_Text2", "Frames_Images": ["image_path3","image_path4"], "Description": "Description2"}], "QA":[{"Label":"label1", "Question": "Question1", "Answer": "Answer1"},{"Label":"label2", "Question": "Question2", "Answer": "Answer2"}]} -``` \ No newline at end of file