From d3ea39114292fbaf27668529f632b35da1af7776 Mon Sep 17 00:00:00 2001 From: aravind-3105 Date: Mon, 16 Mar 2026 17:29:04 -0400 Subject: [PATCH 1/9] Integrate Langfuse observability into agentic VQA evaluation, replacing Opik references and updating tracing mechanisms --- implementations/agentic_vqa_eval/README.md | 229 +++++------------- .../agents/planner_agent.py | 8 +- .../agents/verifier_agent.py | 6 +- .../agents/vision_agent.py | 12 +- .../eval/error_taxonomy.py | 22 +- .../eval/eval_outputs.py | 21 +- .../agentic_chartqapro_eval/eval/eval_topk.py | 21 +- .../langfuse_integration/__init__.py | 1 + .../langfuse_integration/client.py | 56 +++++ .../langfuse_integration/dataset.py | 69 ++++++ .../langfuse_integration/ingest.py | 174 +++++++++++++ .../langfuse_integration/prompts.py | 81 +++++++ .../langfuse_integration/tracing.py | 157 ++++++++++++ .../src/agentic_chartqapro_eval/mep/schema.py | 2 +- .../runner/run_generate_meps.py | 50 ++-- .../tools/ocr_reader_tool.py | 6 +- .../tools/vision_qa_tool.py | 10 +- pyproject.toml | 3 +- 18 files changed, 689 insertions(+), 239 deletions(-) create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md index a939c79..d28a821 100644 --- a/implementations/agentic_vqa_eval/README.md +++ 
b/implementations/agentic_vqa_eval/README.md @@ -6,7 +6,7 @@ Welcome to **Reference Implementation 6** of the Survey Paper on Agentic Visual The core contribution is the **Model Evaluation Packet (MEP)** — a portable JSON trace that captures everything: the inspection plan, the vision agent's reasoning, the verifier's critique, tool call logs, timestamps, and errors. This enables reproducible evaluation, post-hoc explainability analysis, and model comparison across VLM backends. -**Observability layer:** Integration with **[Opik](https://github.com/comet-ml/opik)** (self-hosted) for live trace visualization, prompt versioning, dataset registration, and experiment comparison across configs — all without changing the MEP ground-truth artifacts. +**Observability layer:** Integration with **[Langfuse](https://langfuse.com)** (cloud or self-hosted) for live trace visualization, prompt versioning, dataset registration, and experiment comparison across configs — all without changing the MEP ground-truth artifacts. 
--- @@ -154,7 +154,7 @@ This framework produces explainability signals at four distinct levels: | `json_repair` | 0.25.3 | Fallback JSON parsing when LLM output is malformed | | `python-dotenv` | 1.1.1 | API key management via `.env` file | | `pandas` | 2.3.3 | Metric aggregation and summary CSV generation | -| `opik` | latest | Trace visualization, prompt versioning, dataset registration | +| `langfuse` | latest | Trace visualization, prompt versioning, dataset registration | | `matplotlib` | ≥3.7 | Charts in notebook and dashboard | | `streamlit` | ≥1.32 | Interactive evaluation dashboard | | `jupyter` / `ipykernel` | latest | Analysis notebook | @@ -203,12 +203,12 @@ src/agentic_chartqapro_eval/ │ ├── dashboard.py — Streamlit interactive dashboard: sample browser, chart image viewer │ └── summarize.py — Aggregate metrics.jsonl → summary.csv │ -└── opik_integration/ - ├── client.py — Opik client singleton (gracefully disabled if not configured) +└── langfuse_integration/ + ├── client.py — Langfuse client singleton (gracefully disabled if not configured) ├── tracing.py — sample_trace(), open_llm_span(), close_span() helpers - ├── prompts.py — Push planner.txt / vision.txt to Opik Prompt Library - ├── dataset.py — Register ChartQAPro samples as an Opik Dataset - └── ingest.py — Retroactively import existing MEP files into Opik + ├── prompts.py — Push planner.txt / vision.txt to Langfuse Prompt Management + ├── dataset.py — Register ChartQAPro samples as a Langfuse Dataset + └── ingest.py — Retroactively import existing MEP files into Langfuse ``` --- @@ -217,10 +217,10 @@ src/agentic_chartqapro_eval/ ### 1. 
Install dependencies -From the **root of the repository**, install the `agentic-xai-eval` dependency group using `uv`: +From the **root of the repository**, install the `ref6-agentic-xai-eval` dependency group using `uv`: ```bash -uv sync --group agentic-xai-eval +uv sync --group ref6-agentic-xai-eval source .venv/bin/activate ``` @@ -229,12 +229,17 @@ The `agentic_chartqapro_eval` package is automatically available — it is inclu ### 2. Configure API keys ```bash +# From the repo root: cp .env.example .env # Edit .env and fill in your keys: # OPENAI_API_KEY=... # GEMINI_API_KEY=... +# LANGFUSE_PUBLIC_KEY=... # optional — for observability +# LANGFUSE_SECRET_KEY=... ``` +The `.env` file lives at the **repo root**. `load_dotenv()` searches upward from the working directory, so it is found automatically regardless of which subdirectory you run commands from. + ### 3. Generate MEPs (run the agentic pipeline) Run on 25 test samples using GPT-4o for planner, vision, and verifier: @@ -282,7 +287,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ To disable OCR entirely (matches the original pipeline behaviour, faster and lower cost): ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini --no_ocr ``` @@ -320,7 +325,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \ Re-queries the VLM for each MEP asking for the 3 most likely candidate answers: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_topk \ +uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_topk \ --mep_dir meps/gemini_gemini/chartqapro/test \ --out output/topk_metrics.jsonl \ --backend gemini \ @@ -333,7 +338,7 @@ This pass does **not** modify existing MEPs or `metrics.jsonl`. ### 7. 
Summarize results ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.summarize \ +uv run --env-file .env -m agentic_chartqapro_eval.eval.summarize \ --metrics output/metrics.jsonl \ --out output/summary.csv ``` @@ -448,17 +453,17 @@ Pre-built cells walk through: loading MEPs, accuracy by question type, verifier --- -## Opik Observability (Self-Hosted) +## Langfuse Observability -Opik is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Opik is purely additive. +Langfuse is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Langfuse is purely additive. -### What Opik gives you +### What Langfuse gives you | Feature | Detail | |---|---| -| **Trace viewer** | Every sample becomes a trace with `planner` and `vision_qa_tool` child spans showing prompts, outputs, token usage, and latency | +| **Trace viewer** | Every sample becomes a trace with `planner` and `vision_qa_tool` child generations showing prompts, outputs, token usage, and latency | | **Feedback scores** | `answer_accuracy` and all five `judge_*` rubric scores are attached to each trace after eval | -| **Prompt Library** | `planner.txt` and `vision.txt` are versioned — every experiment links to the exact prompt version used | +| **Prompt Management** | `planner.txt` and `vision.txt` are versioned — every experiment links to the exact prompt version used | | **Dataset registry** | ChartQAPro samples are registered so experiments formally reference a dataset version | | **Experiment comparison** | `openai_openai` vs `gemini_gemini` side-by-side with accuracy distributions and latency CDFs | @@ -468,183 +473,76 @@ Opik is an open-source LLM observability platform that adds a live visualization Trace: chartqapro/000002 [openai_openai | standard | 11.4s] input: 
{question, expected_output} output: {answer, explanation} - feedback: answer_accuracy=1.0, judge_explanation_quality=0.9, ... - ├── Span: planner [llm | gpt-4o | 2.1s] + scores: answer_accuracy=1.0, judge_explanation_quality=0.9, ... + ├── Generation: planner [gpt-4o | 2.1s] │ input: {prompt} │ output: {plan_steps: [...], parse_error: false} - ├── Span: vision_agent [llm | gpt-4o | 5.6s] - │ └── Span: vision_qa_tool [llm | gpt-4o | 2.9s | 688 tokens] + ├── Generation: vision_agent [gpt-4o | 5.6s] + │ └── Generation: vision_qa_tool [gpt-4o | 2.9s | 688 tokens] │ input: {image_path, question, plan_steps} │ output: {answer, explanation} - └── Span: verifier [llm | gpt-4o | 3.7s] + └── Generation: verifier [gpt-4o | 3.7s] input: {prompt, draft_answer} output: {verdict: "confirmed" | "revised", answer, reasoning} ``` -### 1. Intall and Setup Docker - -#### Update packages and install Docker. - -```bash -sudo apt update -sudo apt install -y docker.io -``` - -Verify installation: - -```bash -docker --version -``` - -#### Start the Docker daemon - -Some cloud environments do not run systemd, so start Docker manually. - -```bash -sudo dockerd > /tmp/dockerd.log 2>&1 & -``` - -Verify Docker is running: - -```bash -sudo docker info -``` - -#### Install Docker Compose v2 - -Create plugin directory: - -```bash -sudo mkdir -p /usr/lib/docker/cli-plugins -``` - -Download the Compose plugin: - -```bash -sudo curl -SL https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 \ --o /usr/lib/docker/cli-plugins/docker-compose -``` - -Make it executable: - -```bash -sudo chmod +x /usr/lib/docker/cli-plugins/docker-compose -``` - -Verify installation: - -```bash -docker compose version -``` - -Expected output example: - -``` -Docker Compose version v2.27.0 -``` - -### 2. Start the self-hosted Opik stack - -Requires Docker Desktop (already running if you followed setup above). - -```bash -# Clone the Opik repository. 
-git clone https://github.com/comet-ml/opik.git /tmp/opik-server --depth=1 -# Navigate to the Docker deployment directory: -cd /tmp/opik-server/deployment/docker-compose -# Start the Opik stack with the 'opik' profile: -sudo docker compose --profile opik up -d -``` +### 1. Get API keys -Dashboard is available at **http://localhost:5173** once all containers are healthy (takes ~60 seconds on first pull). +**Cloud (recommended — no infrastructure needed):** -To stop: `docker compose --profile opik down` +1. Sign up at [cloud.langfuse.com](https://cloud.langfuse.com) +2. Create a new project +3. Go to **Settings → API Keys** and create a key pair -#### Verify containers +**Self-hosted:** -Check running containers: +Follow the [Langfuse self-hosting guide](https://langfuse.com/docs/deployment/self-host) to deploy with Docker Compose, then create API keys in the UI. -```bash -sudo docker ps -``` +### 2. Configure the connection -You should see containers similar to: +Add to your `.env` at the repo root: ``` -opik-frontend-1 -opik-backend-1 -opik-python-backend-1 -opik-mysql-1 -opik-redis-1 -opik-clickhouse-1 +LANGFUSE_PUBLIC_KEY=pk-lf-... +LANGFUSE_SECRET_KEY=sk-lf-... +# LANGFUSE_HOST=https://cloud.langfuse.com # default; change for self-hosted ``` -#### Access Opik +The framework auto-detects these variables. If they are absent, all Langfuse calls are silent no-ops and the pipeline runs exactly as before. -Get your VM external IP: +### 3. Push prompt versions to Langfuse -```bash -curl ifconfig.me -``` - -Open the Opik UI in your browser: - -``` -http://:5173 -``` - -Example: - -``` -http://34.xx.xx.xxx:5173 -``` - -You should now see the **Comet Opik dashboard**. - - -### 3. Configure the connection - -Add to your `.env`: - -``` -OPIK_URL_OVERRIDE=http://localhost:5173/api -``` - -The framework auto-detects this variable. If it is absent, all Opik calls are silent no-ops and the pipeline runs exactly as before. - -### 4. 
Push prompt versions to Opik - -Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in the Opik Prompt Library so every future experiment links to the exact prompt version used. +Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used. ```bash -uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.prompts +uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.prompts ``` -### 5. Register the dataset +### 4. Register the dataset ```bash -uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.dataset \ +uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.dataset \ --split test --n 25 ``` -This creates a dataset named `ChartQAPro_test` in Opik containing one item per sample (question, expected output, question type, image path). +This creates a dataset named `ChartQAPro_test` in Langfuse containing one item per sample (question, expected output, question type, image path). -### 6. Live tracing (automatic on new runs) +### 5. Live tracing (automatic on new runs) -No extra flags needed. 
When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically: - registers the dataset and versions the prompts at run start -- opens an Opik trace per sample -- creates `planner` and `vision_qa_tool` child spans with inputs, outputs, and token usage -- stores the `opik_trace_id` in the MEP for later score attachment +- opens a Langfuse trace per sample +- creates `planner` and `vision_qa_tool` child generations with inputs, outputs, and token usage +- stores the `lf_trace_id` in the MEP for later score attachment ```bash uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini --workers 4 --out meps/ ``` -### 7. Attach evaluation scores +### 6. Attach evaluation scores -After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Opik traces: +After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Langfuse traces: ```bash uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \ @@ -652,12 +550,12 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \ --out metrics.jsonl ``` -### 8. Ingest existing MEPs (retroactive) +### 7. Ingest existing MEPs (retroactive) -If you have MEPs from runs before Opik was configured, import them without re-running the pipeline: +If you have MEPs from runs before Langfuse was configured, import them without re-running the pipeline: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.ingest \ +uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.ingest \ --mep_dir meps/gemini_gemini/chartqapro/test \ --metrics_file metrics.jsonl # optional: attaches scores if available ``` @@ -720,7 +618,7 @@ Each MEP file is a self-contained JSON evaluation artifact: }, "timestamps": { "planner_ms": 2185, "ocr_ms": 1243, "vision_ms": 5684, "verifier_ms": 3712 }, "errors": [], - "opik_trace_id": "tr_abc123..." 
// present when Opik tracing is active + "lf_trace_id": "abc123..." // present when Langfuse tracing is active } ``` @@ -740,7 +638,7 @@ Each MEP file is a self-contained JSON evaluation artifact: - **OpenAI Vision API** — GPT-4o multimodal inference for chart image understanding ([platform.openai.com](https://platform.openai.com/docs)) - **Google Gemini API** — Alternative VLM backend for vision inference ([ai.google.dev](https://ai.google.dev/docs)) - **LLM-as-Judge (Zheng et al., 2023)** — Methodology for using LLMs to score free-form outputs with rubric dimensions ([arXiv:2306.05685](https://arxiv.org/abs/2306.05685)) -- **Opik by Comet ML** — Open-source LLM observability platform used for tracing, prompt versioning, and experiment comparison ([github.com/comet-ml/opik](https://github.com/comet-ml/opik)) +- **Langfuse** — Open-source LLM observability platform used for tracing, prompt versioning, and experiment comparison ([langfuse.com](https://langfuse.com)) --- @@ -789,12 +687,5 @@ They serve different purposes and run at different times: The verifier improves the pipeline's answer quality; the judge measures the pipeline's reasoning quality. -### 10. Do I need Opik to run the framework? -No. Opik is entirely optional. If `OPIK_URL_OVERRIDE` is not set in `.env`, all Opik calls are silent no-ops. The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before. - -### 11. How do I stop the Opik Docker stack? -```bash -cd /tmp/opik-server/deployment/docker-compose -docker compose --profile opik down -``` -MEPs and metrics files are stored locally and are unaffected. Trace data in Opik is stored in the Docker volumes and will persist across restarts unless you run `docker compose down -v`. +### 10. Do I need Langfuse to run the framework? +No. Langfuse is entirely optional. If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops. 
The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before. diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py index fda7d27..71d554f 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py @@ -11,7 +11,7 @@ from crewai import LLM, Agent, Crew, Task from ..datasets.perceived_sample import PerceivedSample -from ..opik_integration.tracing import close_span, open_llm_span +from ..langfuse_integration.tracing import close_span, open_llm_span from ..utils.json_strict import parse_strict @@ -137,7 +137,9 @@ def __init__( self.api_key = api_key self._llm = _build_llm(backend, model, api_key) - def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dict, bool, str]: + def run( + self, sample: PerceivedSample, lf_trace: Any = None + ) -> Tuple[str, dict, bool, str]: """ Execute the planning phase for a new question. 
@@ -165,7 +167,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic prompt = build_planner_prompt(sample) span = open_llm_span( - opik_trace, + lf_trace, name="planner", input_data={"prompt": prompt}, model=self.model, diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py index 7d91918..fabd702 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py @@ -23,7 +23,7 @@ from openai import OpenAI from PIL import Image -from ..opik_integration.tracing import close_span, open_llm_span +from ..langfuse_integration.tracing import close_span, open_llm_span from ..utils.json_strict import parse_strict @@ -203,7 +203,7 @@ def run( sample, # PerceivedSample plan: dict, vision_parsed: dict, - opik_trace: Any = None, + lf_trace: Any = None, ) -> Tuple[str, dict, bool, str]: """ Critically audit a draft answer using a single VLM call. 
@@ -250,7 +250,7 @@ def run( ) span = open_llm_span( - opik_trace, + lf_trace, name="verifier", input_data={"prompt": prompt, "draft_answer": draft_answer}, model=self.model, diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py index 550aa7c..4832d66 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py @@ -11,7 +11,7 @@ from crewai import LLM, Agent, Crew, Task from ..datasets.perceived_sample import PerceivedSample -from ..opik_integration.tracing import close_span, open_llm_span +from ..langfuse_integration.tracing import close_span, open_llm_span from ..tools.vision_qa_tool import VisionQATool from ..utils.json_strict import parse_strict @@ -183,7 +183,7 @@ def __init__( self.agent_api_key = agent_api_key self.vision_api_key = vision_api_key - def _build_tool(self, opik_trace: Any = None) -> VisionQATool: + def _build_tool(self, lf_trace: Any = None) -> VisionQATool: """ Instantiate the vision tool with the configured vision model. @@ -206,14 +206,14 @@ def _build_tool(self, opik_trace: Any = None) -> VisionQATool: backend=self.vision_backend, model=self.vision_model, api_key=key, - opik_trace=opik_trace, + lf_trace=lf_trace, ) def run( self, sample: PerceivedSample, plan: dict, - opik_trace: Any = None, + lf_trace: Any = None, ocr_result: Optional[dict] = None, ) -> Tuple[str, dict, bool, str, List[dict]]: """ @@ -246,12 +246,12 @@ def run( tool_traces : list of dict A log of tool interactions during the run. 
""" - tool = self._build_tool(opik_trace=opik_trace) + tool = self._build_tool(lf_trace=lf_trace) llm = _build_llm(self.agent_backend, self.agent_model, self.agent_api_key) task_description = build_vision_task_description(sample, plan, ocr_result=ocr_result) vision_span = open_llm_span( - opik_trace, + lf_trace, name="vision_agent", input_data={"task_description": task_description}, model=self.agent_model, diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py index f60b561..78b357b 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py @@ -24,7 +24,7 @@ from openai import OpenAI from ..mep.writer import iter_meps -from ..opik_integration.client import get_client +from ..langfuse_integration.client import get_client from ..utils.json_strict import parse_strict @@ -262,7 +262,7 @@ def main() -> None: # noqa: PLR0915 row = json.loads(line) accuracy_by_id[row.get("sample_id", "")] = row.get("answer_accuracy", 0.0) - opik_client = get_client() + lf_client = get_client() with open(args.out, "w") as f_out: count = 0 @@ -311,19 +311,15 @@ def main() -> None: # noqa: PLR0915 f_out.write(json.dumps(row) + "\n") count += 1 - # Log to Opik if trace_id is available - opik_trace_id = mep.get("opik_trace_id") - if opik_client and opik_trace_id: + # Log to Langfuse if trace_id is available + lf_trace_id = mep.get("lf_trace_id") + if lf_client and lf_trace_id: failure_type = result.get("failure_type", "other") with contextlib.suppress(Exception): - opik_client.log_traces_feedback_scores( - [ - { - "id": opik_trace_id, - "name": f"failure_{failure_type}", - "value": 1.0, - } - ] + lf_client.create_score( + trace_id=lf_trace_id, + name=f"failure_{failure_type}", + value=1.0, ) if count % 10 == 0: diff --git 
a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py index e196809..bb1a412 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py @@ -17,7 +17,7 @@ from dotenv import load_dotenv from ..mep.writer import iter_meps -from ..opik_integration.client import get_client +from ..langfuse_integration.client import get_client from .judge import judge_mep @@ -137,19 +137,22 @@ def evaluate_mep( for k, v in judge_scores.items(): metrics[f"judge_{k}"] = v - # Log all scores back to the Opik trace if one was recorded in the MEP - opik_trace_id = mep.get("opik_trace_id") - if opik_trace_id: + # Log all scores back to the Langfuse trace if one was recorded in the MEP + lf_trace_id = mep.get("lf_trace_id") + if lf_trace_id: client = get_client() if client: score_keys = ["answer_accuracy", "latency_sec"] + ( [f"judge_{k}" for k in judge_scores] if use_judge else [] ) - scores = {k: metrics[k] for k in score_keys if isinstance(metrics.get(k), (int, float))} - with contextlib.suppress(Exception): - client.log_traces_feedback_scores( - [{"id": opik_trace_id, "name": k, "value": float(v)} for k, v in scores.items()] - ) + scores = { + k: metrics[k] + for k in score_keys + if isinstance(metrics.get(k), (int, float)) + } + for k, v in scores.items(): + with contextlib.suppress(Exception): + client.create_score(trace_id=lf_trace_id, name=k, value=float(v)) return metrics diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py index 0584a6c..9a51a7e 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py @@ -15,6 
+15,7 @@ import argparse import base64 +import contextlib import json import os from pathlib import Path @@ -24,6 +25,7 @@ from google import genai from openai import OpenAI +from ..langfuse_integration.client import get_client from ..mep.writer import iter_meps from ..utils.json_strict import parse_strict from .eval_outputs import score_answer_accuracy @@ -209,6 +211,8 @@ def main() -> None: api_key = os.environ.get("OPENAI_API_KEY", "") if args.backend == "openai" else os.environ.get("GEMINI_API_KEY", "") + lf_client = get_client() + with open(args.out, "w") as f_out: count = 0 for mep in iter_meps(args.mep_dir): @@ -228,7 +232,22 @@ def main() -> None: cands = result["topk_candidates"] h1 = result.get("hit_at_1", 0) h3 = result.get(f"hit_at_{args.k}", 0) - print(f" {sid} exp={exp!r} candidates={cands} hit@1={h1} hit@{args.k}={h3}") + print( + f" {sid} exp={exp!r} candidates={cands} hit@1={h1} hit@{args.k}={h3}" + ) + + lf_trace_id = mep.get("lf_trace_id") + if lf_client and lf_trace_id: + for ki in range(1, args.k + 1): + key = f"hit_at_{ki}" + if key in result: + with contextlib.suppress(Exception): + lf_client.create_score( + trace_id=lf_trace_id, + name=key, + value=float(result[key]), + ) + count += 1 except Exception as exc: print(f" Error: {exc}") diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py new file mode 100644 index 0000000..32d60f4 --- /dev/null +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py @@ -0,0 +1 @@ +"""Langfuse observability integration — tracing, prompt versioning, dataset registration.""" diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py new file mode 100644 index 0000000..5acfa2e --- 
/dev/null +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py @@ -0,0 +1,56 @@ +"""Langfuse client singleton with graceful degradation. + +Returns None when LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set or +langfuse is not installed, so every caller can guard with ``if client:``. +""" + +import os + + +_client = None +_initialised = False + + +def get_client(): + """Return a configured langfuse.Langfuse() instance, or None if unavailable.""" + global _client, _initialised # noqa: PLW0603 + if _initialised: + return _client + + _initialised = True + + try: + from dotenv import load_dotenv + + load_dotenv() + except ImportError: + pass + + public_key = os.environ.get("LANGFUSE_PUBLIC_KEY", "") + secret_key = os.environ.get("LANGFUSE_SECRET_KEY", "") + + if not public_key or not secret_key: + return None + + try: + from langfuse import Langfuse + + kwargs: dict = {"public_key": public_key, "secret_key": secret_key} + # Accept LANGFUSE_HOST or LANGFUSE_BASE_URL (both are common) + host = os.environ.get("LANGFUSE_HOST") or os.environ.get("LANGFUSE_BASE_URL", "") + if host: + kwargs["host"] = host + + _client = Langfuse(**kwargs) + except Exception as exc: + print(f"[langfuse] client init failed: {exc}") + _client = None + + return _client + + +def reset_client() -> None: + """Force re-initialisation on next call (useful for tests).""" + global _client, _initialised # noqa: PLW0603 + _client = None + _initialised = False diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py new file mode 100644 index 0000000..ab0f1b7 --- /dev/null +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py @@ -0,0 +1,69 @@ +"""Register ChartQAPro samples as a Langfuse Dataset. 
+ +Usage: + python -m agentic_chartqapro_eval.langfuse_integration.dataset \ + --split test --n 25 +""" + +import argparse +from typing import Optional + +from .client import get_client + + +def register_dataset( + samples, + dataset_name: str = "ChartQAPro", + split: str = "test", +) -> Optional[str]: + """Insert PerceivedSamples into a Langfuse Dataset named ``{dataset_name}_{split}``. + + Returns the dataset name, or None if Langfuse is unavailable. + """ + client = get_client() + if client is None: + return None + + name = f"{dataset_name}_{split}" + try: + client.create_dataset(name=name) + for s in samples: + client.create_dataset_item( + dataset_name=name, + input={ + "source_id": s.sample_id, + "question": s.question, + "question_type": s.question_type.value, + "image_path": s.image_path or "", + "choices": s.choices or [], + }, + expected_output=s.expected_output, + ) + print(f"[langfuse] Registered {len(samples)} samples → dataset '{name}'") + return name + except Exception as exc: + print(f"[langfuse] Dataset registration failed: {exc}") + return None + + +def main() -> None: + """Register ChartQAPro dataset samples in Langfuse.""" + parser = argparse.ArgumentParser( + description="Register ChartQAPro samples as Langfuse dataset" + ) + parser.add_argument("--split", default="test") + parser.add_argument("--n", type=int, default=25) + parser.add_argument("--image_dir", default="data/chartqapro_images") + parser.add_argument("--cache_dir", default=None) + args = parser.parse_args() + + from ..datasets.chartqapro_loader import load_chartqapro + + samples = load_chartqapro( + split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir + ) + register_dataset(samples, split=args.split) + + +if __name__ == "__main__": + main() diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py new file mode 100644 
# --- patch hunk: new file langfuse_integration/ingest.py ---
"""Retroactive ingestion: convert existing MEP JSON files to Langfuse Traces.

This lets you visualise runs that completed before Langfuse was wired in.

Usage:
    python -m agentic_chartqapro_eval.langfuse_integration.ingest \
        --mep_dir meps/openai_openai/chartqapro/test \
        [--metrics_file metrics.jsonl]
"""

import argparse
import contextlib
import json
from pathlib import Path
from typing import Optional

from .client import get_client
from .tracing import _normalize_usage


# Metric names copied onto the trace as numeric scores when present.
_SCORE_KEYS = (
    "answer_accuracy",
    "judge_explanation_quality",
    "judge_hallucination_rate",
    "judge_plan_coverage",
    "judge_plan_adherence",
    "judge_faithfulness_alignment",
)


def _as_dict(value: object) -> dict:
    """Return *value* when it is a dict, otherwise an empty dict.

    ``dict.get(key, {})`` only falls back when the key is missing; a key that
    is present with JSON ``null`` yields ``None`` and breaks chained ``.get``
    calls. Routing every nested section through this helper avoids that.
    """
    return value if isinstance(value, dict) else {}


def _load_metrics(metrics_file: Optional[str]) -> dict:
    """Read a metrics JSONL file into a ``{sample_id: row}`` mapping.

    Blank lines, malformed JSON lines, non-object rows and rows without a
    ``sample_id`` are skipped. A missing or unset file yields an empty mapping.
    """
    metrics_by_id: dict = {}
    if not metrics_file:
        return metrics_by_id

    path = Path(metrics_file)
    if not path.exists():
        print(f"[langfuse] Metrics file not found, continuing without scores: {metrics_file}")
        return metrics_by_id

    with path.open(encoding="utf-8") as f:
        for lineno, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                print(f"[langfuse] Skipping malformed metrics line {lineno}: {exc}")
                continue
            sample_id = row.get("sample_id") if isinstance(row, dict) else None
            # Rows without an id used to be stored under "" and were then
            # attached to any MEP that also lacked a sample_id.
            if sample_id:
                metrics_by_id[sample_id] = row
    return metrics_by_id


def ingest_mep(
    mep: dict,
    client: object,
    metrics: Optional[dict] = None,
    project_name: str = "chartqapro-eval",  # noqa: ARG001 — kept for API compat
) -> None:
    """Create a Langfuse Trace from a single MEP dict (retroactively).

    Parameters
    ----------
    mep : dict
        One Model Evaluation Packet as loaded from JSON. Missing or ``null``
        sections are treated as empty.
    client : object
        Langfuse client; must provide ``start_as_current_observation``.
    metrics : dict, optional
        Metrics row for this sample; numeric values under the known score
        keys are attached to the trace.
    project_name : str
        Unused; accepted for API compatibility.
    """
    sample = _as_dict(mep.get("sample"))
    plan = _as_dict(mep.get("plan"))
    vision = _as_dict(mep.get("vision"))
    config = _as_dict(mep.get("config"))

    sample_id = sample.get("sample_id", "unknown")
    config_name = config.get("config_name", "unknown")
    question_type = sample.get("question_type", "standard")
    question = sample.get("question", "")
    expected = sample.get("expected_output", "")
    vision_output = vision.get("parsed") or None
    plan_steps = _as_dict(plan.get("parsed")).get("steps", [])

    with client.start_as_current_observation(  # type: ignore[union-attr]
        name=f"chartqapro/{sample_id}",
        as_type="span",
        input={"question": question, "expected_output": expected},
        output=vision_output,
        metadata={
            "run_id": mep.get("run_id", ""),
            "config": config_name,
            "question_type": question_type,
            "schema_version": mep.get("schema_version", ""),
            "has_errors": bool(mep.get("errors")),
            "retroactive": True,
        },
    ) as trace_span:
        # Planner generation
        if plan.get("prompt"):
            planner_gen = trace_span.start_observation(
                name="planner",
                as_type="generation",
                input={"prompt": plan.get("prompt", "")},
                model=config.get("planner_model", ""),
                metadata={"backend": config.get("planner_backend", "")},
            )
            planner_gen.update(
                output={
                    "plan": plan.get("parsed") or {},
                    "parse_error": plan.get("parse_error", False),
                }
            )
            planner_gen.end()

        # Vision tool generations — one per ToolTrace entry
        for raw_tt in vision.get("tool_trace") or []:
            tt = _as_dict(raw_tt)
            usage = _as_dict(_as_dict(tt.get("provider_metadata")).get("usage"))
            tool_gen = trace_span.start_observation(
                name="vision_qa_tool",
                as_type="generation",
                input={"question": question, "plan_steps": plan_steps},
                model=tt.get("model", config.get("vision_model", "")),
                metadata={
                    "backend": tt.get("backend", config.get("vision_backend", "")),
                    "elapsed_ms": tt.get("elapsed_ms"),
                },
                usage_details=_normalize_usage(usage) if usage else None,
            )
            tool_gen.update(output=vision_output)
            tool_gen.end()

        # Attach evaluation scores if provided
        if metrics:
            for key in _SCORE_KEYS:
                value = metrics.get(key)
                if isinstance(value, (int, float)):
                    # Scores are best-effort; a rejected score must not
                    # abort ingestion of the trace itself.
                    with contextlib.suppress(Exception):
                        trace_span.score_trace(name=key, value=float(value))


def ingest_dir(
    mep_dir: str,
    metrics_file: Optional[str] = None,
    project_name: str = "chartqapro-eval",
) -> int:
    """Ingest all MEPs from a directory. Returns the number ingested.

    Parameters
    ----------
    mep_dir : str
        Directory whose ``*.json`` files are each loaded as one MEP.
    metrics_file : str, optional
        JSONL file with one metrics row per line, matched by ``sample_id``.
    project_name : str
        Forwarded to :func:`ingest_mep` (unused there).

    Returns
    -------
    int
        Number of MEP files ingested without error; ``0`` when no client is
        configured or the directory holds no JSON files.
    """
    client = get_client()
    if client is None:
        print("[langfuse] No client — set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY")
        return 0

    metrics_by_id = _load_metrics(metrics_file)

    mep_files = sorted(Path(mep_dir).glob("*.json"))
    if not mep_files:
        print(f"[langfuse] No MEP JSON files found in {mep_dir}")
        return 0

    count = 0
    for fpath in mep_files:
        try:
            mep = json.loads(fpath.read_text(encoding="utf-8"))
            sample_id = _as_dict(mep.get("sample")).get("sample_id", "")
            ingest_mep(
                mep,
                client,
                # Only look up metrics for a real id; never match on "".
                metrics=metrics_by_id.get(sample_id) if sample_id else None,
                project_name=project_name,
            )
            count += 1
            print(f"  ingested {sample_id}")
        except Exception as exc:
            # Per-file boundary: report and continue with the next MEP.
            print(f"  ERROR {fpath.name}: {exc}")

    print(f"[langfuse] Ingested {count}/{len(mep_files)} MEPs from {mep_dir}")
    with contextlib.suppress(Exception):
        client.flush()  # type: ignore[union-attr]
    return count


def main() -> None:
    """Parse CLI arguments and ingest MEP files into Langfuse."""
    parser = argparse.ArgumentParser(description="Ingest existing MEPs into Langfuse")
    parser.add_argument(
        "--mep_dir", required=True, help="Directory containing MEP JSON files"
    )
    parser.add_argument(
        "--metrics_file",
        default=None,
        help="Optional metrics.jsonl for feedback scores",
    )
    parser.add_argument(
        "--project", default="chartqapro-eval", help="Langfuse project name (metadata)"
    )
    args = parser.parse_args()

    ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project)


if __name__ == "__main__":
    main()

# --- next patch hunk: new file langfuse_integration/prompts.py ---
# --- patch hunk: new file langfuse_integration/prompts.py ---
"""Versioned prompt loading via Langfuse Prompt Management.

Usage:
    # Load prompt (falls back to file if Langfuse unavailable)
    text = get_prompt(PLANNER_PROMPT_NAME, PLANNER_PROMPT_PATH)

    # Push current prompt files to Langfuse (run once before a new experiment)
    python -m agentic_chartqapro_eval.langfuse_integration.prompts
"""

import argparse
from pathlib import Path
from typing import Optional

from .client import get_client


# Prompt names as stored in Langfuse Prompt Management
PLANNER_PROMPT_NAME = "chartqapro_planner"
VISION_PROMPT_NAME = "chartqapro_vision"

# Label requested when reading prompts back. ``push_prompts`` creates versions
# without labels, and an unlabelled ``get_prompt`` resolves the ``production``
# label, so asking for ``latest`` is what makes pushed prompts retrievable.
_PROMPT_LABEL = "latest"


def get_prompt(name: str, fallback_path: Path) -> str:
    """Return the latest versioned prompt from Langfuse, or read from file.

    Parameters
    ----------
    name : str
        Prompt name in Langfuse Prompt Management.
    fallback_path : Path
        Local prompt file used when no client is configured, the prompt is
        not found, or the fetch fails.

    Returns
    -------
    str
        The compiled prompt text from Langfuse, or the file contents.
    """
    client = get_client()
    if client:
        try:
            prompt = client.get_prompt(name=name, label=_PROMPT_LABEL)
            if prompt:
                return prompt.compile()
        except Exception as exc:
            # Best-effort: report why the fallback is used instead of hiding it.
            print(f"[langfuse] Could not load prompt '{name}', using {fallback_path}: {exc}")
    return fallback_path.read_text(encoding="utf-8")


def _latest_prompt_text(client: object, name: str) -> Optional[str]:
    """Return the text of the newest stored version of *name*, or None if unavailable."""
    try:
        current = client.get_prompt(  # type: ignore[union-attr]
            name=name, label=_PROMPT_LABEL, cache_ttl_seconds=0
        )
    except Exception:
        # Includes "prompt does not exist yet" — treated as nothing stored.
        return None
    text = getattr(current, "prompt", None)
    return text if isinstance(text, str) else None


def push_prompts(
    planner_path: Optional[Path] = None,
    vision_path: Optional[Path] = None,
) -> None:
    """Upload current planner.txt and vision.txt to Langfuse Prompt Management.

    A prompt is pushed only when its text differs from the newest stored
    version, so calling this at the start of every run does not create a new
    identical version each time.

    Parameters
    ----------
    planner_path, vision_path : Path, optional
        Prompt files to upload; default to ``agents/prompts/planner.txt`` and
        ``agents/prompts/vision.txt`` next to this package.
    """
    client = get_client()
    if client is None:
        print("[langfuse] No client — skipping prompt push")
        return

    agents_dir = Path(__file__).parents[1] / "agents" / "prompts"
    planner_path = planner_path or (agents_dir / "planner.txt")
    vision_path = vision_path or (agents_dir / "vision.txt")

    for name, path in [
        (PLANNER_PROMPT_NAME, planner_path),
        (VISION_PROMPT_NAME, vision_path),
    ]:
        if not path.exists():
            print(f"[langfuse] Prompt file not found: {path}")
            continue
        text = path.read_text(encoding="utf-8")
        if _latest_prompt_text(client, name) == text:
            print(f"[langfuse] Prompt '{name}' unchanged — not pushing a new version")
            continue
        try:
            client.create_prompt(name=name, prompt=text, type="text")
            print(f"[langfuse] Pushed prompt '{name}'")
        except Exception as exc:
            print(f"[langfuse] Failed to push prompt '{name}': {exc}")


def main() -> None:
    """Parse CLI arguments and push prompt files to Langfuse Prompt Management."""
    parser = argparse.ArgumentParser(
        description="Push prompt files to Langfuse Prompt Management"
    )
    parser.add_argument("--planner", default=None, help="Path to planner.txt")
    parser.add_argument("--vision", default=None, help="Path to vision.txt")
    args = parser.parse_args()

    push_prompts(
        planner_path=Path(args.planner) if args.planner else None,
        vision_path=Path(args.vision) if args.vision else None,
    )


if __name__ == "__main__":
    main()


# --- patch hunk: new file langfuse_integration/tracing.py ---
"""Lightweight wrappers around Langfuse v4 observations for the MEP pipeline.

All helpers accept ``None`` as the client/trace and become no-ops, so the
rest of the codebase can call them unconditionally.
"""

import contextlib
from contextlib import contextmanager
from typing import Optional


# Langfuse usage_details key -> provider spellings, checked in order.
# OpenAI chat-completions names come first (the original precedence), then
# the ``*_tokens`` variants, then google-genai ``usage_metadata`` field names,
# and finally the already-normalised key itself.
_USAGE_ALIASES = {
    "input": ("prompt_tokens", "input_tokens", "prompt_token_count", "input"),
    "output": ("completion_tokens", "output_tokens", "candidates_token_count", "output"),
    "total": ("total_tokens", "total_token_count", "total"),
}


def _normalize_usage(usage: dict) -> dict:
    """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys.

    Returns a dict with any of ``input`` / ``output`` / ``total`` that could be
    resolved. Keys whose value is ``None`` are ignored. When nothing is
    recognised the original dict is returned unchanged; an empty or ``None``
    input yields ``{}``.
    """
    if not usage:
        return {}
    normalized: dict = {}
    for target, aliases in _USAGE_ALIASES.items():
        for alias in aliases:
            if alias in usage and usage[alias] is not None:
                normalized[target] = usage[alias]
                break
    return normalized or usage


class _TraceHandle:
    """Thin wrapper yielded by sample_trace; exposes a stable interface across callers.

    Attributes
    ----------
    id : str | None
        The Langfuse trace ID, usable for attaching scores after the run.
    """

    def __init__(self, span: object, trace_id: Optional[str]) -> None:
        self._span = span
        self.id = trace_id

    def update(self, **kwargs: object) -> None:
        """Update the root trace span (e.g. set output after the run)."""
        if self._span is not None:
            with contextlib.suppress(Exception):
                self._span.update(**kwargs)  # type: ignore[union-attr]

    def score_trace(self, name: str, value: float) -> None:
        """Attach a numeric score to the root trace."""
        if self._span is not None:
            with contextlib.suppress(Exception):
                self._span.score_trace(name=name, value=value)  # type: ignore[union-attr]


@contextmanager
def sample_trace(
    client: object,
    sample_id: str,
    question: str,
    expected_output: str,
    question_type: str,
    config_name: str,
    run_id: str,
    project_name: str = "chartqapro-eval",
):  # type: ignore[return]
    """Open a Langfuse trace for one sample; yield a _TraceHandle (or None).

    Yields ``None`` when ``client`` is ``None``. Otherwise opens a root span
    named ``chartqapro/{sample_id}``, tags it with ``run_id`` as the session
    id, and yields a handle wrapping that span. The client is flushed after
    the root span context has exited, so the finished span is part of the
    flush.
    """
    del project_name  # kept for API compatibility; Langfuse v4 uses project from SDK config
    if client is None:
        yield None
        return

    from langfuse import propagate_attributes

    try:
        with client.start_as_current_observation(  # type: ignore[union-attr]
            name=f"chartqapro/{sample_id}",
            as_type="span",
            input={"question": question, "expected_output": expected_output},
            metadata={
                "run_id": run_id,
                "config": config_name,
                "question_type": question_type,
            },
        ) as span:
            with propagate_attributes(session_id=run_id):
                trace_id: Optional[str] = None
                # A failed id lookup only costs later score attachment.
                with contextlib.suppress(Exception):
                    trace_id = client.get_current_trace_id()  # type: ignore[union-attr]
                yield _TraceHandle(span=span, trace_id=trace_id)
    finally:
        with contextlib.suppress(Exception):
            client.flush()  # type: ignore[union-attr]


def open_llm_span(
    trace: object,
    name: str,
    input_data: dict,
    model: str,
    metadata: Optional[dict] = None,
    parent_span_id: Optional[str] = None,
) -> object:
    """Create a Langfuse generation on the trace span (or return None).

    ``parent_span_id`` is accepted for API compatibility but is unused in v4 —
    nesting is handled by calling ``start_observation`` on the parent span.

    Returns ``None`` when ``trace`` is ``None``, when it carries no ``_span``,
    or when creating the observation raises.
    """
    del parent_span_id  # kept for API compatibility; v4 uses contextual nesting
    if trace is None:
        return None
    span = getattr(trace, "_span", None)
    if span is None:
        return None
    with contextlib.suppress(Exception):
        return span.start_observation(  # type: ignore[union-attr]
            name=name,
            as_type="generation",
            input=input_data,
            model=model,
            metadata=metadata or {},
        )
    return None


def close_span(
    span: object,
    output: Optional[dict] = None,
    usage: Optional[dict] = None,
    error: Optional[str] = None,
) -> None:
    """End a Langfuse generation (no-op if span is None).

    ``output``, normalised ``usage`` and an ``ERROR`` level with ``error`` as
    status message are applied via ``span.update`` when given. ``span.end()``
    is always attempted, even if the update raised, so a rejected payload
    cannot leave the observation open.
    """
    if span is None:
        return

    update_kwargs: dict = {}
    if output is not None:
        update_kwargs["output"] = output
    if usage:
        with contextlib.suppress(Exception):
            update_kwargs["usage_details"] = _normalize_usage(usage)
    if error:
        update_kwargs["level"] = "ERROR"
        update_kwargs["status_message"] = error

    if update_kwargs:
        with contextlib.suppress(Exception):
            span.update(**update_kwargs)  # type: ignore[union-attr]
    with contextlib.suppress(Exception):
        span.end()  # type: ignore[union-attr]


def log_trace_scores(trace: object, scores: dict) -> None:
    """Log a dict of {metric_name: float} as scores on the trace.

    Non-numeric values are skipped; nothing happens when ``trace`` is ``None``
    or has no ``score_trace`` method.
    """
    if trace is None or not hasattr(trace, "score_trace"):
        return
    for name, value in scores.items():
        if isinstance(value, (int, float)):
            with contextlib.suppress(Exception):
                trace.score_trace(name=name, value=float(value))  # type: ignore[union-attr]

# --- next patch hunk: mep/schema.py ---
b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/mep/schema.py @@ -157,7 +157,7 @@ class MEP: verifier: Optional[MEPVerifier] = None # Pass 2.5 — None when skipped timestamps: Optional[MEPTimestamps] = None errors: List[str] = field(default_factory=list) - opik_trace_id: Optional[str] = None # set when Opik tracing is active + lf_trace_id: Optional[str] = None # set when Langfuse tracing is active def to_dict(self) -> dict: """Return a dict representation suitable for JSON serialization.""" diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py index 6940c24..e3eca3b 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py @@ -37,10 +37,10 @@ MEPVision, ) from ..mep.writer import write_mep -from ..opik_integration.client import get_client -from ..opik_integration.dataset import register_dataset -from ..opik_integration.prompts import push_prompts -from ..opik_integration.tracing import ( +from ..langfuse_integration.client import get_client +from ..langfuse_integration.dataset import register_dataset +from ..langfuse_integration.prompts import push_prompts +from ..langfuse_integration.tracing import ( log_trace_scores, sample_trace, ) @@ -114,7 +114,7 @@ def process_sample( # noqa: PLR0915 config: dict, run_id: str, out_dir: str, - opik_client=None, + lf_client=None, verifier_agent: Optional[VerifierAgent] = None, ocr_tool: Optional[OcrReaderTool] = None, ) -> str: @@ -155,15 +155,15 @@ def process_sample( # noqa: PLR0915 errors: list = [] with sample_trace( - opik_client, + lf_client, sample_id=sample.sample_id, question=sample.question, expected_output=sample.expected_output, question_type=sample.question_type.value, config_name=config_name, run_id=run_id, - ) as opik_trace: - 
opik_trace_id = getattr(opik_trace, "id", None) + ) as lf_trace: + lf_trace_id = getattr(lf_trace, "id", None) # ---- Planner ---- plan_prompt = "" @@ -174,7 +174,8 @@ def process_sample( # noqa: PLR0915 try: with timed() as pt: - plan_prompt, plan_parsed, plan_parse_error, plan_raw = planner.run(sample, opik_trace=opik_trace) + plan_prompt, plan_parsed, plan_parse_error, plan_raw = planner.run(sample, lf_trace=lf_trace) + plan_ms = pt.elapsed_ms except Exception as exc: errors.append(f"planner_error: {exc}") @@ -192,7 +193,7 @@ def process_sample( # noqa: PLR0915 if ocr_tool is not None: try: - ocr_tool.opik_trace = opik_trace + ocr_tool.lf_trace = lf_trace with timed() as ot: ocr_raw = ocr_tool._run(sample.image_path) ocr_ms = ot.elapsed_ms @@ -225,7 +226,7 @@ def process_sample( # noqa: PLR0915 ) = vision_agent.run( sample, plan_parsed, - opik_trace=opik_trace, + lf_trace=lf_trace, ocr_result=ocr_parsed if ocr_parsed else None, ) vision_ms = vt.elapsed_ms @@ -251,7 +252,8 @@ def process_sample( # noqa: PLR0915 verifier_parsed, verifier_parse_error, verifier_raw, - ) = verifier_agent.run(sample, plan_parsed, vision_parsed, opik_trace=opik_trace) + ) = verifier_agent.run(sample, plan_parsed, vision_parsed, lf_trace=lf_trace) + verifier_ms = vrt.elapsed_ms verifier_verdict = verifier_parsed.get("verdict", "confirmed") except Exception as exc: @@ -331,20 +333,20 @@ def process_sample( # noqa: PLR0915 verifier_ms=verifier_ms, ), errors=errors, - opik_trace_id=opik_trace_id, + lf_trace_id=lf_trace_id, ) - # ---- Immediately log available scores to Opik ---- + # ---- Immediately log available scores to Langfuse ---- log_trace_scores( - opik_trace, + lf_trace, { "planner_parse_ok": float(not plan_parse_error), "vision_parse_ok": float(not vision_parse_error), "has_errors": float(bool(errors)), }, ) - if opik_trace: - opik_trace.end(output=vision_parsed if vision_parsed else None) + if lf_trace: + lf_trace.update(output=vision_parsed if vision_parsed else None) return 
write_mep(mep, out_dir) @@ -434,14 +436,14 @@ def main() -> None: # noqa: PLR0912, PLR0915 print(f"Output dir : {out_dir}") print(f"Workers : {args.workers}") - # Opik: register dataset + version prompts at run start (no-ops if unavailable) - opik_client = get_client() - if opik_client: - print("Opik : enabled") + # Langfuse: register dataset + version prompts at run start (no-ops if unavailable) + lf_client = get_client() + if lf_client: + print("Langfuse : enabled") register_dataset(samples, split=args.split) push_prompts() else: - print("Opik : not configured (set OPIK_URL_OVERRIDE to enable)") + print("Langfuse : not configured (set LANGFUSE_PUBLIC_KEY + LANGFUSE_SECRET_KEY to enable)") # Build agents once — run() creates fresh Crew/Tool per call so this is thread-safe print("Initialising agents …") @@ -480,7 +482,7 @@ def main() -> None: # noqa: PLR0912, PLR0915 config, run_id, out_dir, - opik_client, + lf_client, verifier, ocr, ) @@ -498,7 +500,7 @@ def main() -> None: # noqa: PLR0912, PLR0915 config, run_id, out_dir, - opik_client, + lf_client, verifier, ocr, ): s diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py index aa58680..41677dd 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py @@ -18,7 +18,7 @@ from openai import OpenAI from pydantic import BaseModel, Field, PrivateAttr -from ..opik_integration.tracing import close_span, open_llm_span +from ..langfuse_integration.tracing import close_span, open_llm_span _OCR_PROMPT = """\ @@ -77,7 +77,7 @@ class OcrReaderTool(BaseTool): backend: str = "gemini" model: str = "gemini-2.5-flash-lite" api_key: str = "" - opik_trace: Optional[Any] = None + lf_trace: Optional[Any] = None _traces: list = PrivateAttr(default_factory=list) @@ -116,7 
+116,7 @@ def _run(self, image_path: str) -> str: t0 = time.time() opik_span = open_llm_span( - self.opik_trace, + self.lf_trace, name="ocr_reader_tool", input_data={"image_path": image_path}, model=self.model, diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py index b68ffec..01c3c9c 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py @@ -18,7 +18,7 @@ from openai import OpenAI from pydantic import BaseModel, Field, PrivateAttr -from ..opik_integration.tracing import close_span, open_llm_span +from ..langfuse_integration.tracing import close_span, open_llm_span class VisionQAInput(BaseModel): @@ -46,7 +46,7 @@ class VisionQATool(BaseTool): backend: str = "gemini" # "openai" | "gemini" model: str = "gemini-2.5-flash-lite" api_key: str = "" - opik_trace: Optional[Any] = None # Opik Trace object for span creation + lf_trace: Optional[Any] = None # Langfuse Trace object for span creation # Private mutable trace storage (not a Pydantic field) _traces: list = PrivateAttr(default_factory=list) @@ -100,8 +100,8 @@ def _run( start_ts = datetime.now(timezone.utc).isoformat() t0 = time.time() - opik_span = open_llm_span( - self.opik_trace, + lf_span = open_llm_span( + self.lf_trace, name="vision_qa_tool", input_data={ "image_path": image_path, @@ -133,7 +133,7 @@ def _run( usage = provider_meta.get("usage", {}) close_span( - opik_span, + lf_span, output={"raw_text": raw_text}, usage=usage if usage else None, error=error_str, diff --git a/pyproject.toml b/pyproject.toml index 75916cf..6e9500a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,6 @@ dependencies = [ agentic-xai-eval = [ "crewai>=1.6.1", "google-genai>=1.67.0", - "opik>=1.10.40", "streamlit>=1.55.0", ] dev = [ @@ -182,4 +181,4 @@ 
requires = ["setuptools>=68", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -where = ["implementations/agentic_vqa_eval/src"] +where = ["implementations/agentic_vqa_eval/src"] \ No newline at end of file From baaac24304d613b464f4b8721a01f08f51a212e8 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Wed, 18 Mar 2026 18:36:04 -0400 Subject: [PATCH 2/9] Refactor observability integration: remove Opik references, streamline Langfuse integration, and tidy up code formatting --- .../agents/planner_agent.py | 4 +- .../eval/error_taxonomy.py | 2 +- .../eval/eval_outputs.py | 8 +- .../agentic_chartqapro_eval/eval/eval_topk.py | 6 +- .../langfuse_integration/__init__.py | 2 +- .../langfuse_integration/client.py | 33 ++- .../langfuse_integration/dataset.py | 44 +++- .../langfuse_integration/ingest.py | 8 +- .../langfuse_integration/prompts.py | 4 +- .../langfuse_integration/tracing.py | 126 +++++++-- .../opik_integration/__init__.py | 1 - .../opik_integration/client.py | 74 ------ .../opik_integration/dataset.py | 85 ------- .../opik_integration/ingest.py | 240 ------------------ .../opik_integration/prompts.py | 80 ------ .../opik_integration/tracing.py | 177 ------------- .../runner/run_generate_meps.py | 14 +- .../mechanistic_interpretability/README.md | 1 - pyproject.toml | 2 +- 19 files changed, 175 insertions(+), 736 deletions(-) delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py delete mode 100644 
implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py index 71d554f..5e34591 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py @@ -137,9 +137,7 @@ def __init__( self.api_key = api_key self._llm = _build_llm(backend, model, api_key) - def run( - self, sample: PerceivedSample, lf_trace: Any = None - ) -> Tuple[str, dict, bool, str]: + def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict, bool, str]: """ Execute the planning phase for a new question. diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py index 78b357b..554a248 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py @@ -23,8 +23,8 @@ from google import genai from openai import OpenAI -from ..mep.writer import iter_meps from ..langfuse_integration.client import get_client +from ..mep.writer import iter_meps from ..utils.json_strict import parse_strict diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py index bb1a412..711998e 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py @@ -16,8 +16,8 @@ from dotenv import load_dotenv -from ..mep.writer import iter_meps from ..langfuse_integration.client import get_client +from 
..mep.writer import iter_meps from .judge import judge_mep @@ -145,11 +145,7 @@ def evaluate_mep( score_keys = ["answer_accuracy", "latency_sec"] + ( [f"judge_{k}" for k in judge_scores] if use_judge else [] ) - scores = { - k: metrics[k] - for k in score_keys - if isinstance(metrics.get(k), (int, float)) - } + scores = {k: metrics[k] for k in score_keys if isinstance(metrics.get(k), (int, float))} for k, v in scores.items(): with contextlib.suppress(Exception): client.create_score(trace_id=lf_trace_id, name=k, value=float(v)) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py index 9a51a7e..99df7b6 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py @@ -232,9 +232,7 @@ def main() -> None: cands = result["topk_candidates"] h1 = result.get("hit_at_1", 0) h3 = result.get(f"hit_at_{args.k}", 0) - print( - f" {sid} exp={exp!r} candidates={cands} hit@1={h1} hit@{args.k}={h3}" - ) + print(f" {sid} exp={exp!r} candidates={cands} hit@1={h1} hit@{args.k}={h3}") lf_trace_id = mep.get("lf_trace_id") if lf_client and lf_trace_id: @@ -247,7 +245,7 @@ def main() -> None: name=key, value=float(result[key]), ) - + count += 1 except Exception as exc: print(f" Error: {exc}") diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py index 32d60f4..83aa60e 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py @@ -1 +1 @@ -"""Langfuse observability integration — tracing, prompt versioning, dataset registration.""" +"""Langfuse observability 
integration.""" diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py index 5acfa2e..c725ce9 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py @@ -5,6 +5,10 @@ """ import os +from contextlib import suppress + +from dotenv import load_dotenv +from langfuse import Langfuse _client = None @@ -12,19 +16,26 @@ def get_client(): - """Return a configured langfuse.Langfuse() instance, or None if unavailable.""" + """ + Initialize and return a globally cached Langfuse client. + + Retrieves configuration from environment variables and configures + the SDK for local or cloud usage. + + Returns + ------- + Langfuse or None + An active client, or None if configuration is missing or invalid. + """ global _client, _initialised # noqa: PLW0603 if _initialised: return _client _initialised = True - try: - from dotenv import load_dotenv - + # Load environment variables from .env file + with suppress(Exception): load_dotenv() - except ImportError: - pass public_key = os.environ.get("LANGFUSE_PUBLIC_KEY", "") secret_key = os.environ.get("LANGFUSE_SECRET_KEY", "") @@ -33,8 +44,6 @@ def get_client(): return None try: - from langfuse import Langfuse - kwargs: dict = {"public_key": public_key, "secret_key": secret_key} # Accept LANGFUSE_HOST or LANGFUSE_BASE_URL (both are common) host = os.environ.get("LANGFUSE_HOST") or os.environ.get("LANGFUSE_BASE_URL", "") @@ -50,7 +59,13 @@ def get_client(): def reset_client() -> None: - """Force re-initialisation on next call (useful for tests).""" + """ + Clear the cached client and reset initialization state. 
+ + Returns + ------- + None + """ global _client, _initialised # noqa: PLW0603 _client = None _initialised = False diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py index ab0f1b7..d0d0c47 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py @@ -8,6 +8,7 @@ import argparse from typing import Optional +from ..datasets.chartqapro_loader import load_chartqapro from .client import get_client @@ -16,9 +17,24 @@ def register_dataset( dataset_name: str = "ChartQAPro", split: str = "test", ) -> Optional[str]: - """Insert PerceivedSamples into a Langfuse Dataset named ``{dataset_name}_{split}``. + """ + Upload a collection of samples as a Langfuse Dataset. + + Allows for versioned dataset management and evaluation in the Langfuse UI. - Returns the dataset name, or None if Langfuse is unavailable. + Parameters + ---------- + samples : list of PerceivedSample + The data samples to register. + dataset_name : str, default 'ChartQAPro' + The base name for the dataset. + split : str, default 'test' + The split identifier (e.g., 'train', 'val'). + + Returns + ------- + str or None + The name of the created dataset if successful, else None. 
""" client = get_client() if client is None: @@ -27,11 +43,11 @@ def register_dataset( name = f"{dataset_name}_{split}" try: client.create_dataset(name=name) - for s in samples: + [ client.create_dataset_item( dataset_name=name, input={ - "source_id": s.sample_id, + "source_id": s.sample_id, # stored as data field; Langfuse auto-generates UUID v7 id "question": s.question, "question_type": s.question_type.value, "image_path": s.image_path or "", @@ -39,6 +55,8 @@ def register_dataset( }, expected_output=s.expected_output, ) + for s in samples + ] print(f"[langfuse] Registered {len(samples)} samples → dataset '{name}'") return name except Exception as exc: @@ -47,21 +65,21 @@ def register_dataset( def main() -> None: - """Register ChartQAPro dataset samples in Langfuse.""" - parser = argparse.ArgumentParser( - description="Register ChartQAPro samples as Langfuse dataset" - ) + """ + Command-line interface for registering ChartQAPro datasets. + + Returns + ------- + None + """ + parser = argparse.ArgumentParser(description="Register ChartQAPro samples as Langfuse dataset") parser.add_argument("--split", default="test") parser.add_argument("--n", type=int, default=25) parser.add_argument("--image_dir", default="data/chartqapro_images") parser.add_argument("--cache_dir", default=None) args = parser.parse_args() - from ..datasets.chartqapro_loader import load_chartqapro - - samples = load_chartqapro( - split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir - ) + samples = load_chartqapro(split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir) register_dataset(samples, split=args.split) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py index 2ec4283..c6d153b 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py +++ 
b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py @@ -154,17 +154,13 @@ def ingest_dir( def main() -> None: """Parse CLI arguments and ingest MEP files into Langfuse.""" parser = argparse.ArgumentParser(description="Ingest existing MEPs into Langfuse") - parser.add_argument( - "--mep_dir", required=True, help="Directory containing MEP JSON files" - ) + parser.add_argument("--mep_dir", required=True, help="Directory containing MEP JSON files") parser.add_argument( "--metrics_file", default=None, help="Optional metrics.jsonl for feedback scores", ) - parser.add_argument( - "--project", default="chartqapro-eval", help="Langfuse project name (metadata)" - ) + parser.add_argument("--project", default="chartqapro-eval", help="Langfuse project name (metadata)") args = parser.parse_args() ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py index ec81d02..35aece4 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py @@ -64,9 +64,7 @@ def push_prompts( def main() -> None: """Parse CLI arguments and push prompt files to Langfuse Prompt Management.""" - parser = argparse.ArgumentParser( - description="Push prompt files to Langfuse Prompt Management" - ) + parser = argparse.ArgumentParser(description="Push prompt files to Langfuse Prompt Management") parser.add_argument("--planner", default=None, help="Path to planner.txt") parser.add_argument("--vision", default=None, help="Path to vision.txt") args = parser.parse_args() diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py 
b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py index 7a8a84c..546c99c 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py @@ -8,6 +8,8 @@ from contextlib import contextmanager from typing import Optional +from langfuse import propagate_attributes + def _normalize_usage(usage: dict) -> dict: """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys.""" @@ -65,32 +67,58 @@ def sample_trace( run_id: str, project_name: str = "chartqapro-eval", ): # type: ignore[return] - """Open a Langfuse trace for one sample; yield a _TraceHandle (or None).""" + """ + Context manager to create a Langfuse trace for a single sample. + + Parameters + ---------- + client : object + The Langfuse client. If None, the context manager yields None. + sample_id : str + Unique identifier for the sample. + question : str + The input prompt text. + expected_output : str + The ground truth answer. + question_type : str + The category of the question. + config_name : str + The evaluation configuration used. + run_id : str + The unique ID of the pipeline run. + project_name : str, default 'chartqapro-eval' + Langfuse project identifier (ignored in v4; use SDK config). + + Yields + ------ + trace_handle : _TraceHandle or None + The initialized trace object. 
+ """ del project_name # kept for API compatibility; Langfuse v4 uses project from SDK config if client is None: yield None return - from langfuse import propagate_attributes - - with client.start_as_current_observation( # type: ignore[union-attr] - name=f"chartqapro/{sample_id}", - as_type="span", - input={"question": question, "expected_output": expected_output}, - metadata={ - "run_id": run_id, - "config": config_name, - "question_type": question_type, - }, - ) as span: - with propagate_attributes(session_id=run_id): - trace_id = client.get_current_trace_id() # type: ignore[union-attr] - handle = _TraceHandle(span=span, trace_id=trace_id) - try: - yield handle - finally: - with contextlib.suppress(Exception): - client.flush() # type: ignore[union-attr] + with ( + client.start_as_current_observation( # type: ignore[union-attr] + name=f"chartqapro/{sample_id}", + as_type="span", + input={"question": question, "expected_output": expected_output}, + metadata={ + "run_id": run_id, + "config": config_name, + "question_type": question_type, + }, + ) as span, + propagate_attributes(session_id=run_id), + ): + trace_id = client.get_current_trace_id() # type: ignore[union-attr] + handle = _TraceHandle(span=span, trace_id=trace_id) + try: + yield handle + finally: + with contextlib.suppress(Exception): + client.flush() # type: ignore[union-attr] def open_llm_span( @@ -101,10 +129,31 @@ def open_llm_span( metadata: Optional[dict] = None, parent_span_id: Optional[str] = None, ) -> object: - """Create a Langfuse generation on the trace span (or return None). + """ + Begin a Langfuse generation observation on the given trace span. ``parent_span_id`` is accepted for API compatibility but is unused in v4 — nesting is handled by calling ``start_observation`` on the parent span. + + Parameters + ---------- + trace : object + The parent trace or span. + name : str + Logical name for the operation. + input_data : dict + Model inputs. + model : str + Model identifier. 
+ metadata : dict, optional + Additional context keys. + parent_span_id : str, optional + Explicit parent linkage (ignored in v4; nesting is contextual). + + Returns + ------- + object or None + The active span object. """ del parent_span_id # kept for API compatibility; v4 uses contextual nesting if trace is None: @@ -129,7 +178,23 @@ def close_span( usage: Optional[dict] = None, error: Optional[str] = None, ) -> None: - """End a Langfuse generation (no-op if span is None).""" + """Log results and terminate an active span. + + Parameters + ---------- + span : object + The span to close. + output : dict, optional + The model output to log. + usage : dict, optional + The provider usage dict (e.g. OpenAI or Gemini keys). + error : str, optional + An error message to log (if any). + + Returns + ------- + None + """ if span is None: return with contextlib.suppress(Exception): @@ -147,7 +212,20 @@ def close_span( def log_trace_scores(trace: object, scores: dict) -> None: - """Log a dict of {metric_name: float} as scores on the trace.""" + """ + Attach quantitative feedback scores to a trace. + + Parameters + ---------- + trace : object + The trace to update. + scores : dict + Mapping of metric names to numeric values. 
+ + Returns + ------- + None + """ if trace is None: return for name, value in scores.items(): diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py deleted file mode 100644 index 9bf02f2..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Opik observability integration — tracing, prompt versioning, dataset registration.""" diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py deleted file mode 100644 index 2eebaf7..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py +++ /dev/null @@ -1,74 +0,0 @@ -"""Opik client singleton with graceful degradation. - -Returns None when OPIK_URL_OVERRIDE / OPIK_API_KEY is not set or opik is not -installed, so every caller can guard with ``if client:``. -""" - -import os -from contextlib import suppress - -import opik -from dotenv import load_dotenv - - -_client = None -_initialised = False - - -def get_client(): - """ - Initialize and return a globally cached Opik client. - - Retrieves configuration from environment variables and configures - the SDK for local or cloud usage. - - Returns - ------- - Opik or None - An active client, or None if configuration is missing or invalid. 
- """ - global _client, _initialised # noqa: PLW0603 - if _initialised: - return _client - - _initialised = True - - # Load environment variables from .env file - with suppress(Exception): - load_dotenv() - - url = os.environ.get("OPIK_URL_OVERRIDE", "") - api_key = os.environ.get("OPIK_API_KEY", "") - - if not url and not api_key: - return None - - try: - if url: - # Opik SDK expects the base URL without /api suffix - base_url = url.rstrip("/") - if base_url.endswith("/api"): - base_url = base_url[:-4] - opik.configure(url=base_url, use_local=True, force=True, automatic_approvals=True) - else: - opik.configure(api_key=api_key, force=True, automatic_approvals=True) - - _client = opik.Opik() - except Exception as exc: - print(f"[opik] client init failed: {exc}") - _client = None - - return _client - - -def reset_client() -> None: - """ - Clear the cached Opik client and force re-initialization. - - Returns - ------- - None - """ - global _client, _initialised # noqa: PLW0603 - _client = None - _initialised = False diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py deleted file mode 100644 index d4ac507..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py +++ /dev/null @@ -1,85 +0,0 @@ -"""Register ChartQAPro samples as an Opik Dataset. - -Usage: - uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.dataset \ - --split test --n 25 -""" - -import argparse -from typing import Optional - -from ..datasets.chartqapro_loader import load_chartqapro -from .client import get_client - - -def register_dataset( - samples, - dataset_name: str = "ChartQAPro", - split: str = "test", -) -> Optional[object]: - """ - Upload a collection of samples as an Opik Dataset. - - Allows for versioned dataset management and evaluation in the Opik UI. 
- - Parameters - ---------- - samples : list of PerceivedSample - The data samples to register. - dataset_name : str, default 'ChartQAPro' - The base name for the dataset. - split : str, default 'test' - The split identifier (e.g., 'train', 'val'). - - Returns - ------- - Dataset or None - The Opik Dataset object if successful. - """ - client = get_client() - if client is None: - return None - - name = f"{dataset_name}_{split}" - try: - dataset = client.get_or_create_dataset(name=name) - items = [ - { - "source_id": s.sample_id, # stored as data field; Opik auto-generates UUID v7 id - "question": s.question, - "expected_output": s.expected_output, - "question_type": s.question_type.value, - "image_path": s.image_path or "", - "choices": s.choices or [], - } - for s in samples - ] - dataset.insert(items) - print(f"[opik] Registered {len(items)} samples → dataset '{name}'") - return dataset - except Exception as exc: - print(f"[opik] Dataset registration failed: {exc}") - return None - - -def main() -> None: - """ - Command-line interface for registering ChartQAPro datasets. 
- - Returns - ------- - None - """ - parser = argparse.ArgumentParser(description="Register ChartQAPro samples as Opik dataset") - parser.add_argument("--split", default="test") - parser.add_argument("--n", type=int, default=25) - parser.add_argument("--image_dir", default="data/chartqapro_images") - parser.add_argument("--cache_dir", default=None) - args = parser.parse_args() - - samples = load_chartqapro(split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir) - register_dataset(samples, split=args.split) - - -if __name__ == "__main__": - main() diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py deleted file mode 100644 index d8baf39..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py +++ /dev/null @@ -1,240 +0,0 @@ -"""Retroactive ingestion: convert existing MEP JSON files to Opik Traces. - -This lets you visualise runs that completed before Opik was wired in. - -Usage: - uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.ingest \ - --mep_dir meps/openai_openai/chartqapro/test \ - [--metrics_file metrics.jsonl] -""" - -import argparse -import contextlib -import json -from datetime import datetime, timedelta -from pathlib import Path -from typing import Optional - -from .client import get_client - - -def _parse_ts(iso: Optional[str]) -> Optional[datetime]: - if not iso: - return None - try: - return datetime.fromisoformat(iso) - except ValueError: - return None - - -def ingest_mep( - mep: dict, - client, - metrics: Optional[dict] = None, - project_name: str = "chartqapro-eval", -) -> None: - """ - Convert a single MEP JSON record into a retroactive Opik trace. - - Parameters - ---------- - mep : dict - The raw MEP record. - client : object - The Opik client. - metrics : dict, optional - Pre-computed metrics for the sample. 
- project_name : str, default 'chartqapro-eval' - Target project. - - Returns - ------- - None - """ - sample = mep.get("sample", {}) - plan = mep.get("plan", {}) - vision = mep.get("vision", {}) - timestamps = mep.get("timestamps", {}) - config = mep.get("config", {}) - - sample_id = sample.get("sample_id", "unknown") - config_name = config.get("config_name", "unknown") - question_type = sample.get("question_type", "standard") - question = sample.get("question", "") - expected = sample.get("expected_output", "") - vision_parsed = vision.get("parsed", {}) - - start_time = _parse_ts(timestamps.get("start")) - end_time = _parse_ts(timestamps.get("end")) - planner_ms = timestamps.get("planner_ms") or 0 - - trace = client.trace( - name=f"chartqapro/{sample_id}", - start_time=start_time, - end_time=end_time, - input={"question": question, "expected_output": expected}, - output=vision_parsed if vision_parsed else None, - tags=[config_name, question_type, "chartqapro", "retroactive"], - metadata={ - "run_id": mep.get("run_id", ""), - "config": config_name, - "question_type": question_type, - "schema_version": mep.get("schema_version", ""), - "has_errors": bool(mep.get("errors")), - }, - project_name=project_name, - ) - - # Planner span — estimate its time window from the start - if plan.get("prompt"): - p_start = start_time - p_end = None - if start_time and planner_ms: - p_end = start_time + timedelta(milliseconds=planner_ms) - planner_span = trace.span( - name="planner", - type="llm", - start_time=p_start, - end_time=p_end, - input={"prompt": plan.get("prompt", "")}, - output={ - "plan": plan.get("parsed", {}), - "parse_error": plan.get("parse_error", False), - }, - model=config.get("planner_model", ""), - metadata={"backend": config.get("planner_backend", "")}, - ) - planner_span.end() - - # Vision tool spans — one per ToolTrace entry - for tt in vision.get("tool_trace", []): - ts_start = _parse_ts(tt.get("start_ts")) - ts_end = _parse_ts(tt.get("end_ts")) - usage = 
tt.get("provider_metadata", {}).get("usage", {}) - tool_span = trace.span( - name="vision_qa_tool", - type="llm", - start_time=ts_start, - end_time=ts_end, - input={ - "question": question, - "plan_steps": plan.get("parsed", {}).get("steps", []), - }, - output=vision_parsed if vision_parsed else None, - model=tt.get("model", config.get("vision_model", "")), - usage=usage if usage else None, - metadata={ - "backend": tt.get("backend", config.get("vision_backend", "")), - "elapsed_ms": tt.get("elapsed_ms"), - }, - ) - tool_span.end() - - trace.end() - - # Log feedback scores from the matching metrics row - if metrics: - scores_to_log = {} - for key in [ - "answer_accuracy", - "judge_explanation_quality", - "judge_hallucination_rate", - "judge_plan_coverage", - "judge_plan_adherence", - "judge_faithfulness_alignment", - ]: - if key in metrics and isinstance(metrics[key], (int, float)): - scores_to_log[key] = float(metrics[key]) - for name, value in scores_to_log.items(): - with contextlib.suppress(Exception): - trace.log_feedback_score(name=name, value=value) - - -def ingest_dir( - mep_dir: str, - metrics_file: Optional[str] = None, - project_name: str = "chartqapro-eval", -) -> int: - """ - Bulk ingest all MEP files from a local directory into Opik. - - Parameters - ---------- - mep_dir : str - Path to the folder containing JSON results. - metrics_file : str, optional - Path to a .jsonl file with metrics data. - project_name : str, default 'chartqapro-eval' - Opik project identifier. - - Returns - ------- - int - The total number of successfully ingested records. 
- """ - client = get_client() - if client is None: - print("[opik] No client — set OPIK_URL_OVERRIDE or OPIK_API_KEY") - return 0 - - # Build sample_id → metrics lookup if provided - metrics_by_id: dict = {} - if metrics_file and Path(metrics_file).exists(): - with open(metrics_file) as f: - for raw_line in f: - line = raw_line.strip() - if line: - row = json.loads(line) - metrics_by_id[row.get("sample_id", "")] = row - - mep_path = Path(mep_dir) - mep_files = list(mep_path.glob("*.json")) - if not mep_files: - print(f"[opik] No MEP JSON files found in {mep_dir}") - return 0 - - count = 0 - for fpath in sorted(mep_files): - try: - mep = json.loads(fpath.read_text()) - sample_id = mep.get("sample", {}).get("sample_id", "") - ingest_mep( - mep, - client, - metrics=metrics_by_id.get(sample_id), - project_name=project_name, - ) - count += 1 - print(f" ingested {sample_id}") - except Exception as exc: - print(f" ERROR {fpath.name}: {exc}") - - print(f"[opik] Ingested {count}/{len(mep_files)} MEPs from {mep_dir}") - with contextlib.suppress(Exception): - client.flush() - return count - - -def main() -> None: - """ - Command-line interface for retroactive ingestion into Opik. 
- - Returns - ------- - None - """ - parser = argparse.ArgumentParser(description="Ingest existing MEPs into Opik") - parser.add_argument("--mep_dir", required=True, help="Directory containing MEP JSON files") - parser.add_argument( - "--metrics_file", - default=None, - help="Optional metrics.jsonl for feedback scores", - ) - parser.add_argument("--project", default="chartqapro-eval", help="Opik project name") - args = parser.parse_args() - - ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project) - - -if __name__ == "__main__": - main() diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py deleted file mode 100644 index 44944c8..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py +++ /dev/null @@ -1,80 +0,0 @@ -"""Versioned prompt loading via Opik Prompt Library. - -Usage: - # Load prompt (falls back to file if Opik unavailable) - text = get_prompt("planner_prompt", PLANNER_PROMPT_PATH) - - # Push current prompt files to Opik (run once before a new experiment) - uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.prompts -""" - -import argparse -from pathlib import Path -from typing import Optional - -from .client import get_client - - -# Prompt names as stored in the Opik Prompt Library -PLANNER_PROMPT_NAME = "chartqapro_planner" -VISION_PROMPT_NAME = "chartqapro_vision" - - -def get_prompt(name: str, fallback_path: Path) -> str: - """Return the latest versioned prompt from Opik, or read from file.""" - client = get_client() - if client: - try: - prompt = client.get_prompt(name=name) - if prompt: - return prompt.format() - except Exception: - pass - return fallback_path.read_text() - - -def push_prompts( - planner_path: Optional[Path] = None, - vision_path: Optional[Path] = None, -) -> None: - """Upload current planner.txt and vision.txt to Opik 
Prompt Library.""" - client = get_client() - if client is None: - print("[opik] No client — skipping prompt push") - return - - # Resolve default paths relative to the agents/prompts directory - agents_dir = Path(__file__).parents[1] / "agents" / "prompts" - planner_path = planner_path or (agents_dir / "planner.txt") - vision_path = vision_path or (agents_dir / "vision.txt") - - for name, path in [ - (PLANNER_PROMPT_NAME, planner_path), - (VISION_PROMPT_NAME, vision_path), - ]: - if not path.exists(): - print(f"[opik] Prompt file not found: {path}") - continue - text = path.read_text() - try: - prompt = client.create_prompt(name=name, prompt=text) - print(f"[opik] Pushed prompt '{name}' (commit={prompt.commit})") - except Exception as exc: - print(f"[opik] Failed to push prompt '{name}': {exc}") - - -def main() -> None: - """Parse CLI arguments and push prompt files to the Opik Prompt Library.""" - parser = argparse.ArgumentParser(description="Push prompt files to Opik Prompt Library") - parser.add_argument("--planner", default=None, help="Path to planner.txt") - parser.add_argument("--vision", default=None, help="Path to vision.txt") - args = parser.parse_args() - - push_prompts( - planner_path=Path(args.planner) if args.planner else None, - vision_path=Path(args.vision) if args.vision else None, - ) - - -if __name__ == "__main__": - main() diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py deleted file mode 100644 index 92d84b4..0000000 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py +++ /dev/null @@ -1,177 +0,0 @@ -"""Lightweight wrappers around opik Trace/Span for the MEP pipeline. - -All helpers accept ``None`` as the client/trace and become no-ops, so the -rest of the codebase can call them unconditionally. 
-""" - -import contextlib -from contextlib import contextmanager -from datetime import datetime, timezone -from typing import Optional - -from opik.types import ErrorInfoDict - - -def _now() -> datetime: - return datetime.now(timezone.utc) - - -@contextmanager -def sample_trace( - client, - sample_id: str, - question: str, - expected_output: str, - question_type: str, - config_name: str, - run_id: str, - project_name: str = "chartqapro-eval", -): - """ - Context manager to open and automatically close an Opik trace. - - Parameters - ---------- - client : object - The Opik client. If None, the context manager yields None. - sample_id : str - Unique identifier for the sample. - question : str - The input prompt text. - expected_output : str - The ground truth answer. - question_type : str - The category of the question. - config_name : str - The evaluation configuration used. - run_id : str - The unique ID of the pipeline run. - project_name : str, default 'chartqapro-eval' - Opik project identifier. - - Yields - ------ - trace : object or None - The initialized trace object. - """ - if client is None: - yield None - return - - trace = client.trace( - name=f"chartqapro/{sample_id}", - input={"question": question, "expected_output": expected_output}, - tags=[config_name, question_type, "chartqapro"], - metadata={ - "run_id": run_id, - "config": config_name, - "question_type": question_type, - }, - project_name=project_name, - ) - try: - yield trace - finally: - trace.end() - - -def open_llm_span( - trace, - name: str, - input_data: dict, - model: str, - metadata: Optional[dict] = None, - parent_span_id: Optional[str] = None, -): - """ - Begin a new LLM-type span within an active trace. - - Parameters - ---------- - trace : object - The parent trace or span. - name : str - Logical name for the operation. - input_data : dict - Model inputs. - model : str - Model identifier. - metadata : dict, optional - Additional context keys. 
- parent_span_id : str, optional - Explicit parent linkage. - - Returns - ------- - object or None - The active span object. - """ - if trace is None: - return None - return trace.span( - name=name, - type="llm", - input=input_data, - model=model, - metadata=metadata or {}, - parent_span_id=parent_span_id, - ) - - -def close_span( - span, - output: Optional[dict] = None, - usage: Optional[dict] = None, - error: Optional[str] = None, -) -> None: - """ - Log results and terminate an active span. - - Parameters - ---------- - span : object - The span to close. - output : dict, optional - The result of the operation. - usage : dict, optional - Token usage statistics. - error : str, optional - Error message if the span failed. - - Returns - ------- - None - """ - if span is None: - return - kwargs: dict = {} - if output is not None: - kwargs["output"] = output - if usage: - kwargs["usage"] = usage - if error: - kwargs["error_info"] = ErrorInfoDict(message=error) - span.end(**kwargs) - - -def log_trace_scores(trace, scores: dict) -> None: - """ - Attach quantitative feedback scores to a trace. - - Parameters - ---------- - trace : object - The trace to update. - scores : dict - Mapping of metric names to numeric values. 
- - Returns - ------- - None - """ - if trace is None: - return - for name, value in scores.items(): - if isinstance(value, (int, float)): - with contextlib.suppress(Exception): - trace.log_feedback_score(name=name, value=float(value)) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py index e3eca3b..bad9e28 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py @@ -25,6 +25,13 @@ from ..agents.vision_agent import VisionAgent from ..datasets.chartqapro_loader import load_chartqapro from ..datasets.perceived_sample import PerceivedSample +from ..langfuse_integration.client import get_client +from ..langfuse_integration.dataset import register_dataset +from ..langfuse_integration.prompts import push_prompts +from ..langfuse_integration.tracing import ( + log_trace_scores, + sample_trace, +) from ..mep.schema import ( MEP, ImageRef, @@ -37,13 +44,6 @@ MEPVision, ) from ..mep.writer import write_mep -from ..langfuse_integration.client import get_client -from ..langfuse_integration.dataset import register_dataset -from ..langfuse_integration.prompts import push_prompts -from ..langfuse_integration.tracing import ( - log_trace_scores, - sample_trace, -) from ..tools.ocr_reader_tool import OcrReaderTool from ..utils.hashing import sha256_file from ..utils.json_strict import parse_strict diff --git a/implementations/mechanistic_interpretability/README.md b/implementations/mechanistic_interpretability/README.md index 4a5b18d..9b04b2b 100644 --- a/implementations/mechanistic_interpretability/README.md +++ b/implementations/mechanistic_interpretability/README.md @@ -113,4 +113,3 @@ Pointers for the main tools and ideas used here: - **Activation patching in VLMs**: Neo et al., 2024 — 
`https://arxiv.org/abs/2401.15947` - **Logit lens for VLMs (MMNeuron)**: `https://arxiv.org/abs/2406.11193` - **VLM interpretability survey (ICLR blog, 2025)**: `https://d2jud02ci9yv69.cloudfront.net/2025-04-28-vlm-understanding-29/blog/vlm-understanding/` - diff --git a/pyproject.toml b/pyproject.toml index 6e9500a..265a883 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -181,4 +181,4 @@ requires = ["setuptools>=68", "wheel"] build-backend = "setuptools.build_meta" [tool.setuptools.packages.find] -where = ["implementations/agentic_vqa_eval/src"] \ No newline at end of file +where = ["implementations/agentic_vqa_eval/src"] From 1df04a804532acaf838a6ce425fe1f0cfc613c12 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Wed, 18 Mar 2026 19:56:41 -0400 Subject: [PATCH 3/9] Update README and notebooks from opik to langfuse. --- implementations/agentic_vqa_eval/README.md | 34 ++-- .../agentic_vqa_eval/analysis.ipynb | 8 +- .../agentic_vqa_eval/run_pipeline.ipynb | 138 +++++--------- uv.lock | 175 ------------------ 4 files changed, 69 insertions(+), 286 deletions(-) diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md index d28a821..353fe8f 100644 --- a/implementations/agentic_vqa_eval/README.md +++ b/implementations/agentic_vqa_eval/README.md @@ -242,10 +242,12 @@ The `.env` file lives at the **repo root**. `load_dotenv()` searches upward from ### 3. Generate MEPs (run the agentic pipeline) +> **Note:** All `uv run` commands below use `$(git rev-parse --show-toplevel)` so they work from any directory in the repo — it resolves the repo root for `--env-file`, while `--directory` ensures outputs (`meps/`, `output/`) are written inside `implementations/agentic_vqa_eval/`. 
+ Run on 25 test samples using GPT-4o for planner, vision, and verifier: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test \ --n 25 \ --config gemini_gemini \ @@ -257,7 +259,7 @@ MEPs are written to `meps/gemini_gemini/chartqapro/test/.json`. The **VerifierAgent (Pass 2.5)** runs automatically after the VisionAgent on every sample. To skip it (faster, lower cost): ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini --no_verifier ``` @@ -265,7 +267,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ **Model overrides** (e.g. 
to test different models without changing config): ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini \ --planner_model gemini-2.5-flash-lite \ --vision_model gemini-2.5-flash-lite \ @@ -280,14 +282,14 @@ OCR is **enabled by default** and uses the same vision backend and model as the To run with OCR using a cheaper model (recommended — OCR is simpler than full VQA): ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini \ --ocr_model gemini-2.5-flash-lite ``` To disable OCR entirely (matches the original pipeline behaviour, faster and lower cost): ```bash -uv run --env-file .env -magentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini --no_ocr ``` @@ -298,7 +300,7 @@ When OCR is skipped, `"ocr": null` appears in the MEP and `"ocr_ms": 0.0` in tim ### 4. 
Evaluate outputs (Pass 1 — accuracy + judge) ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \ --mep_dir meps/gemini_gemini/chartqapro/test \ --out output/metrics.jsonl \ --no_judge # omit this flag to enable LLM judge (costs API calls) @@ -315,7 +317,7 @@ The `predicted` column always reflects the **final answer** — the verifier's o ### 5. Evaluate traces (Pass 2 — latency and replayability) ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_traces \ --mep_dir meps/gemini_gemini/chartqapro/test \ --out output/trace_metrics.jsonl ``` @@ -325,7 +327,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \ Re-queries the VLM for each MEP asking for the 3 most likely candidate answers: ```bash -uv run --env-file .env -magentic_chartqapro_eval.eval.eval_topk \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_topk \ --mep_dir meps/gemini_gemini/chartqapro/test \ --out output/topk_metrics.jsonl \ --backend gemini \ @@ -338,7 +340,7 @@ This pass does **not** modify existing MEPs or `metrics.jsonl`. ### 7. 
Summarize results ```bash -uv run --env-file .env -magentic_chartqapro_eval.eval.summarize \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.summarize \ --metrics output/metrics.jsonl \ --out output/summary.csv ``` @@ -348,7 +350,7 @@ uv run --env-file .env -magentic_chartqapro_eval.eval.summarize \ This pass asks **why** the agent was wrong, not just **that** it was wrong. A VLM is given the original chart image alongside the wrong answer, the correct answer, the agent's explanation, and the inspection plan — so it can make a *visual* diagnosis of the failure mode. ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.error_taxonomy \ --mep_dir meps/gemini_gemini/chartqapro/test \ --metrics_file output/metrics.jsonl \ --out output/taxonomy.jsonl @@ -408,7 +410,7 @@ for sid in revised: Generates a single portable HTML file with summary cards, accuracy tables, verifier stats, failure taxonomy breakdown, and a per-sample results table: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.report \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.report \ --metrics output/metrics.jsonl \ --taxonomy output/taxonomy.jsonl \ --out output/report.html @@ -515,13 +517,13 @@ The framework auto-detects these variables. If they are absent, all Langfuse cal Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used. 
```bash -uv run --env-file .env -m -m agentic_chartqapro_eval.langfuse_integration.prompts +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.prompts ``` ### 4. Register the dataset ```bash -uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.dataset \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.dataset \ --split test --n 25 ``` @@ -536,7 +538,7 @@ No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are - stores the `lf_trace_id` in the MEP for later score attachment ```bash -uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ --split test --n 25 --config gemini_gemini --workers 4 --out meps/ ``` @@ -545,7 +547,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \ After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Langfuse traces: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \ +uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \ --mep_dir meps/gemini_gemini/chartqapro/test \ --out metrics.jsonl ``` @@ -555,7 +557,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \ If you have MEPs from runs before Langfuse was configured, import them without re-running the pipeline: ```bash -uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.ingest \ +uv run --env-file "$(git 
rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.ingest \ --mep_dir meps/gemini_gemini/chartqapro/test \ --metrics_file metrics.jsonl # optional: attaches scores if available ``` diff --git a/implementations/agentic_vqa_eval/analysis.ipynb b/implementations/agentic_vqa_eval/analysis.ipynb index d9537aa..4e47127 100644 --- a/implementations/agentic_vqa_eval/analysis.ipynb +++ b/implementations/agentic_vqa_eval/analysis.ipynb @@ -4,7 +4,7 @@ "cell_type": "markdown", "id": "7fb27b941602401d91542211134fc71a", "metadata": {}, - "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first:\n```bash\nuv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```" + "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. 
By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first (from any directory in the repo):\n```bash\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.eval_outputs --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```" }, { "cell_type": "markdown", @@ -272,7 +272,11 @@ "\n", " wrong = tax_df[tax_df[\"failure_type\"] != \"correct\"]\n", " print(f\"\\nTotal wrong: {len(wrong)} / {len(tax_df)}\")\n", - " print(f\"Most common failure: {counts[counts.index != 'correct'].idxmax()}\")" + " failure_counts = counts[counts.index != \"correct\"]\n", + " if failure_counts.empty:\n", + " print(\"Most common failure: none (all samples correct)\")\n", + " else:\n", + " print(f\"Most common failure: {failure_counts.idxmax()}\")" ] }, { diff --git a/implementations/agentic_vqa_eval/run_pipeline.ipynb b/implementations/agentic_vqa_eval/run_pipeline.ipynb index 294edfe..d40163e 100644 --- a/implementations/agentic_vqa_eval/run_pipeline.ipynb +++ 
b/implementations/agentic_vqa_eval/run_pipeline.ipynb @@ -16,7 +16,7 @@ "|---|---|\n", "| 1 — Configuration | All tunable parameters in one place |\n", "| 2 — Environment | Check API keys, install path, imports |\n", - "| 2.5 — Opik health check | Verify Opik stack is reachable and API-responsive before running |\n", + "| 2.5 — Langfuse health check | Verify Langfuse credentials are configured before running |\n", "| 3 — Load dataset | Pull samples from HuggingFace |\n", "| 4 — Instantiate agents | Build Planner, OCR, Vision, Verifier |\n", "| 5 — Run pipeline | Generate MEPs (Plan → OCR → Vision → Verify) |\n", @@ -119,7 +119,7 @@ " val = os.environ.get(var, \"\")\n", " needed = needed_for in CONFIG\n", " if val and not val.startswith(\"your_\"):\n", - " print(f\" ok {var} ({val[:12]}...)\")\n", + " print(f\" ok {var} ({val[:3]}...)\")\n", " elif needed:\n", " print(f\" MISSING {var} <- required for {CONFIG}\")\n", " missing.append(var)\n", @@ -141,8 +141,8 @@ "from agentic_chartqapro_eval.eval.eval_outputs import evaluate_mep # noqa: E402\n", "from agentic_chartqapro_eval.eval.eval_traces import evaluate_trace # noqa: E402\n", "from agentic_chartqapro_eval.eval.summarize import summarize, write_csv # noqa: E402\n", + "from agentic_chartqapro_eval.langfuse_integration.client import get_client # noqa: E402\n", "from agentic_chartqapro_eval.mep.writer import iter_meps # noqa: E402\n", - "from agentic_chartqapro_eval.opik_integration.client import get_client # noqa: E402\n", "from agentic_chartqapro_eval.runner.run_generate_meps import ( # noqa: E402\n", " BACKEND_CONFIGS,\n", " process_sample,\n", @@ -159,19 +159,17 @@ "id": "cell-opik-hdr", "metadata": {}, "source": [ - "## 2.5 — Opik Health Check\n", + "## 2.5 — Langfuse Health Check\n", "\n", - "Verifies that the self-hosted Opik stack is **fully operational** before the pipeline runs.\n", - "Three checks are run in sequence:\n", + "Verifies that Langfuse credentials are configured before the pipeline runs.\n", 
"\n", "| Check | What it tests |\n", "|---|---|\n", - "| HTTP reachable | TCP connection to `OPIK_URL_OVERRIDE` succeeds within 5 s |\n", - "| Client init | `opik.Opik()` initialises without error |\n", - "| API read test | A lightweight `search_traces` call returns a valid response |\n", + "| Env vars present | `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set in `.env` |\n", + "| Client init | `Langfuse()` initialises without error |\n", "\n", - "If `OPIK_URL_OVERRIDE` is not set the cell prints a skip notice and continues — Opik is optional.\n", - "If any check fails the pipeline still runs; only tracing is affected." + "If the keys are absent the cell prints a skip notice and continues — Langfuse is optional.\n", + "The pipeline produces identical MEPs with or without it; tracing is purely additive." ] }, { @@ -181,107 +179,61 @@ "metadata": {}, "outputs": [], "source": [ - "import urllib.error\n", - "import urllib.request\n", - "\n", - "# Force re-initialisation so re-running this cell after starting Docker works correctly\n", - "from agentic_chartqapro_eval.opik_integration.client import reset_client\n", + "from agentic_chartqapro_eval.langfuse_integration.client import reset_client\n", "\n", "\n", + "# Force re-initialisation so re-running this cell picks up any .env changes\n", "reset_client()\n", "\n", - "OPIK_URL = os.environ.get(\"OPIK_URL_OVERRIDE\", \"\")\n", + "lf_public = os.environ.get(\"LANGFUSE_PUBLIC_KEY\", \"\")\n", + "lf_secret = os.environ.get(\"LANGFUSE_SECRET_KEY\", \"\")\n", "\n", - "if not OPIK_URL:\n", - " print(\"[skip] OPIK_URL_OVERRIDE is not set.\")\n", - " print(\" Opik tracing is disabled. Pipeline will run fine without it.\")\n", + "if not lf_public or not lf_secret:\n", + " print(\"[skip] LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set.\")\n", + " print(\" Langfuse tracing is disabled. Pipeline will run fine without it.\")\n", " print()\n", - " print(\"To enable Opik tracing:\")\n", - " print(\" 1. 
Start the Docker stack:\")\n", - " print(\" cd /path/to/opik/deployment/docker-compose\")\n", - " print(\" docker compose --profile opik up -d\")\n", - " print(\" 2. Add to .env: OPIK_URL_OVERRIDE=http://localhost:5173/api\")\n", - " print(\" 3. Re-run this cell.\")\n", + " print(\"To enable Langfuse tracing, add to .env:\")\n", + " print(\" LANGFUSE_PUBLIC_KEY=pk-lf-...\")\n", + " print(\" LANGFUSE_SECRET_KEY=sk-lf-...\")\n", + " print(\" # LANGFUSE_HOST=https://cloud.langfuse.com (default; change for self-hosted)\")\n", "else:\n", " results = {}\n", "\n", - " # -- Check 1: HTTP reachability (any response = server is up) --\n", - " try:\n", - " with urllib.request.urlopen(OPIK_URL, timeout=5) as r:\n", - " results[\"http\"] = (\"ok\", f\"HTTP {r.status}\")\n", - " except urllib.error.HTTPError as e:\n", - " # HTTPError means server responded -- it is up, just returned a non-200\n", - " results[\"http\"] = (\"ok\", f\"HTTP {e.code} (server responded)\")\n", - " except Exception as e:\n", - " results[\"http\"] = (\"fail\", str(e))\n", + " # -- Check 1: Env vars present --\n", + " results[\"env\"] = (\"ok\", f\"pk={lf_public[:3]}...\")\n", "\n", - " # -- Check 2: Opik Python client initialises --\n", - " _opik_hc = None\n", + " # -- Check 2: Client initialises --\n", " try:\n", - " from agentic_chartqapro_eval.opik_integration.client import get_client\n", - "\n", - " _opik_hc = get_client()\n", - " if _opik_hc is not None:\n", - " results[\"client\"] = (\"ok\", \"opik.Opik() ready\")\n", + " _lf_hc = get_client()\n", + " if _lf_hc is not None:\n", + " results[\"client\"] = (\"ok\", \"Langfuse() ready\")\n", " else:\n", " results[\"client\"] = (\"fail\", \"get_client() returned None\")\n", " except Exception as e:\n", " results[\"client\"] = (\"fail\", str(e))\n", "\n", - " # -- Check 3: API actually responds to a lightweight read --\n", - " if results.get(\"client\", (\"\",))[0] == \"ok\" and _opik_hc is not None:\n", - " try:\n", - " traces = 
_opik_hc.search_traces(max_results=1)\n", - " results[\"api\"] = (\"ok\", f\"search_traces returned {len(traces)} result(s)\")\n", - " except Exception as e:\n", - " err_str = str(e)\n", - " hint = \"\"\n", - " if \"readonly\" in err_str.lower() or \"500\" in err_str:\n", - " hint = \" [ClickHouse replica may be read-only -- run SYSTEM RESTORE REPLICA]\"\n", - " results[\"api\"] = (\"fail\", err_str[:120] + hint)\n", - " else:\n", - " results[\"api\"] = (\"skip\", \"client unavailable\")\n", - "\n", " # -- Report --\n", - " print(f\"Opik URL : {OPIK_URL}\")\n", - " print()\n", " labels = [\n", - " (\"http\", \"HTTP reachable \"),\n", - " (\"client\", \"Client init \"),\n", - " (\"api\", \"API read test \"),\n", + " (\"env\", \"Env vars present\"),\n", + " (\"client\", \"Client init \"),\n", " ]\n", " all_ok = True\n", " for key, label in labels:\n", " status, detail = results.get(key, (\"skip\", \"\"))\n", - " if status == \"ok\":\n", - " marker = \"✓ OK \"\n", - " elif status == \"skip\":\n", - " marker = \"⊘ skip\"\n", - " else:\n", - " marker = \"✗ FAIL\"\n", + " marker = \"✓ OK \" if status == \"ok\" else (\"⊘ skip\" if status == \"skip\" else \"✗ FAIL\")\n", + " if status not in (\"ok\", \"skip\"):\n", " all_ok = False\n", " print(f\" {marker} {label} {detail}\")\n", "\n", " print()\n", " if all_ok:\n", - " dashboard_url = OPIK_URL.rstrip(\"/\").removesuffix(\"/api\")\n", - " print(\"✓ Opik is fully operational.\")\n", - " print(f\"Dashboard : {dashboard_url}\")\n", + " lf_host = os.environ.get(\"LANGFUSE_HOST\") or os.environ.get(\"LANGFUSE_BASE_URL\") or \"https://cloud.langfuse.com\"\n", + " print(\"✓ Langfuse is configured.\")\n", + " print(f\"Host : {lf_host}\")\n", " print(\"Traces and scores will be recorded automatically during the pipeline run.\")\n", " else:\n", - " print(\"⚠ WARNING: One or more Opik checks failed.\")\n", - " print(\"The pipeline will still run; Opik tracing may not work correctly.\")\n", - " if results.get(\"http\", (\"\",))[0] == 
\"fail\":\n", - " print()\n", - " print(\" Docker stack appears to be down. To start it:\")\n", - " print(\" cd /path/to/opik/deployment/docker-compose\")\n", - " print(\" docker compose --profile opik up -d\")\n", - " if results.get(\"api\", (\"\",))[0] == \"fail\":\n", - " print()\n", - " print(\" API is reachable but not responding correctly.\")\n", - " print(\" Check ClickHouse replica state:\")\n", - " print(\" docker exec opik-clickhouse-1 clickhouse-client --query \\\\\")\n", - " print(\" \\\"SELECT database,table,is_readonly FROM system.replicas WHERE database='opik'\\\"\")" + " print(\"⚠ WARNING: Langfuse client failed to initialise.\")\n", + " print(\"The pipeline will still run; tracing will be skipped.\")" ] }, { @@ -376,10 +328,10 @@ "else:\n", " print(\"OcrReaderTool : disabled (USE_OCR=False)\")\n", "\n", - "# Opik observability (no-op if OPIK_URL_OVERRIDE not set)\n", - "opik_client = get_client()\n", - "opik_status = \"enabled\" if opik_client else \"not configured\"\n", - "print(f\"Opik : {opik_status}\")" + "# Langfuse observability (no-op if keys not set)\n", + "lf_client = get_client()\n", + "lf_status = \"enabled\" if lf_client else \"not configured\"\n", + "print(f\"Langfuse : {lf_status}\")" ] }, { @@ -421,7 +373,7 @@ " config,\n", " RUN_ID,\n", " OUT_DIR,\n", - " opik_client=opik_client,\n", + " lf_client=lf_client,\n", " verifier_agent=verifier,\n", " ocr_tool=ocr,\n", " )\n", @@ -459,8 +411,8 @@ "## 6 — Inspect First MEP\n", "\n", "MEPs are self-contained JSON files. Every field you see here is what the agent actually\n", - "produced — no post-processing. The `opik_trace_id` links this MEP back to the live trace\n", - "in the Opik dashboard if Opik is configured." + "produced — no post-processing. The `lf_trace_id` links this MEP back to the live trace\n", + "in the Langfuse dashboard if Langfuse is configured." 
] }, { @@ -501,8 +453,8 @@ " print(\"Timestamps (ms):\")\n", " for k in [\"planner_ms\", \"ocr_ms\", \"vision_ms\", \"verifier_ms\"]:\n", " print(f\" {k:<16} {ts.get(k, 0):.0f}\")\n", - " if mep.get(\"opik_trace_id\"):\n", - " print(f\"Opik trace ID: {mep['opik_trace_id']}\")\n", + " if mep.get(\"lf_trace_id\"):\n", + " print(f\"Langfuse trace ID: {mep['lf_trace_id']}\")\n", " print(\"=\" * 64)\n", "\n", " img_path = s.get(\"image_ref\", {}).get(\"path\", \"\")\n", @@ -609,7 +561,7 @@ " config,\n", " RUN_ID_NO_OCR,\n", " OUT_DIR_NO_OCR,\n", - " opik_client=opik_client,\n", + " lf_client=lf_client,\n", " verifier_agent=verifier,\n", " ocr_tool=None, # <-- OCR disabled\n", " )\n", diff --git a/uv.lock b/uv.lock index 9bcb45a..db5369c 100644 --- a/uv.lock +++ b/uv.lock @@ -561,24 +561,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/18/a1/128e3676fb9b4fd965a93554e5e07045975ee6bd6e9fdb536cdffa32e99e/boto3-1.42.70-py3-none-any.whl", hash = "sha256:18a108c4d5df89a200b3949de0d39c0879b100c455e3229ea38275dd392db0f4", size = 140554, upload-time = "2026-03-17T19:43:20.406Z" }, ] -[[package]] -name = "boto3-stubs" -version = "1.42.69" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "botocore-stubs" }, - { name = "types-s3transfer" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/1e/ba/b282b7ab3626a25a6896c2f31adc95324b3e5f50056923d274a35c5eaf0c/boto3_stubs-1.42.69.tar.gz", hash = "sha256:52ccd645a34d2b4e97af8f44dbaffbb854a1de52610e9502c284bfb24e6d8962", size = 101397, upload-time = "2026-03-16T20:58:58.538Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/6a/78/83ef6f549d88425618ce66d4b273ea46e379aefdf0e9e49bf4f9bfa01cda/boto3_stubs-1.42.69-py3-none-any.whl", hash = "sha256:021360b519ac54822eb00f125b0c4292ad2a1869ae8e1d0c6c097db99215d41b", size = 70010, upload-time = "2026-03-16T20:58:51.184Z" }, -] - -[package.optional-dependencies] -bedrock-runtime = [ - { name = "mypy-boto3-bedrock-runtime" }, 
-] - [[package]] name = "botocore" version = "1.42.70" @@ -593,18 +575,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fb/51/08f32aea872253173f513ba68122f4300966290677c8e59887b4ffd5d957/botocore-1.42.70-py3-none-any.whl", hash = "sha256:54ed9d25f05f810efd22b0dfda0bb9178df3ad8952b2e4359e05156c9321bd3c", size = 14671393, upload-time = "2026-03-17T19:43:06.777Z" }, ] -[[package]] -name = "botocore-stubs" -version = "1.42.41" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "types-awscrt" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/0c/a8/a26608ff39e3a5866c6c79eda10133490205cbddd45074190becece3ff2a/botocore_stubs-1.42.41.tar.gz", hash = "sha256:dbeac2f744df6b814ce83ec3f3777b299a015cbea57a2efc41c33b8c38265825", size = 42411, upload-time = "2026-02-03T20:46:14.479Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/32/76/cab7af7f16c0b09347f2ebe7ffda7101132f786acb767666dce43055faab/botocore_stubs-1.42.41-py3-none-any.whl", hash = "sha256:9423110fb0e391834bd2ed44ae5f879d8cb370a444703d966d30842ce2bcb5f0", size = 66759, upload-time = "2026-02-03T20:46:13.02Z" }, -] - [[package]] name = "build" version = "1.4.0" @@ -1294,25 +1264,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/90/2b/0817a2b257fe88725c25589d89aec060581aabf668707a8d03b2e9e0cb2a/fastjsonschema-2.21.1-py3-none-any.whl", hash = "sha256:c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667", size = 23924, upload-time = "2024-12-02T10:55:07.599Z" }, ] -[[package]] -name = "fastuuid" -version = "0.14.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c3/7d/d9daedf0f2ebcacd20d599928f8913e9d2aea1d56d2d355a93bfa2b611d7/fastuuid-0.14.0.tar.gz", hash = "sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26", size = 18232, upload-time = "2025-10-19T22:19:22.402Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/02/a2/e78fcc5df65467f0d207661b7ef86c5b7ac62eea337c0c0fcedbeee6fb13/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:77e94728324b63660ebf8adb27055e92d2e4611645bf12ed9d88d30486471d0a", size = 510164, upload-time = "2025-10-19T22:31:45.635Z" }, - { url = "https://files.pythonhosted.org/packages/2b/b3/c846f933f22f581f558ee63f81f29fa924acd971ce903dab1a9b6701816e/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:caa1f14d2102cb8d353096bc6ef6c13b2c81f347e6ab9d6fbd48b9dea41c153d", size = 261837, upload-time = "2025-10-19T22:38:38.53Z" }, - { url = "https://files.pythonhosted.org/packages/54/ea/682551030f8c4fa9a769d9825570ad28c0c71e30cf34020b85c1f7ee7382/fastuuid-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d23ef06f9e67163be38cece704170486715b177f6baae338110983f99a72c070", size = 251370, upload-time = "2025-10-19T22:40:26.07Z" }, - { url = "https://files.pythonhosted.org/packages/14/dd/5927f0a523d8e6a76b70968e6004966ee7df30322f5fc9b6cdfb0276646a/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c9ec605ace243b6dbe3bd27ebdd5d33b00d8d1d3f580b39fdd15cd96fd71796", size = 277766, upload-time = "2025-10-19T22:37:23.779Z" }, - { url = "https://files.pythonhosted.org/packages/16/6e/c0fb547eef61293153348f12e0f75a06abb322664b34a1573a7760501336/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:808527f2407f58a76c916d6aa15d58692a4a019fdf8d4c32ac7ff303b7d7af09", size = 278105, upload-time = "2025-10-19T22:26:56.821Z" }, - { url = "https://files.pythonhosted.org/packages/2d/b1/b9c75e03b768f61cf2e84ee193dc18601aeaf89a4684b20f2f0e9f52b62c/fastuuid-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2fb3c0d7fef6674bbeacdd6dbd386924a7b60b26de849266d1ff6602937675c8", size = 301564, upload-time = "2025-10-19T22:30:31.604Z" }, - { url = 
"https://files.pythonhosted.org/packages/fc/fa/f7395fdac07c7a54f18f801744573707321ca0cee082e638e36452355a9d/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab3f5d36e4393e628a4df337c2c039069344db5f4b9d2a3c9cea48284f1dd741", size = 459659, upload-time = "2025-10-19T22:31:32.341Z" }, - { url = "https://files.pythonhosted.org/packages/66/49/c9fd06a4a0b1f0f048aacb6599e7d96e5d6bc6fa680ed0d46bf111929d1b/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b9a0ca4f03b7e0b01425281ffd44e99d360e15c895f1907ca105854ed85e2057", size = 478430, upload-time = "2025-10-19T22:26:22.962Z" }, - { url = "https://files.pythonhosted.org/packages/be/9c/909e8c95b494e8e140e8be6165d5fc3f61fdc46198c1554df7b3e1764471/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3acdf655684cc09e60fb7e4cf524e8f42ea760031945aa8086c7eae2eeeabeb8", size = 450894, upload-time = "2025-10-19T22:27:01.647Z" }, - { url = "https://files.pythonhosted.org/packages/90/eb/d29d17521976e673c55ef7f210d4cdd72091a9ec6755d0fd4710d9b3c871/fastuuid-0.14.0-cp312-cp312-win32.whl", hash = "sha256:9579618be6280700ae36ac42c3efd157049fe4dd40ca49b021280481c78c3176", size = 154374, upload-time = "2025-10-19T22:29:19.879Z" }, - { url = "https://files.pythonhosted.org/packages/cc/fc/f5c799a6ea6d877faec0472d0b27c079b47c86b1cdc577720a5386483b36/fastuuid-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:d9e4332dc4ba054434a9594cbfaf7823b57993d7d8e7267831c3e059857cf397", size = 156550, upload-time = "2025-10-19T22:27:49.658Z" }, -] - [[package]] name = "filelock" version = "3.20.3" @@ -2084,7 +2035,6 @@ dependencies = [ agentic-xai-eval = [ { name = "crewai" }, { name = "google-genai" }, - { name = "opik" }, { name = "streamlit" }, ] dev = [ @@ -2163,7 +2113,6 @@ requires-dist = [ agentic-xai-eval = [ { name = "crewai", specifier = ">=1.6.1" }, { name = "google-genai", specifier = ">=1.67.0" }, - { name = "opik", specifier = ">=1.10.40" }, { name = "streamlit", specifier = ">=1.55.0" }, 
] dev = [ @@ -2857,30 +2806,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/53/84/8a89614b2e7eeeaf0a68a4046d6cfaea4544c8619ea02595ebeec9b2bae3/license_expression-30.4.1-py3-none-any.whl", hash = "sha256:679646bc3261a17690494a3e1cada446e5ee342dbd87dcfa4a0c24cc5dce13ee", size = 111457, upload-time = "2025-01-14T05:11:38.658Z" }, ] -[[package]] -name = "litellm" -version = "1.82.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "click" }, - { name = "fastuuid" }, - { name = "httpx" }, - { name = "importlib-metadata" }, - { name = "jinja2" }, - { name = "jsonschema" }, - { name = "openai" }, - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "tiktoken" }, - { name = "tokenizers", version = "0.21.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-28-interpretability-llms-agents-xai-refresher'" }, - { name = "tokenizers", version = "0.22.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-28-interpretability-llms-agents-mechanistic-interp' or extra == 'group-28-interpretability-llms-agents-preference-alignment' or extra != 'group-28-interpretability-llms-agents-xai-refresher'" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/60/12/010a86643f12ac0b004032d5927c260094299a84ed38b5ed20a8f8c7e3c4/litellm-1.82.2.tar.gz", hash = "sha256:f5f4c4049f344a88bf80b2e421bb927807687c99624515d7ff4152d533ec9dcb", size = 17353218, upload-time = "2026-03-13T21:24:24.5Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/96/e4/87e3ca82a8bf6e6bfffb42a539a1350dd6ced1b7169397bd439ba56fde10/litellm-1.82.2-py3-none-any.whl", hash = "sha256:641ed024774fa3d5b4dd9347f0efb1e31fa422fba2a6500aabedee085d1194cb", size = 15524224, upload-time = "2026-03-13T21:24:21.288Z" }, -] - [[package]] name = "llvmlite" version = "0.46.0" @@ -3182,15 +3107,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" }, ] -[[package]] -name = "mypy-boto3-bedrock-runtime" -version = "1.42.42" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/46/bb/65dc1b2c5796a6ab5f60bdb57343bd6c3ecb82251c580eca415c8548333e/mypy_boto3_bedrock_runtime-1.42.42.tar.gz", hash = "sha256:3a4088218478b6fbbc26055c03c95bee4fc04624a801090b3cce3037e8275c8d", size = 29840, upload-time = "2026-02-04T20:53:05.999Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/00/43/7ea062f2228f47b5779dcfa14dab48d6e29f979b35d1a5102b0ba80b9c1b/mypy_boto3_bedrock_runtime-1.42.42-py3-none-any.whl", hash = "sha256:b2d16eae22607d0685f90796b3a0afc78c0b09d45872e00eafd634a31dd9358f", size = 36077, upload-time = "2026-02-04T20:53:01.768Z" }, -] - [[package]] name = "namex" version = "0.1.0" @@ -3871,32 +3787,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b2/37/cc6a55e448deaa9b27377d087da8615a3416d8ad523d5960b78dbeadd02a/opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2", size = 231621, upload-time = "2026-03-04T14:17:19.33Z" }, ] -[[package]] -name = "opik" -version = "1.10.40" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "boto3-stubs", extra = ["bedrock-runtime"] }, - { name = "click" }, - { name = "httpx" }, - { name = "jinja2" }, - { name = "litellm" }, - { name = "openai" }, - { name = "pydantic" }, - { name = "pydantic-settings" }, - { name = "pytest" }, - { name = "rapidfuzz" }, - { name = "rich" }, - { name = "sentry-sdk" }, - { name = "tenacity" }, - { name = "tqdm" }, - { name = "uuid6" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/ae/a4/3b7c5c705b57776a3808c58a985aa7864e77c5eea7ef16780d6792fc3e1c/opik-1.10.40.tar.gz", hash = "sha256:aee1cd8ffdb2d3f7a0a15276626c4d5e7a904722fd9fad8ec5a9fa679310f7e9", size = 777468, upload-time = "2026-03-16T13:37:18.978Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c5/64/6a9f7231115055f60052a7b8e4f3d4182dd9c097f916ac1682fe093ddff4/opik-1.10.40-py3-none-any.whl", hash = "sha256:e446551b7603ce9b34b09eb8179e74bbae005e3ce589bc8bf613f6711771b24b", size = 1315663, upload-time = "2026-03-16T13:37:17.339Z" }, -] - [[package]] name = "opt-einsum" version = "3.4.0" @@ -4835,25 +4725,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/43/80f67e0336cb2fc725f8e06f7fe35c1d0fe946f4d2b8b2175e797e07349e/qwen_vl_utils-0.0.14-py3-none-any.whl", hash = "sha256:5e28657bfd031e56bd447c5901b58ddfc3835285ed100f4c56580e0ade054e96", size = 8120, upload-time = "2025-09-23T09:38:56.297Z" }, ] -[[package]] -name = "rapidfuzz" -version = "3.14.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/d3/28/9d808fe62375b9aab5ba92fa9b29371297b067c2790b2d7cda648b1e2f8d/rapidfuzz-3.14.3.tar.gz", hash = "sha256:2491937177868bc4b1e469087601d53f925e8d270ccc21e07404b4b5814b7b5f", size = 57863900, upload-time = "2025-11-01T11:54:52.321Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/8e/3c215e860b458cfbedb3ed73bc72e98eb7e0ed72f6b48099604a7a3260c2/rapidfuzz-3.14.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:685c93ea961d135893b5984a5a9851637d23767feabe414ec974f43babbd8226", size = 1945306, upload-time = "2025-11-01T11:53:06.452Z" }, - { url = "https://files.pythonhosted.org/packages/36/d9/31b33512015c899f4a6e6af64df8dfe8acddf4c8b40a4b3e0e6e1bcd00e5/rapidfuzz-3.14.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fa7c8f26f009f8c673fbfb443792f0cf8cf50c4e18121ff1e285b5e08a94fbdb", size = 1390788, upload-time = "2025-11-01T11:53:08.721Z" }, - 
{ url = "https://files.pythonhosted.org/packages/a9/67/2ee6f8de6e2081ccd560a571d9c9063184fe467f484a17fa90311a7f4a2e/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57f878330c8d361b2ce76cebb8e3e1dc827293b6abf404e67d53260d27b5d941", size = 1374580, upload-time = "2025-11-01T11:53:10.164Z" }, - { url = "https://files.pythonhosted.org/packages/30/83/80d22997acd928eda7deadc19ccd15883904622396d6571e935993e0453a/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c5f545f454871e6af05753a0172849c82feaf0f521c5ca62ba09e1b382d6382", size = 3154947, upload-time = "2025-11-01T11:53:12.093Z" }, - { url = "https://files.pythonhosted.org/packages/5b/cf/9f49831085a16384695f9fb096b99662f589e30b89b4a589a1ebc1a19d34/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:07aa0b5d8863e3151e05026a28e0d924accf0a7a3b605da978f0359bb804df43", size = 1223872, upload-time = "2025-11-01T11:53:13.664Z" }, - { url = "https://files.pythonhosted.org/packages/c8/0f/41ee8034e744b871c2e071ef0d360686f5ccfe5659f4fd96c3ec406b3c8b/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73b07566bc7e010e7b5bd490fb04bb312e820970180df6b5655e9e6224c137db", size = 2392512, upload-time = "2025-11-01T11:53:15.109Z" }, - { url = "https://files.pythonhosted.org/packages/da/86/280038b6b0c2ccec54fb957c732ad6b41cc1fd03b288d76545b9cf98343f/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6de00eb84c71476af7d3110cf25d8fe7c792d7f5fa86764ef0b4ca97e78ca3ed", size = 2521398, upload-time = "2025-11-01T11:53:17.146Z" }, - { url = "https://files.pythonhosted.org/packages/fa/7b/05c26f939607dca0006505e3216248ae2de631e39ef94dd63dbbf0860021/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d7843a1abf0091773a530636fdd2a49a41bcae22f9910b86b4f903e76ddc82dc", size = 4259416, upload-time = "2025-11-01T11:53:19.34Z" }, - { url = 
"https://files.pythonhosted.org/packages/40/eb/9e3af4103d91788f81111af1b54a28de347cdbed8eaa6c91d5e98a889aab/rapidfuzz-3.14.3-cp312-cp312-win32.whl", hash = "sha256:dea97ac3ca18cd3ba8f3d04b5c1fe4aa60e58e8d9b7793d3bd595fdb04128d7a", size = 1709527, upload-time = "2025-11-01T11:53:20.949Z" }, - { url = "https://files.pythonhosted.org/packages/b8/63/d06ecce90e2cf1747e29aeab9f823d21e5877a4c51b79720b2d3be7848f8/rapidfuzz-3.14.3-cp312-cp312-win_amd64.whl", hash = "sha256:b5100fd6bcee4d27f28f4e0a1c6b5127bc8ba7c2a9959cad9eab0bf4a7ab3329", size = 1538989, upload-time = "2025-11-01T11:53:22.428Z" }, - { url = "https://files.pythonhosted.org/packages/fc/6d/beee32dcda64af8128aab3ace2ccb33d797ed58c434c6419eea015fec779/rapidfuzz-3.14.3-cp312-cp312-win_arm64.whl", hash = "sha256:4e49c9e992bc5fc873bd0fff7ef16a4405130ec42f2ce3d2b735ba5d3d4eb70f", size = 811161, upload-time = "2025-11-01T11:53:23.811Z" }, -] - [[package]] name = "referencing" version = "0.36.2" @@ -5541,25 +5412,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" }, ] -[[package]] -name = "tiktoken" -version = "0.12.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "regex" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = 
"sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" }, - { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" }, - { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" }, - { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" }, - { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" }, - { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" }, - { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = 
"2025-10-06T20:21:59.876Z" }, -] - [[package]] name = "timm" version = "1.0.25" @@ -6050,15 +5902,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, ] -[[package]] -name = "types-awscrt" -version = "0.31.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/76/26/0aa563e229c269c528a3b8c709fc671ac2a5c564732fab0852ac6ee006cf/types_awscrt-0.31.3.tar.gz", hash = "sha256:09d3eaf00231e0f47e101bd9867e430873bc57040050e2a3bd8305cb4fc30865", size = 18178, upload-time = "2026-03-08T02:31:14.569Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/e5/47a573bbbd0a790f8f9fe452f7188ea72b212d21c9be57d5fc0cbc442075/types_awscrt-0.31.3-py3-none-any.whl", hash = "sha256:e5ce65a00a2ab4f35eacc1e3d700d792338d56e4823ee7b4dbe017f94cfc4458", size = 43340, upload-time = "2026-03-08T02:31:13.38Z" }, -] - [[package]] name = "types-python-dateutil" version = "2.9.0.20250516" @@ -6077,15 +5920,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/bb/9bc26fcf5155bd25efeca35f8ba6bffb8b3c9da2baac8bf40067606418f3/types_regex-2026.2.28.20260301-py3-none-any.whl", hash = "sha256:7da7a1fe67528238176a5844fd435ca90617cf605341308686afbc579fdea5c0", size = 11130, upload-time = "2026-03-01T04:11:11.454Z" }, ] -[[package]] -name = "types-s3transfer" -version = "0.16.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/fe/64/42689150509eb3e6e82b33ee3d89045de1592488842ddf23c56957786d05/types_s3transfer-0.16.0.tar.gz", hash = "sha256:b4636472024c5e2b62278c5b759661efeb52a81851cde5f092f24100b1ecb443", size = 13557, upload-time = "2025-12-08T08:13:09.928Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/98/27/e88220fe6274eccd3bdf95d9382918716d312f6f6cef6a46332d1ee2feff/types_s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:1c0cd111ecf6e21437cb410f5cddb631bfb2263b77ad973e79b9c6d0cb24e0ef", size = 19247, upload-time = "2025-12-08T08:13:08.426Z" }, -] - [[package]] name = "typing-extensions" version = "4.15.0" @@ -6217,15 +6051,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" }, ] -[[package]] -name = "uuid6" -version = "2025.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ca/b7/4c0f736ca824b3a25b15e8213d1bcfc15f8ac2ae48d1b445b310892dc4da/uuid6-2025.0.1.tar.gz", hash = "sha256:cd0af94fa428675a44e32c5319ec5a3485225ba2179eefcf4c3f205ae30a81bd", size = 13932, upload-time = "2025-07-04T18:30:35.186Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3d/b2/93faaab7962e2aa8d6e174afb6f76be2ca0ce89fde14d3af835acebcaa59/uuid6-2025.0.1-py3-none-any.whl", hash = "sha256:80530ce4d02a93cdf82e7122ca0da3ebbbc269790ec1cb902481fa3e9cc9ff99", size = 6979, upload-time = "2025-07-04T18:30:34.001Z" }, -] - [[package]] name = "uv" version = "0.10.10" From 40d7e95faa12e7576ba2cf98e9587fe2ea850605 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Wed, 18 Mar 2026 20:09:26 -0400 Subject: [PATCH 4/9] Update MEP directory path in dashboard.py for consistency with new structure --- .../src/agentic_chartqapro_eval/eval/dashboard.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py index 5bcdbee..7ea936a 100644 --- 
a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py @@ -88,7 +88,7 @@ def load_meps(mep_dir: str) -> dict: mep_dir_input = st.sidebar.text_input( "MEP directory", - value="meps/openai_openai/chartqapro/test", + value="meps/gemini_gemini/chartqapro/test", help="Directory containing .json MEP files", ) metrics_input = st.sidebar.text_input("metrics.jsonl", value="output/metrics.jsonl", help="Output of eval_outputs.py") From 4928ce473f5f3967fc72265c039a6b7c81ca5d06 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Thu, 19 Mar 2026 11:11:39 -0400 Subject: [PATCH 5/9] Rename Opik references to Langfuse in agentic VQA evaluation agents and update tracing parameters for observability. --- .../src/agentic_chartqapro_eval/agents/planner_agent.py | 2 +- .../src/agentic_chartqapro_eval/agents/verifier_agent.py | 2 +- .../src/agentic_chartqapro_eval/agents/vision_agent.py | 4 ++-- .../src/agentic_chartqapro_eval/runner/run_generate_meps.py | 4 ++-- .../src/agentic_chartqapro_eval/tools/ocr_reader_tool.py | 4 ++-- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py index 5e34591..42abad4 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py @@ -148,7 +148,7 @@ def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict, ---------- sample : PerceivedSample The question and context to plan for. - opik_trace : Any, optional + lf_trace : Any, optional Observability object for logging.
Returns diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py index fabd702..a182f10 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py @@ -216,7 +216,7 @@ def run( The inspection plan used by the previous agent. vision_parsed : dict The draft answer and explanation to audit. - opik_trace : Any, optional + lf_trace : Any, optional Tracing object for observability. Returns diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py index 4832d66..28bee45 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py @@ -189,7 +189,7 @@ def _build_tool(self, lf_trace: Any = None) -> VisionQATool: Parameters ---------- - opik_trace : Any, optional + lf_trace : Any, optional A tracing object for observability. Returns @@ -228,7 +228,7 @@ def run( The question and image to analyze. plan : dict The inspection procedure to follow. - opik_trace : Any, optional + lf_trace : Any, optional Trace object for execution tracking. ocr_result : dict, optional Ground-truth OCR data for grounding.
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py index bad9e28..ae97c5e 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py @@ -138,8 +138,8 @@ def process_sample( # noqa: PLR0915 Unique identifier for the current evaluation run. out_dir : str Directory where the resulting MEP JSON should be saved. - opik_client : object, optional - The Opik client for tracing. + langfuse_client : object, optional + The Langfuse client for tracing and observability. verifier_agent : VerifierAgent, optional The agent for pass 2.5 verification. ocr_tool : OcrReaderTool, optional diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py index 41677dd..2655b12 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py @@ -115,7 +115,7 @@ def _run(self, image_path: str) -> str: start_ts = datetime.now(timezone.utc).isoformat() t0 = time.time() - opik_span = open_llm_span( + lf_span = open_llm_span( self.lf_trace, name="ocr_reader_tool", input_data={"image_path": image_path}, @@ -155,7 +155,7 @@ def _run(self, image_path: str) -> str: usage = provider_meta.get("usage", {}) close_span( - opik_span, + lf_span, output={"raw_text": raw_text}, usage=usage if usage else None, error=error_str, From 01f6051fbfd361fab5fb106fe5233444fdcda577 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Thu, 19 Mar 2026 11:31:24 -0400 Subject: [PATCH 6/9] Add integration test instructions to README for API key validation --- README.md | 12 +++ 
implementations/agentic_vqa_eval/README.md | 88 +++++++++++++++++----- 2 files changed, 83 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 8eb159a..3f81971 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,18 @@ recent research, with fully reproducible notebooks and evaluation pipelines. uv run jupyter lab ``` +5. Run integration tests to validate that your API keys are set up correctly: + + ```bash + uv run --env-file .env pytest -sv tests/tool_tests/test_integration.py + ``` + + > **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root): + > + > ```bash + > onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force + > ``` + ## License This project is licensed under the terms of the [LICENSE](LICENSE.md) file in the root directory. diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md index 353fe8f..5541679 100644 --- a/implementations/agentic_vqa_eval/README.md +++ b/implementations/agentic_vqa_eval/README.md @@ -12,6 +12,8 @@ The core contribution is the **Model Evaluation Packet (MEP)** — a portable JS ## Architecture Overview +<details>
+<summary>Show pipeline diagram</summary> ``` ┌──────────────────────────────────────────────────────────┐ @@ -112,6 +114,8 @@ The core contribution is the **Model Evaluation Packet (MEP)** — a portable JS (summary.csv by config × question_type) ``` +</details>
+ ### Explainability at Four Levels This framework produces explainability signals at four distinct levels: @@ -143,6 +147,9 @@ This framework produces explainability signals at four distinct levels: ## Package Dependencies +<details>
+<summary>Show dependencies table</summary> + | Package | Version | Purpose | |---|---|---| | `crewai` | 1.10.1 | Multi-agent framework: Agent, Task, Crew, LLM, BaseTool | @@ -159,10 +166,15 @@ This framework produces explainability signals at four distinct levels: | `streamlit` | ≥1.32 | Interactive evaluation dashboard | | `jupyter` / `ipykernel` | latest | Analysis notebook | +</details>
+ --- ## Internal Package Structure +<details>
+<summary>Show package tree</summary> + ``` src/agentic_chartqapro_eval/ ├── utils/ @@ -211,16 +223,18 @@ src/agentic_chartqapro_eval/ └── ingest.py — Retroactively import existing MEP files into Langfuse ``` +</details>
+ --- ## Getting Started ### 1. Install dependencies -From the **root of the repository**, install the `ref6-agentic-xai-eval` dependency group using `uv`: +From the **root of the repository**, install the `agentic-xai-eval` dependency group using `uv`: ```bash -uv sync --group ref6-agentic-xai-eval +uv sync --group agentic-xai-eval source .venv/bin/activate ``` @@ -240,7 +254,19 @@ cp .env.example .env The `.env` file lives at the **repo root**. `load_dotenv()` searches upward from the working directory, so it is found automatically regardless of which subdirectory you run commands from. -### 3. Generate MEPs (run the agentic pipeline) +### 3. Run integration tests to validate your API keys + +```bash +uv run --env-file .env pytest -sv tests/tool_tests/test_integration.py +``` + +> **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root): +> +> ```bash +> onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force +> ``` + +### 4. Generate MEPs (run the agentic pipeline) > **Note:** All `uv run` commands below use `$(git rev-parse --show-toplevel)` so they work from any directory in the repo — it resolves the repo root for `--env-file`, while `--directory` ensures outputs (`meps/`, `output/`) are written inside `implementations/agentic_vqa_eval/`. @@ -297,7 +323,7 @@ When OCR is skipped, `"ocr": null` appears in the MEP and `"ocr_ms": 0.0` in tim **Context injection:** The VisionAgent uses a single shared prompt template (`agents/prompts/vision.txt`) that contains an `{ocr_block}` placeholder. When OCR ran successfully, this block is populated with the structured OCR fields (chart type, title, axis labels, legend). 
When OCR is skipped or produced no output, `{ocr_block}` renders as an empty string — the prompt is otherwise identical. This is a useful example of conditional context injection: the same template handles both modes without branching at the prompt level. -### 4. Evaluate outputs (Pass 1 — accuracy + judge) +### 5. Evaluate outputs (Pass 1 — accuracy + judge) ```bash uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \ @@ -314,7 +340,7 @@ When the verifier ran, two extra columns are present: The `predicted` column always reflects the **final answer** — the verifier's output when it ran, or the vision agent's output when skipped. This means accuracy scores automatically capture any corrections made by the verifier. -### 5. Evaluate traces (Pass 2 — latency and replayability) +### 6. Evaluate traces (Pass 2 — latency and replayability) ```bash uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_traces \ @@ -322,7 +348,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev --out output/trace_metrics.jsonl ``` -### 6. Run Top-K evaluation (hit@1/2/3) +### 7. Run Top-K evaluation (hit@1/2/3) Re-queries the VLM for each MEP asking for the 3 most likely candidate answers: @@ -337,7 +363,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev This pass does **not** modify existing MEPs or `metrics.jsonl`. -### 7. Summarize results +### 8. 
Summarize results ```bash uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.summarize \ @@ -345,7 +371,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev --out output/summary.csv ``` -### 8. Failure taxonomy (Pass 4 — VLM-based diagnosis) +### 9. Failure taxonomy (Pass 4 — VLM-based diagnosis) This pass asks **why** the agent was wrong, not just **that** it was wrong. A VLM is given the original chart image alongside the wrong answer, the correct answer, the agent's explanation, and the inspection plan — so it can make a *visual* diagnosis of the failure mode. @@ -360,6 +386,9 @@ Each line in `taxonomy.jsonl` contains a `failure_type` (one of the categories b **Failure categories:** +<details>
+<summary>Show failure categories</summary> + | Category | Description | |---|---| | `correct` | Model got it right — no VLM call made | @@ -372,6 +401,8 @@ Each line in `taxonomy.jsonl` contains a `failure_type` (one of the categories b | `extraction_error` | Could not locate the relevant data in the chart at all | | `other` | Does not fit any category above | +</details>
+ **Why VLM instead of text-only LLM?** A text-only judge can only read the agent's description of what it saw. A VLM can independently verify whether the axis labels were actually ambiguous, whether the cited data point actually appears in the image, or whether the legend entries are genuinely confusing — producing a grounded diagnosis rather than a guess. @@ -403,7 +434,7 @@ for sid in revised: " ``` -### 9. Visualization & Reporting +### 10. Visualization & Reporting #### HTML report (no extra dependencies) @@ -459,6 +490,11 @@ Pre-built cells walk through: loading MEPs, accuracy by question type, verifier Langfuse is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Langfuse is purely additive. +> **Optional:** If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops and the pipeline runs exactly as before. + +<details>
+<summary>Show Langfuse setup and usage</summary> + ### What Langfuse gives you | Feature | Detail | @@ -510,8 +546,6 @@ LANGFUSE_SECRET_KEY=sk-lf-... # LANGFUSE_HOST=https://cloud.langfuse.com # default; change for self-hosted ``` -The framework auto-detects these variables. If they are absent, all Langfuse calls are silent no-ops and the pipeline runs exactly as before. - ### 3. Push prompt versions to Langfuse Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used. @@ -531,11 +565,7 @@ This creates a dataset named `ChartQAPro_test` in Langfuse containing one item p ### 5. Live tracing (automatic on new runs) -No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically: -- registers the dataset and versions the prompts at run start -- opens a Langfuse trace per sample -- creates `planner` and `vision_qa_tool` child generations with inputs, outputs, and token usage -- stores the `lf_trace_id` in the MEP for later score attachment +No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically registers the dataset and versions the prompts at run start, opens a Langfuse trace per sample, creates child generations with inputs/outputs/token usage, and stores the `lf_trace_id` in the MEP for later score attachment. ```bash uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \ @@ -562,10 +592,15 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev --metrics_file metrics.jsonl # optional: attaches scores if available ``` +</details>
+ --- ## MEP Schema +<details>
+<summary>Show full MEP schema and field reference</summary> + Each MEP file is a self-contained JSON evaluation artifact: ```json @@ -620,17 +655,20 @@ Each MEP file is a self-contained JSON evaluation artifact: }, "timestamps": { "planner_ms": 2185, "ocr_ms": 1243, "vision_ms": 5684, "verifier_ms": 3712 }, "errors": [], - "lf_trace_id": "abc123..." // present when Langfuse tracing is active + "lf_trace_id": "abc123..." } ``` `ocr` is `null` when `--no_ocr` is passed. When present, `ocr.parsed` contains: `chart_type`, `title`, `x_axis`, `y_axis`, `legend`, `data_labels`, `annotations`. `verifier` is `null` when `--no_verifier` was passed. When present, `verifier.verdict` is one of: + - `"confirmed"` — second model agreed with the draft answer - `"revised"` — second model caught an error and corrected the answer - `"skipped"` — verifier ran but fell back due to missing image or error +</details>
+ --- ## Resources @@ -646,33 +684,46 @@ Each MEP file is a self-contained JSON evaluation artifact: ## FAQ +<details>
+<summary>Show all FAQs</summary> + ### 1. What is the purpose of the MEP schema? + The Model Evaluation Packet (MEP) schema is designed to provide a comprehensive, portable, and reproducible trace of the evaluation process. It captures all relevant details, including the inspection plan, tool calls, timestamps, and errors, enabling post-hoc analysis and comparison across models. ### 2. Can I use a different dataset with this framework? + Yes, the framework is modular and supports other datasets as long as they are compatible with the expected input format (question, chart image, expected answer). You may need to implement a custom dataset loader in `src/agentic_chartqapro_eval/datasets/`. ### 3. How do I add a new vision or planner backend? + To add a new backend, you need to: + - Implement the corresponding tool or agent in `src/agentic_chartqapro_eval/tools/` or `src/agentic_chartqapro_eval/agents/`. - Update the configuration options in `run_generate_meps.py` to include the new backend. ### 4. What happens if the VisionAgent produces malformed JSON? + The framework uses the `json_repair` library to attempt to fix malformed JSON outputs. If repair fails, the error is logged in the MEP under the `errors` field. ### 5. How can I customize the evaluation rubric? + The evaluation rubric is defined in `src/agentic_chartqapro_eval/eval/judge.py`. You can modify the scoring dimensions or add new ones by editing the `judge` function. ### 6. Is it possible to run the framework without API calls? + Yes, you can use pre-generated MEPs for evaluation by skipping the generation step. This is useful for offline analysis or when API usage is restricted. ### 7. How do I handle large datasets efficiently? + For large datasets, consider: + - Using the `--n` flag to process a subset of samples. - Increasing the `--workers` count to parallelize processing. - Running the pipeline on a machine with sufficient memory and disk space. ### 8. Where can I find more examples or tutorials?
+ Refer to the Resources section for links to documentation, datasets, and related research papers. Additional examples may be added in future updates. ### 9. How does the VerifierAgent differ from the LLM judge? @@ -690,4 +741,7 @@ They serve different purposes and run at different times: The verifier improves the pipeline's answer quality; the judge measures the pipeline's reasoning quality. ### 10. Do I need Langfuse to run the framework? + No. Langfuse is entirely optional. If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops. The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before. + +</details>
From 335b468dafd55e614d7ace3e6079e60ad6ac43ae Mon Sep 17 00:00:00 2001 From: Aravind N Date: Thu, 19 Mar 2026 11:42:54 -0400 Subject: [PATCH 7/9] Update Langfuse integration to require version 4 and add fallback for missing attributes propagation --- .../langfuse_integration/tracing.py | 24 +++++++++++++++++-- pyproject.toml | 2 +- uv.lock | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py index 546c99c..89d5f47 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py @@ -8,11 +8,31 @@ from contextlib import contextmanager from typing import Optional -from langfuse import propagate_attributes + +try: + from langfuse import propagate_attributes # requires langfuse>=4 +except Exception: + + @contextmanager # type: ignore[misc] + def propagate_attributes(**_: object): # type: ignore[misc] + """Fallback no-op context manager if langfuse v4 is not available.""" + yield def _normalize_usage(usage: dict) -> dict: - """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys.""" + """ + Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys. + + Parameters + ---------- + usage : dict + The raw usage dict from the provider. + + Returns + ------- + dict + Normalized usage details for Langfuse. 
+ """ normalized: dict = {} # OpenAI keys if "prompt_tokens" in usage: diff --git a/pyproject.toml b/pyproject.toml index 265a883..3851f9d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ dependencies = [ "python-dotenv>=1.2.2", "scikit-learn>=1.5.2", "transformers>=4.47.0", - "langfuse>=3.10.3", + "langfuse>=4", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index db5369c..8ec741e 100644 --- a/uv.lock +++ b/uv.lock @@ -2099,7 +2099,7 @@ requires-dist = [ { name = "ipywidgets", specifier = ">=8.1.7" }, { name = "jupyter", specifier = ">=1.1.1" }, { name = "jupyterlab", specifier = ">=4.4.8" }, - { name = "langfuse", specifier = ">=3.10.3" }, + { name = "langfuse", specifier = ">=4" }, { name = "matplotlib", specifier = ">=3.10.5" }, { name = "numpy", specifier = ">=1.26,<2.0" }, { name = "openai", specifier = ">=2.8.0" }, From 103570945446e38b8dbb1232364035c06bfd22f8 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Thu, 19 Mar 2026 12:33:39 -0400 Subject: [PATCH 8/9] Fix integration test command path in README for consistency --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f81971..53a0e41 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ recent research, with fully reproducible notebooks and evaluation pipelines. 5. 
Run integration tests to validate that your API keys are set up correctly: ```bash - uv run --env-file .env pytest -sv tests/tool_tests/test_integration.py + uv run --env-file .env pytest -sv tests/test_integration.py ``` > **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root): From 926c0888b7d6a434b1d8ad7503bde24a1248b076 Mon Sep 17 00:00:00 2001 From: Aravind N Date: Thu, 19 Mar 2026 17:45:43 -0400 Subject: [PATCH 9/9] Add Google GenAI and OpenAI instrumentation support in Langfuse integration --- .../langfuse_integration/client.py | 23 ++++++ pyproject.toml | 2 + uv.lock | 78 +++++++++++++++++++ 3 files changed, 103 insertions(+) diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py index c725ce9..aefdf1b 100644 --- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py +++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py @@ -11,6 +11,21 @@ from langfuse import Langfuse +try: + from openinference.instrumentation.google_genai import GoogleGenAIInstrumentor + + _google_instrumentor = GoogleGenAIInstrumentor() +except Exception: + _google_instrumentor = None # type: ignore[assignment] + +try: + from openinference.instrumentation.openai import OpenAIInstrumentor + + _openai_instrumentor = OpenAIInstrumentor() +except Exception: + _openai_instrumentor = None # type: ignore[assignment] + + _client = None _initialised = False @@ -51,6 +66,14 @@ def get_client(): kwargs["host"] = host _client = Langfuse(**kwargs) + # Activate OTel auto-instrumentation so provider SDK calls (Google GenAI, + # OpenAI) are captured as detailed child spans inside Langfuse traces. 
+ if _google_instrumentor is not None: + with suppress(Exception): + _google_instrumentor.instrument() + if _openai_instrumentor is not None: + with suppress(Exception): + _openai_instrumentor.instrument() except Exception as exc: print(f"[langfuse] client init failed: {exc}") _client = None diff --git a/pyproject.toml b/pyproject.toml index 3851f9d..3b66108 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,6 +30,8 @@ dependencies = [ agentic-xai-eval = [ "crewai>=1.6.1", "google-genai>=1.67.0", + "openinference-instrumentation-google-genai>=0.1.0", + "openinference-instrumentation-openai>=0.1.0", "streamlit>=1.55.0", ] dev = [ diff --git a/uv.lock b/uv.lock index 8ec741e..8e4c60b 100644 --- a/uv.lock +++ b/uv.lock @@ -2035,6 +2035,8 @@ dependencies = [ agentic-xai-eval = [ { name = "crewai" }, { name = "google-genai" }, + { name = "openinference-instrumentation-google-genai" }, + { name = "openinference-instrumentation-openai" }, { name = "streamlit" }, ] dev = [ @@ -2113,6 +2115,8 @@ requires-dist = [ agentic-xai-eval = [ { name = "crewai", specifier = ">=1.6.1" }, { name = "google-genai", specifier = ">=1.67.0" }, + { name = "openinference-instrumentation-google-genai", specifier = ">=0.1.0" }, + { name = "openinference-instrumentation-openai", specifier = ">=0.1.0" }, { name = "streamlit", specifier = ">=1.55.0" }, ] dev = [ @@ -3675,6 +3679,65 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" }, ] +[[package]] +name = "openinference-instrumentation" +version = "0.1.46" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "openinference-semantic-conventions" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-sdk" }, + { name = "wrapt" }, +] +sdist = { 
url = "https://files.pythonhosted.org/packages/64/8d/9b76b43e8b2ee2ccf1fe15b21c924095f9c0e4839919bcd4951b1c99c2ab/openinference_instrumentation-0.1.46.tar.gz", hash = "sha256:0b520002a1c682c525dcab49005c209bfd71611e8e4e4933b49779d5e899e6db", size = 23937, upload-time = "2026-03-04T10:13:48.883Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/25/d1/f6668492152a4180492044313e2dc427fbc237904f6bb1629abd030e3469/openinference_instrumentation-0.1.46-py3-none-any.whl", hash = "sha256:f7b63ccd5f93ce82e4e40035c9faa6b021984cbe06ad791f4cf033551533bc48", size = 30124, upload-time = "2026-03-04T10:13:47.613Z" }, +] + +[[package]] +name = "openinference-instrumentation-google-genai" +version = "0.1.13" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "openinference-instrumentation" }, + { name = "openinference-semantic-conventions" }, + { name = "opentelemetry-api" }, + { name = "opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/7f/b0/91785c0d8740c488c7d9c0789985cccafbd4dad53266242a19d511603feb/openinference_instrumentation_google_genai-0.1.13.tar.gz", hash = "sha256:088a7300264486a41db2ab44b08848aaac788d0b6a3d61ff12d66b9b3b0703fb", size = 55136, upload-time = "2026-03-11T04:45:48.223Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/72/38/9c5f8ff01cce2e93b92abd6779c6d5bad8ddc399d26a5debc7ed9a32782c/openinference_instrumentation_google_genai-0.1.13-py3-none-any.whl", hash = "sha256:b14485015a4603accba17f77636501b68bc163e95bc9cf65ffb64caf60544cfc", size = 29135, upload-time = "2026-03-11T04:45:45.471Z" }, +] + +[[package]] +name = "openinference-instrumentation-openai" +version = "0.1.42" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "openinference-instrumentation" }, + { name = "openinference-semantic-conventions" }, + { name = "opentelemetry-api" }, + { name = 
"opentelemetry-instrumentation" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "typing-extensions" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/f1/e4/cf114f6fedc90dde6e1d4062e55686542f8b7636a4d3340b81a49b1a09a8/openinference_instrumentation_openai-0.1.42.tar.gz", hash = "sha256:6f6b340292ab7dd7dc2e9a944958f7f812108efaafbfbcaa3f7ba205744ad1ce", size = 22839, upload-time = "2026-03-11T04:45:51.37Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c5/88/eaaa4840bf1ed8ff8c0927cd6ad5653ee0cfac14bfcb4e1e8f06fb0be9e8/openinference_instrumentation_openai-0.1.42-py3-none-any.whl", hash = "sha256:e7ff7b98612102d4a3e342842d3dd231709ff51abdc4b193e5df09e9afcfac0f", size = 30333, upload-time = "2026-03-11T04:45:48.535Z" }, +] + +[[package]] +name = "openinference-semantic-conventions" +version = "0.1.28" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/df/32/c79bf8bd3ea5a00e492449b31ca600bbc2a8e88a301e42c872af925a156c/openinference_semantic_conventions-0.1.28.tar.gz", hash = "sha256:6388465174e8ab3f27ebc6a9e9bb2e1b804d30caefb57234e16db874da1c6a7b", size = 12893, upload-time = "2026-03-11T04:45:46.543Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/04/40/34b570462c3ce250277254bb0cca655eb39b64c0dffe63cd7751f103f8d6/openinference_semantic_conventions-0.1.28-py3-none-any.whl", hash = "sha256:a2fed5bb167aa56c1c7448cdb7a8d775f989339ba1f8b04a7b45d4f8388cccfb", size = 10522, upload-time = "2026-03-11T04:45:45.423Z" }, +] + [[package]] name = "openpyxl" version = "3.1.5" @@ -3748,6 +3811,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a0/3a/8865d6754e61c9fb170cdd530a124a53769ee5f740236064816eb0ca7301/opentelemetry_exporter_otlp_proto_http-1.40.0-py3-none-any.whl", hash = "sha256:a8d1dab28f504c5d96577d6509f80a8150e44e8f45f82cdbe0e34c99ab040069", size = 19960, upload-time = "2026-03-04T14:17:07.153Z" }, ] +[[package]] 
+name = "opentelemetry-instrumentation" +version = "0.61b0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "opentelemetry-api" }, + { name = "opentelemetry-semantic-conventions" }, + { name = "packaging" }, + { name = "wrapt" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/da/37/6bf8e66bfcee5d3c6515b79cb2ee9ad05fe573c20f7ceb288d0e7eeec28c/opentelemetry_instrumentation-0.61b0.tar.gz", hash = "sha256:cb21b48db738c9de196eba6b805b4ff9de3b7f187e4bbf9a466fa170514f1fc7", size = 32606, upload-time = "2026-03-04T14:20:16.825Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d8/3e/f6f10f178b6316de67f0dfdbbb699a24fbe8917cf1743c1595fb9dcdd461/opentelemetry_instrumentation-0.61b0-py3-none-any.whl", hash = "sha256:92a93a280e69788e8f88391247cc530fd81f16f2b011979d4d6398f805cfbc63", size = 33448, upload-time = "2026-03-04T14:19:02.447Z" }, +] + [[package]] name = "opentelemetry-proto" version = "1.40.0"