From d3ea39114292fbaf27668529f632b35da1af7776 Mon Sep 17 00:00:00 2001
From: aravind-3105 <aravindn1308@outlook.com>
Date: Mon, 16 Mar 2026 17:29:04 -0400
Subject: [PATCH 1/9] Integrate Langfuse observability into agentic VQA
 evaluation, replacing Opik references and updating tracing mechanisms

---
 implementations/agentic_vqa_eval/README.md    | 229 +++++-------------
 .../agents/planner_agent.py                   |   8 +-
 .../agents/verifier_agent.py                  |   6 +-
 .../agents/vision_agent.py                    |  12 +-
 .../eval/error_taxonomy.py                    |  22 +-
 .../eval/eval_outputs.py                      |  21 +-
 .../agentic_chartqapro_eval/eval/eval_topk.py |  21 +-
 .../langfuse_integration/__init__.py          |   1 +
 .../langfuse_integration/client.py            |  56 +++++
 .../langfuse_integration/dataset.py           |  69 ++++++
 .../langfuse_integration/ingest.py            | 174 +++++++++++++
 .../langfuse_integration/prompts.py           |  81 +++++++
 .../langfuse_integration/tracing.py           | 157 ++++++++++++
 .../src/agentic_chartqapro_eval/mep/schema.py |   2 +-
 .../runner/run_generate_meps.py               |  50 ++--
 .../tools/ocr_reader_tool.py                  |   6 +-
 .../tools/vision_qa_tool.py                   |  10 +-
 pyproject.toml                                |   3 +-
 18 files changed, 689 insertions(+), 239 deletions(-)
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
 create mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py

diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md
index a939c79..d28a821 100644
--- a/implementations/agentic_vqa_eval/README.md
+++ b/implementations/agentic_vqa_eval/README.md
@@ -6,7 +6,7 @@ Welcome to **Reference Implementation 6** of the Survey Paper on Agentic Visual
 
 The core contribution is the **Model Evaluation Packet (MEP)** — a portable JSON trace that captures everything: the inspection plan, the vision agent's reasoning, the verifier's critique, tool call logs, timestamps, and errors. This enables reproducible evaluation, post-hoc explainability analysis, and model comparison across VLM backends.
 
-**Observability layer:** Integration with **[Opik](https://github.com/comet-ml/opik)** (self-hosted) for live trace visualization, prompt versioning, dataset registration, and experiment comparison across configs — all without changing the MEP ground-truth artifacts.
+**Observability layer:** Integration with **[Langfuse](https://langfuse.com)** (cloud or self-hosted) for live trace visualization, prompt versioning, dataset registration, and experiment comparison across configs — all without changing the MEP ground-truth artifacts.
 
 ---
 
@@ -154,7 +154,7 @@ This framework produces explainability signals at four distinct levels:
 | `json_repair` | 0.25.3 | Fallback JSON parsing when LLM output is malformed |
 | `python-dotenv` | 1.1.1 | API key management via `.env` file |
 | `pandas` | 2.3.3 | Metric aggregation and summary CSV generation |
-| `opik` | latest | Trace visualization, prompt versioning, dataset registration |
+| `langfuse` | latest | Trace visualization, prompt versioning, dataset registration |
 | `matplotlib` | ≥3.7 | Charts in notebook and dashboard |
 | `streamlit` | ≥1.32 | Interactive evaluation dashboard |
 | `jupyter` / `ipykernel` | latest | Analysis notebook |
@@ -203,12 +203,12 @@ src/agentic_chartqapro_eval/
 │   ├── dashboard.py        — Streamlit interactive dashboard: sample browser, chart image viewer
 │   └── summarize.py        — Aggregate metrics.jsonl → summary.csv
 │
-└── opik_integration/
-    ├── client.py           — Opik client singleton (gracefully disabled if not configured)
+└── langfuse_integration/
+    ├── client.py           — Langfuse client singleton (gracefully disabled if not configured)
     ├── tracing.py          — sample_trace(), open_llm_span(), close_span() helpers
-    ├── prompts.py          — Push planner.txt / vision.txt to Opik Prompt Library
-    ├── dataset.py          — Register ChartQAPro samples as an Opik Dataset
-    └── ingest.py           — Retroactively import existing MEP files into Opik
+    ├── prompts.py          — Push planner.txt / vision.txt to Langfuse Prompt Management
+    ├── dataset.py          — Register ChartQAPro samples as a Langfuse Dataset
+    └── ingest.py           — Retroactively import existing MEP files into Langfuse
 ```
 
 ---
@@ -217,10 +217,10 @@ src/agentic_chartqapro_eval/
 
 ### 1. Install dependencies
 
-From the **root of the repository**, install the `agentic-xai-eval` dependency group using `uv`:
+From the **root of the repository**, install the `ref6-agentic-xai-eval` dependency group using `uv`:
 
 ```bash
-uv sync --group agentic-xai-eval
+uv sync --group ref6-agentic-xai-eval
 source .venv/bin/activate
 ```
 
@@ -229,12 +229,17 @@ The `agentic_chartqapro_eval` package is automatically available — it is inclu
 ### 2. Configure API keys
 
 ```bash
+# From the repo root:
 cp .env.example .env
 # Edit .env and fill in your keys:
 #   OPENAI_API_KEY=...
 #   GEMINI_API_KEY=...
+#   LANGFUSE_PUBLIC_KEY=...   # optional — for observability
+#   LANGFUSE_SECRET_KEY=...
 ```
 
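+The `.env` file lives at the **repo root**. By default, `load_dotenv()` searches upward through parent directories, starting from the directory of the Python file that calls it (not the current working directory), so the repo-root `.env` is found automatically regardless of which subdirectory you run commands from.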
+
 ### 3. Generate MEPs (run the agentic pipeline)
 
 Run on 25 test samples using GPT-4o for planner, vision, and verifier:
@@ -282,7 +287,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
 
 To disable OCR entirely (matches the original pipeline behaviour, faster and lower cost):
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini --no_ocr
 ```
 
@@ -320,7 +325,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \
 Re-queries the VLM for each MEP asking for the 3 most likely candidate answers:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_topk \
+uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_topk \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --out output/topk_metrics.jsonl \
     --backend gemini \
@@ -333,7 +338,7 @@ This pass does **not** modify existing MEPs or `metrics.jsonl`.
 ### 7. Summarize results
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.summarize \
+uv run --env-file .env -m agentic_chartqapro_eval.eval.summarize \
     --metrics output/metrics.jsonl \
     --out output/summary.csv
 ```
@@ -448,17 +453,17 @@ Pre-built cells walk through: loading MEPs, accuracy by question type, verifier
 
 ---
 
-## Opik Observability (Self-Hosted)
+## Langfuse Observability
 
-Opik is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Opik is purely additive.
+Langfuse is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Langfuse is purely additive.
 
-### What Opik gives you
+### What Langfuse gives you
 
 | Feature | Detail |
 |---|---|
-| **Trace viewer** | Every sample becomes a trace with `planner` and `vision_qa_tool` child spans showing prompts, outputs, token usage, and latency |
+| **Trace viewer** | Every sample becomes a trace with `planner` and `vision_qa_tool` child generations showing prompts, outputs, token usage, and latency |
 | **Feedback scores** | `answer_accuracy` and all five `judge_*` rubric scores are attached to each trace after eval |
-| **Prompt Library** | `planner.txt` and `vision.txt` are versioned — every experiment links to the exact prompt version used |
+| **Prompt Management** | `planner.txt` and `vision.txt` are versioned — every experiment links to the exact prompt version used |
 | **Dataset registry** | ChartQAPro samples are registered so experiments formally reference a dataset version |
 | **Experiment comparison** | `openai_openai` vs `gemini_gemini` side-by-side with accuracy distributions and latency CDFs |
 
@@ -468,183 +473,76 @@ Opik is an open-source LLM observability platform that adds a live visualization
 Trace: chartqapro/000002  [openai_openai | standard | 11.4s]
   input:    {question, expected_output}
   output:   {answer, explanation}
-  feedback: answer_accuracy=1.0, judge_explanation_quality=0.9, ...
-  ├── Span: planner          [llm | gpt-4o | 2.1s]
+  scores:   answer_accuracy=1.0, judge_explanation_quality=0.9, ...
+  ├── Generation: planner          [gpt-4o | 2.1s]
   │     input: {prompt}
   │     output: {plan_steps: [...], parse_error: false}
-  ├── Span: vision_agent     [llm | gpt-4o | 5.6s]
-  │     └── Span: vision_qa_tool  [llm | gpt-4o | 2.9s | 688 tokens]
+  ├── Generation: vision_agent     [gpt-4o | 5.6s]
+  │     └── Generation: vision_qa_tool  [gpt-4o | 2.9s | 688 tokens]
   │           input:  {image_path, question, plan_steps}
   │           output: {answer, explanation}
-  └── Span: verifier         [llm | gpt-4o | 3.7s]
+  └── Generation: verifier         [gpt-4o | 3.7s]
         input:  {prompt, draft_answer}
         output: {verdict: "confirmed" | "revised", answer, reasoning}
 ```
 
-### 1. Intall and Setup Docker
-
-#### Update packages and install Docker.
-
-```bash
-sudo apt update
-sudo apt install -y docker.io
-```
-
-Verify installation:
-
-```bash
-docker --version
-```
-
-#### Start the Docker daemon
-
-Some cloud environments do not run systemd, so start Docker manually.
-
-```bash
-sudo dockerd > /tmp/dockerd.log 2>&1 &
-```
-
-Verify Docker is running:
-
-```bash
-sudo docker info
-```
-
-#### Install Docker Compose v2
-
-Create plugin directory:
-
-```bash
-sudo mkdir -p /usr/lib/docker/cli-plugins
-```
-
-Download the Compose plugin:
-
-```bash
-sudo curl -SL https://github.com/docker/compose/releases/download/v2.27.0/docker-compose-linux-x86_64 \
--o /usr/lib/docker/cli-plugins/docker-compose
-```
-
-Make it executable:
-
-```bash
-sudo chmod +x /usr/lib/docker/cli-plugins/docker-compose
-```
-
-Verify installation:
-
-```bash
-docker compose version
-```
-
-Expected output example:
-
-```
-Docker Compose version v2.27.0
-```
-
-### 2. Start the self-hosted Opik stack
-
-Requires Docker Desktop (already running if you followed setup above).
-
-```bash
-# Clone the Opik repository.
-git clone https://github.com/comet-ml/opik.git /tmp/opik-server --depth=1
-# Navigate to the Docker deployment directory:
-cd /tmp/opik-server/deployment/docker-compose
-# Start the Opik stack with the 'opik' profile:
-sudo docker compose --profile opik up -d
-```
+### 1. Get API keys
 
-Dashboard is available at **http://localhost:5173** once all containers are healthy (takes ~60 seconds on first pull).
+**Cloud (recommended — no infrastructure needed):**
 
-To stop: `docker compose --profile opik down`
+1. Sign up at [cloud.langfuse.com](https://cloud.langfuse.com)
+2. Create a new project
+3. Go to **Settings → API Keys** and create a key pair
 
-#### Verify containers
+**Self-hosted:**
 
-Check running containers:
+Follow the [Langfuse self-hosting guide](https://langfuse.com/docs/deployment/self-host) to deploy with Docker Compose, then create API keys in the UI.
 
-```bash
-sudo docker ps
-```
+### 2. Configure the connection
 
-You should see containers similar to:
+Add to your `.env` at the repo root:
 
 ```
-opik-frontend-1
-opik-backend-1
-opik-python-backend-1
-opik-mysql-1
-opik-redis-1
-opik-clickhouse-1
+LANGFUSE_PUBLIC_KEY=pk-lf-...
+LANGFUSE_SECRET_KEY=sk-lf-...
+# LANGFUSE_HOST=https://cloud.langfuse.com  # default; change for self-hosted
 ```
 
-#### Access Opik
+The framework auto-detects these variables. If they are absent, all Langfuse calls are silent no-ops and the pipeline runs exactly as before.
 
-Get your VM external IP:
+### 3. Push prompt versions to Langfuse
 
-```bash
-curl ifconfig.me
-```
-
-Open the Opik UI in your browser:
-
-```
-http://<VM_EXTERNAL_IP>:5173
-```
-
-Example:
-
-```
-http://34.xx.xx.xxx:5173
-```
-
-You should now see the **Comet Opik dashboard**.
-
-
-### 3. Configure the connection
-
-Add to your `.env`:
-
-```
-OPIK_URL_OVERRIDE=http://localhost:5173/api
-```
-
-The framework auto-detects this variable. If it is absent, all Opik calls are silent no-ops and the pipeline runs exactly as before.
-
-### 4. Push prompt versions to Opik
-
-Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in the Opik Prompt Library so every future experiment links to the exact prompt version used.
+Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used.
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.prompts
+uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.prompts
 ```
 
-### 5. Register the dataset
+### 4. Register the dataset
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.dataset \
+uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.dataset \
     --split test --n 25
 ```
 
-This creates a dataset named `ChartQAPro_test` in Opik containing one item per sample (question, expected output, question type, image path).
+This creates a dataset named `ChartQAPro_test` in Langfuse containing one item per sample (question, expected output, question type, image path).
 
-### 6. Live tracing (automatic on new runs)
+### 5. Live tracing (automatic on new runs)
 
-No extra flags needed. When `OPIK_URL_OVERRIDE` is set, the pipeline automatically:
+No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically:
 - registers the dataset and versions the prompts at run start
-- opens an Opik trace per sample
-- creates `planner` and `vision_qa_tool` child spans with inputs, outputs, and token usage
-- stores the `opik_trace_id` in the MEP for later score attachment
+- opens a Langfuse trace per sample
+- creates `planner` and `vision_qa_tool` child generations with inputs, outputs, and token usage
+- stores the `lf_trace_id` in the MEP for later score attachment
 
 ```bash
 uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini --workers 4 --out meps/
 ```
 
-### 7. Attach evaluation scores
+### 6. Attach evaluation scores
 
-After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Opik traces:
+After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Langfuse traces:
 
 ```bash
 uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \
@@ -652,12 +550,12 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \
     --out metrics.jsonl
 ```
 
-### 8. Ingest existing MEPs (retroactive)
+### 7. Ingest existing MEPs (retroactive)
 
-If you have MEPs from runs before Opik was configured, import them without re-running the pipeline:
+If you have MEPs from runs before Langfuse was configured, import them without re-running the pipeline:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.ingest \
+uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.ingest \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --metrics_file metrics.jsonl   # optional: attaches scores if available
 ```
@@ -720,7 +618,7 @@ Each MEP file is a self-contained JSON evaluation artifact:
   },
   "timestamps": { "planner_ms": 2185, "ocr_ms": 1243, "vision_ms": 5684, "verifier_ms": 3712 },
   "errors": [],
-  "opik_trace_id": "tr_abc123..."   // present when Opik tracing is active
+  "lf_trace_id": "abc123..."   // present when Langfuse tracing is active
 }
 ```
 
@@ -740,7 +638,7 @@ Each MEP file is a self-contained JSON evaluation artifact:
 - **OpenAI Vision API** — GPT-4o multimodal inference for chart image understanding ([platform.openai.com](https://platform.openai.com/docs))
 - **Google Gemini API** — Alternative VLM backend for vision inference ([ai.google.dev](https://ai.google.dev/docs))
 - **LLM-as-Judge (Zheng et al., 2023)** — Methodology for using LLMs to score free-form outputs with rubric dimensions ([arXiv:2306.05685](https://arxiv.org/abs/2306.05685))
-- **Opik by Comet ML** — Open-source LLM observability platform used for tracing, prompt versioning, and experiment comparison ([github.com/comet-ml/opik](https://github.com/comet-ml/opik))
+- **Langfuse** — Open-source LLM observability platform used for tracing, prompt versioning, and experiment comparison ([langfuse.com](https://langfuse.com))
 
 ---
 
@@ -789,12 +687,5 @@ They serve different purposes and run at different times:
 
 The verifier improves the pipeline's answer quality; the judge measures the pipeline's reasoning quality.
 
-### 10. Do I need Opik to run the framework?
-No. Opik is entirely optional. If `OPIK_URL_OVERRIDE` is not set in `.env`, all Opik calls are silent no-ops. The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before.
-
-### 11. How do I stop the Opik Docker stack?
-```bash
-cd /tmp/opik-server/deployment/docker-compose
-docker compose --profile opik down
-```
-MEPs and metrics files are stored locally and are unaffected. Trace data in Opik is stored in the Docker volumes and will persist across restarts unless you run `docker compose down -v`.
+### 10. Do I need Langfuse to run the framework?
+No. Langfuse is entirely optional. If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops. The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before.
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
index fda7d27..71d554f 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
@@ -11,7 +11,7 @@
 from crewai import LLM, Agent, Crew, Task
 
 from ..datasets.perceived_sample import PerceivedSample
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 from ..utils.json_strict import parse_strict
 
 
@@ -137,7 +137,9 @@ def __init__(
         self.api_key = api_key
         self._llm = _build_llm(backend, model, api_key)
 
-    def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dict, bool, str]:
+    def run(
+        self, sample: PerceivedSample, lf_trace: Any = None
+    ) -> Tuple[str, dict, bool, str]:
         """
         Execute the planning phase for a new question.
 
@@ -165,7 +167,7 @@ def run(self, sample: PerceivedSample, opik_trace: Any = None) -> Tuple[str, dic
         prompt = build_planner_prompt(sample)
 
         span = open_llm_span(
-            opik_trace,
+            lf_trace,
             name="planner",
             input_data={"prompt": prompt},
             model=self.model,
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
index 7d91918..fabd702 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
@@ -23,7 +23,7 @@
 from openai import OpenAI
 from PIL import Image
 
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 from ..utils.json_strict import parse_strict
 
 
@@ -203,7 +203,7 @@ def run(
         sample,  # PerceivedSample
         plan: dict,
         vision_parsed: dict,
-        opik_trace: Any = None,
+        lf_trace: Any = None,
     ) -> Tuple[str, dict, bool, str]:
         """
         Critically audit a draft answer using a single VLM call.
@@ -250,7 +250,7 @@ def run(
         )
 
         span = open_llm_span(
-            opik_trace,
+            lf_trace,
             name="verifier",
             input_data={"prompt": prompt, "draft_answer": draft_answer},
             model=self.model,
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
index 550aa7c..4832d66 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
@@ -11,7 +11,7 @@
 from crewai import LLM, Agent, Crew, Task
 
 from ..datasets.perceived_sample import PerceivedSample
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 from ..tools.vision_qa_tool import VisionQATool
 from ..utils.json_strict import parse_strict
 
@@ -183,7 +183,7 @@ def __init__(
         self.agent_api_key = agent_api_key
         self.vision_api_key = vision_api_key
 
-    def _build_tool(self, opik_trace: Any = None) -> VisionQATool:
+    def _build_tool(self, lf_trace: Any = None) -> VisionQATool:
         """
         Instantiate the vision tool with the configured vision model.
 
@@ -206,14 +206,14 @@ def _build_tool(self, opik_trace: Any = None) -> VisionQATool:
             backend=self.vision_backend,
             model=self.vision_model,
             api_key=key,
-            opik_trace=opik_trace,
+            lf_trace=lf_trace,
         )
 
     def run(
         self,
         sample: PerceivedSample,
         plan: dict,
-        opik_trace: Any = None,
+        lf_trace: Any = None,
         ocr_result: Optional[dict] = None,
     ) -> Tuple[str, dict, bool, str, List[dict]]:
         """
@@ -246,12 +246,12 @@ def run(
         tool_traces : list of dict
             A log of tool interactions during the run.
         """
-        tool = self._build_tool(opik_trace=opik_trace)
+        tool = self._build_tool(lf_trace=lf_trace)
         llm = _build_llm(self.agent_backend, self.agent_model, self.agent_api_key)
         task_description = build_vision_task_description(sample, plan, ocr_result=ocr_result)
 
         vision_span = open_llm_span(
-            opik_trace,
+            lf_trace,
             name="vision_agent",
             input_data={"task_description": task_description},
             model=self.agent_model,
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
index f60b561..78b357b 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
@@ -24,7 +24,7 @@
 from openai import OpenAI
 
 from ..mep.writer import iter_meps
-from ..opik_integration.client import get_client
+from ..langfuse_integration.client import get_client
 from ..utils.json_strict import parse_strict
 
 
@@ -262,7 +262,7 @@ def main() -> None:  # noqa: PLR0915
                     row = json.loads(line)
                     accuracy_by_id[row.get("sample_id", "")] = row.get("answer_accuracy", 0.0)
 
-    opik_client = get_client()
+    lf_client = get_client()
 
     with open(args.out, "w") as f_out:
         count = 0
@@ -311,19 +311,15 @@ def main() -> None:  # noqa: PLR0915
                 f_out.write(json.dumps(row) + "\n")
                 count += 1
 
-                # Log to Opik if trace_id is available
-                opik_trace_id = mep.get("opik_trace_id")
-                if opik_client and opik_trace_id:
+                # Log to Langfuse if trace_id is available
+                lf_trace_id = mep.get("lf_trace_id")
+                if lf_client and lf_trace_id:
                     failure_type = result.get("failure_type", "other")
                     with contextlib.suppress(Exception):
-                        opik_client.log_traces_feedback_scores(
-                            [
-                                {
-                                    "id": opik_trace_id,
-                                    "name": f"failure_{failure_type}",
-                                    "value": 1.0,
-                                }
-                            ]
+                        lf_client.create_score(
+                            trace_id=lf_trace_id,
+                            name=f"failure_{failure_type}",
+                            value=1.0,
                         )
 
                 if count % 10 == 0:
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
index e196809..bb1a412 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
@@ -17,7 +17,7 @@
 from dotenv import load_dotenv
 
 from ..mep.writer import iter_meps
-from ..opik_integration.client import get_client
+from ..langfuse_integration.client import get_client
 from .judge import judge_mep
 
 
@@ -137,19 +137,22 @@ def evaluate_mep(
         for k, v in judge_scores.items():
             metrics[f"judge_{k}"] = v
 
-    # Log all scores back to the Opik trace if one was recorded in the MEP
-    opik_trace_id = mep.get("opik_trace_id")
-    if opik_trace_id:
+    # Log all scores back to the Langfuse trace if one was recorded in the MEP
+    lf_trace_id = mep.get("lf_trace_id")
+    if lf_trace_id:
         client = get_client()
         if client:
             score_keys = ["answer_accuracy", "latency_sec"] + (
                 [f"judge_{k}" for k in judge_scores] if use_judge else []
             )
-            scores = {k: metrics[k] for k in score_keys if isinstance(metrics.get(k), (int, float))}
-            with contextlib.suppress(Exception):
-                client.log_traces_feedback_scores(
-                    [{"id": opik_trace_id, "name": k, "value": float(v)} for k, v in scores.items()]
-                )
+            scores = {
+                k: metrics[k]
+                for k in score_keys
+                if isinstance(metrics.get(k), (int, float))
+            }
+            for k, v in scores.items():
+                with contextlib.suppress(Exception):
+                    client.create_score(trace_id=lf_trace_id, name=k, value=float(v))
 
     return metrics
 
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
index 0584a6c..9a51a7e 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
@@ -15,6 +15,7 @@
 
 import argparse
 import base64
+import contextlib
 import json
 import os
 from pathlib import Path
@@ -24,6 +25,7 @@
 from google import genai
 from openai import OpenAI
 
+from ..langfuse_integration.client import get_client
 from ..mep.writer import iter_meps
 from ..utils.json_strict import parse_strict
 from .eval_outputs import score_answer_accuracy
@@ -209,6 +211,8 @@ def main() -> None:
 
     api_key = os.environ.get("OPENAI_API_KEY", "") if args.backend == "openai" else os.environ.get("GEMINI_API_KEY", "")
 
+    lf_client = get_client()
+
     with open(args.out, "w") as f_out:
         count = 0
         for mep in iter_meps(args.mep_dir):
@@ -228,7 +232,22 @@ def main() -> None:
                 cands = result["topk_candidates"]
                 h1 = result.get("hit_at_1", 0)
                 h3 = result.get(f"hit_at_{args.k}", 0)
-                print(f"  {sid}  exp={exp!r}  candidates={cands}  hit@1={h1}  hit@{args.k}={h3}")
+                print(
+                    f"  {sid}  exp={exp!r}  candidates={cands}  hit@1={h1}  hit@{args.k}={h3}"
+                )
+
+                lf_trace_id = mep.get("lf_trace_id")
+                if lf_client and lf_trace_id:
+                    for ki in range(1, args.k + 1):
+                        key = f"hit_at_{ki}"
+                        if key in result:
+                            with contextlib.suppress(Exception):
+                                lf_client.create_score(
+                                    trace_id=lf_trace_id,
+                                    name=key,
+                                    value=float(result[key]),
+                                )
+                                
                 count += 1
             except Exception as exc:
                 print(f"  Error: {exc}")
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
new file mode 100644
index 0000000..32d60f4
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
@@ -0,0 +1 @@
+"""Langfuse observability integration — tracing, prompt versioning, dataset registration."""
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
new file mode 100644
index 0000000..5acfa2e
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
@@ -0,0 +1,56 @@
+"""Langfuse client singleton with graceful degradation.
+
+Returns None when LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set or
+langfuse is not installed, so every caller can guard with ``if client:``.
+"""
+
+import os
+
+
+_client = None
+_initialised = False
+
+
+def get_client():
+    """Return a configured langfuse.Langfuse() instance, or None if unavailable."""
+    global _client, _initialised  # noqa: PLW0603
+    if _initialised:
+        return _client
+
+    _initialised = True
+
+    try:
+        from dotenv import load_dotenv
+
+        load_dotenv()
+    except ImportError:
+        pass
+
+    public_key = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
+    secret_key = os.environ.get("LANGFUSE_SECRET_KEY", "")
+
+    if not public_key or not secret_key:
+        return None
+
+    try:
+        from langfuse import Langfuse
+
+        kwargs: dict = {"public_key": public_key, "secret_key": secret_key}
+        # Accept LANGFUSE_HOST or LANGFUSE_BASE_URL (both are common)
+        host = os.environ.get("LANGFUSE_HOST") or os.environ.get("LANGFUSE_BASE_URL", "")
+        if host:
+            kwargs["host"] = host
+
+        _client = Langfuse(**kwargs)
+    except Exception as exc:
+        print(f"[langfuse] client init failed: {exc}")
+        _client = None
+
+    return _client
+
+
+def reset_client() -> None:
+    """Force re-initialisation on next call (useful for tests)."""
+    global _client, _initialised  # noqa: PLW0603
+    _client = None
+    _initialised = False
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
new file mode 100644
index 0000000..ab0f1b7
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
@@ -0,0 +1,69 @@
+"""Register ChartQAPro samples as a Langfuse Dataset.
+
+Usage:
+    python -m agentic_chartqapro_eval.langfuse_integration.dataset \
+        --split test --n 25
+"""
+
+import argparse
+from typing import Optional
+
+from .client import get_client
+
+
+def register_dataset(
+    samples,
+    dataset_name: str = "ChartQAPro",
+    split: str = "test",
+) -> Optional[str]:
+    """Upsert PerceivedSamples into a Langfuse Dataset named ``{dataset_name}_{split}``.
+
+    Returns the dataset name, or None if Langfuse is unavailable or registration failed.
+    """
+    client = get_client()
+    if client is None:
+        return None
+    name = f"{dataset_name}_{split}"
+    try:
+        client.create_dataset(name=name)
+        for s in samples:
+            client.create_dataset_item(
+                dataset_name=name,
+                id=f"{name}:{s.sample_id}",  # stable id: re-runs upsert instead of duplicating
+                input={
+                    "source_id": s.sample_id,
+                    "question": s.question,
+                    "question_type": s.question_type.value,
+                    "image_path": s.image_path or "",
+                    "choices": s.choices or [],
+                },
+                expected_output=s.expected_output,
+            )
+        print(f"[langfuse] Registered {len(samples)} samples → dataset '{name}'")
+        return name
+    except Exception as exc:
+        print(f"[langfuse] Dataset registration failed: {exc}")
+        return None
+
+
+def main() -> None:
+    """CLI entry point: load ChartQAPro samples and register them as a Langfuse dataset."""
+    parser = argparse.ArgumentParser(
+        description="Register ChartQAPro samples as Langfuse dataset"
+    )
+    parser.add_argument("--split", default="test")
+    parser.add_argument("--n", type=int, default=25)
+    parser.add_argument("--image_dir", default="data/chartqapro_images")
+    parser.add_argument("--cache_dir", default=None)
+    args = parser.parse_args()
+
+    from ..datasets.chartqapro_loader import load_chartqapro  # imported only when the CLI runs
+
+    samples = load_chartqapro(
+        split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir
+    )
+    register_dataset(samples, split=args.split)  # dataset_name keeps its "ChartQAPro" default
+
+
+if __name__ == "__main__":
+    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
new file mode 100644
index 0000000..2ec4283
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
@@ -0,0 +1,174 @@
+"""Retroactive ingestion: convert existing MEP JSON files to Langfuse Traces.
+
+This lets you visualise runs that completed before Langfuse was wired in.
+
+Usage:
+    python -m agentic_chartqapro_eval.langfuse_integration.ingest \
+        --mep_dir meps/openai_openai/chartqapro/test \
+        [--metrics_file metrics.jsonl]
+"""
+
+import argparse
+import contextlib
+import json
+from pathlib import Path
+from typing import Optional
+
+from .client import get_client
+from .tracing import _normalize_usage
+
+
+def ingest_mep(
+    mep: dict,
+    client: object,
+    metrics: Optional[dict] = None,
+    project_name: str = "chartqapro-eval",  # noqa: ARG001 — kept for API compat
+) -> None:
+    """Replay one stored MEP dict as a Langfuse trace: root span, generations, then scores."""
+    sample = mep.get("sample", {})
+    plan = mep.get("plan", {})
+    vision = mep.get("vision", {})
+    config = mep.get("config", {})
+
+    sample_id = sample.get("sample_id", "unknown")
+    config_name = config.get("config_name", "unknown")
+    question_type = sample.get("question_type", "standard")
+    question = sample.get("question", "")
+    expected = sample.get("expected_output", "")
+    vision_parsed = vision.get("parsed", {})
+
+    with client.start_as_current_observation(  # type: ignore[union-attr]
+        name=f"chartqapro/{sample_id}",
+        as_type="span",
+        input={"question": question, "expected_output": expected},
+        output=vision_parsed if vision_parsed else None,
+        metadata={
+            "run_id": mep.get("run_id", ""),
+            "config": config_name,
+            "question_type": question_type,
+            "schema_version": mep.get("schema_version", ""),
+            "has_errors": bool(mep.get("errors")),
+            "retroactive": True,
+        },
+    ) as trace_span:
+        # Planner generation — only emitted when the MEP stored a planner prompt
+        if plan.get("prompt"):
+            planner_gen = trace_span.start_observation(
+                name="planner",
+                as_type="generation",
+                input={"prompt": plan.get("prompt", "")},
+                model=config.get("planner_model", ""),
+                metadata={"backend": config.get("planner_backend", "")},
+            )
+            planner_gen.update(
+                output={
+                    "plan": plan.get("parsed", {}),
+                    "parse_error": plan.get("parse_error", False),
+                }
+            )
+            planner_gen.end()
+
+        # Vision tool generations — one per ToolTrace entry (each gets the final parsed vision output)
+        for tt in vision.get("tool_trace", []):
+            usage = tt.get("provider_metadata", {}).get("usage", {})
+            tool_gen = trace_span.start_observation(
+                name="vision_qa_tool",
+                as_type="generation",
+                input={
+                    "question": question,
+                    "plan_steps": plan.get("parsed", {}).get("steps", []),
+                },
+                model=tt.get("model", config.get("vision_model", "")),
+                metadata={
+                    "backend": tt.get("backend", config.get("vision_backend", "")),
+                    "elapsed_ms": tt.get("elapsed_ms"),
+                },
+                usage_details=_normalize_usage(usage) if usage else None,
+            )
+            tool_gen.update(output=vision_parsed if vision_parsed else None)
+            tool_gen.end()
+
+        # Attach evaluation scores if provided; missing or non-numeric metrics are skipped
+        if metrics:
+            for key in [
+                "answer_accuracy",
+                "judge_explanation_quality",
+                "judge_hallucination_rate",
+                "judge_plan_coverage",
+                "judge_plan_adherence",
+                "judge_faithfulness_alignment",
+            ]:
+                if key in metrics and isinstance(metrics[key], (int, float)):
+                    with contextlib.suppress(Exception):  # best-effort: a failed score is ignored
+                        trace_span.score_trace(name=key, value=float(metrics[key]))
+
+
+def ingest_dir(
+    mep_dir: str,
+    metrics_file: Optional[str] = None,
+    project_name: str = "chartqapro-eval",
+) -> int:
+    """Ingest every ``*.json`` MEP in ``mep_dir``; return how many were ingested without error."""
+    client = get_client()
+    if client is None:
+        print("[langfuse] No client — set LANGFUSE_PUBLIC_KEY and LANGFUSE_SECRET_KEY")
+        return 0
+
+    metrics_by_id: dict = {}
+    if metrics_file and Path(metrics_file).exists():
+        with open(metrics_file, encoding="utf-8") as f:  # JSONL is UTF-8; don't rely on the locale
+            for raw_line in f:
+                line = raw_line.strip()
+                if line:
+                    row = json.loads(line)
+                    metrics_by_id[row.get("sample_id", "")] = row
+
+    mep_path = Path(mep_dir)
+    mep_files = list(mep_path.glob("*.json"))
+    if not mep_files:
+        print(f"[langfuse] No MEP JSON files found in {mep_dir}")
+        return 0
+
+    count = 0
+    for fpath in sorted(mep_files):
+        try:
+            mep = json.loads(fpath.read_text(encoding="utf-8"))
+            sample_id = mep.get("sample", {}).get("sample_id", "")
+            ingest_mep(
+                mep,
+                client,
+                metrics=metrics_by_id.get(sample_id),
+                project_name=project_name,
+            )
+            count += 1
+            print(f"  ingested {sample_id}")
+        except Exception as exc:
+            print(f"  ERROR {fpath.name}: {exc}")
+
+    print(f"[langfuse] Ingested {count}/{len(mep_files)} MEPs from {mep_dir}")
+    with contextlib.suppress(Exception):
+        client.flush()  # type: ignore[union-attr]
+    return count
+
+
+def main() -> None:
+    """CLI entry point: parse arguments and ingest a directory of MEP files into Langfuse."""
+    parser = argparse.ArgumentParser(description="Ingest existing MEPs into Langfuse")
+    parser.add_argument(
+        "--mep_dir", required=True, help="Directory containing MEP JSON files"
+    )
+    parser.add_argument(
+        "--metrics_file",
+        default=None,
+        help="Optional metrics.jsonl for feedback scores",
+    )
+    parser.add_argument(
+        "--project", default="chartqapro-eval", help="Langfuse project name (metadata)"
+    )
+    args = parser.parse_args()
+
+    ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project)  # returned count is unused
+
+
+if __name__ == "__main__":
+    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
new file mode 100644
index 0000000..ec81d02
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
@@ -0,0 +1,81 @@
+"""Versioned prompt loading via Langfuse Prompt Management.
+
+Usage:
+    # Load prompt (falls back to file if Langfuse unavailable)
+    text = get_prompt("planner_prompt", PLANNER_PROMPT_PATH)
+
+    # Push current prompt files to Langfuse (run once before a new experiment)
+    python -m agentic_chartqapro_eval.langfuse_integration.prompts
+"""
+
+import argparse
+from pathlib import Path
+from typing import Optional
+
+from .client import get_client
+
+
+# Prompt names as stored in Langfuse Prompt Management
+PLANNER_PROMPT_NAME = "chartqapro_planner"
+VISION_PROMPT_NAME = "chartqapro_vision"
+
+
+def get_prompt(name: str, fallback_path: Path) -> str:
+    """Return the latest prompt version from Langfuse, or the file contents as a fallback."""
+    client = get_client()
+    if client:
+        try:
+            prompt = client.get_prompt(name=name, label="latest")  # SDK default is "production"
+            if prompt:
+                return prompt.compile()
+        except Exception as exc:
+            print(f"[langfuse] Prompt '{name}' unavailable ({exc}); using {fallback_path}")
+    return fallback_path.read_text()
+
+
+def push_prompts(
+    planner_path: Optional[Path] = None,
+    vision_path: Optional[Path] = None,
+) -> None:
+    """Create a Langfuse text prompt from the planner and vision prompt files.
+
+    Missing files are reported and skipped; a failed upload does not stop the other one.
+    """
+    lf = get_client()
+    if lf is None:
+        print("[langfuse] No client — skipping prompt push")
+        return
+    default_dir = Path(__file__).parents[1] / "agents" / "prompts"
+    uploads = {
+        PLANNER_PROMPT_NAME: planner_path or (default_dir / "planner.txt"),
+        VISION_PROMPT_NAME: vision_path or (default_dir / "vision.txt"),
+    }
+    for prompt_name, prompt_file in uploads.items():
+        if prompt_file.exists():
+            body = prompt_file.read_text()
+            try:
+                lf.create_prompt(name=prompt_name, prompt=body, type="text")
+                print(f"[langfuse] Pushed prompt '{prompt_name}'")
+            except Exception as exc:
+                print(f"[langfuse] Failed to push prompt '{prompt_name}': {exc}")
+        else:
+            print(f"[langfuse] Prompt file not found: {prompt_file}")
+
+
+def main() -> None:
+    """CLI entry point: push the planner/vision prompt files to Langfuse Prompt Management."""
+    cli = argparse.ArgumentParser(description="Push prompt files to Langfuse Prompt Management")
+    cli.add_argument("--planner", default=None, help="Path to planner.txt")
+    cli.add_argument("--vision", default=None, help="Path to vision.txt")
+    opts = cli.parse_args()
+
+    # An omitted (or empty) flag is forwarded as None rather than as a Path.
+    planner_file = Path(opts.planner) if opts.planner else None
+    vision_file = Path(opts.vision) if opts.vision else None
+
+    # Hand both (possibly None) paths to the uploader.
+    push_prompts(planner_path=planner_file, vision_path=vision_file)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
new file mode 100644
index 0000000..7a8a84c
--- /dev/null
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
@@ -0,0 +1,157 @@
+"""Lightweight wrappers around Langfuse v4 observations for the MEP pipeline.
+
+All helpers accept ``None`` as the client/trace and become no-ops, so the
+rest of the codebase can call them unconditionally.
+"""
+
+import contextlib
+from contextlib import contextmanager
+from typing import Optional
+
+
+def _normalize_usage(usage: dict) -> dict:
+    """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys.
+
+    Per field the first alias present wins (the original OpenAI/Langfuse keys come first).
+    If no known key is found, ``usage`` is returned unchanged.
+    """
+    aliases = {
+        "input": ("prompt_tokens", "input", "input_tokens", "prompt_token_count"),
+        "output": ("completion_tokens", "output", "output_tokens", "candidates_token_count"),
+        "total": ("total_tokens", "total", "total_token_count"),
+    }
+    normalized: dict = {}
+    for target, keys in aliases.items():
+        found = next((k for k in keys if k in usage), None)
+        if found is not None:
+            normalized[target] = usage[found]
+    return normalized or usage
+
+
+class _TraceHandle:
+    """Thin wrapper yielded by sample_trace; exposes a stable interface across callers.
+
+    Attributes
+    ----------
+    id : str | None
+        The Langfuse trace ID, usable for attaching scores after the run.
+    """
+
+    def __init__(self, span: object, trace_id: Optional[str]) -> None:
+        self._span = span  # root span; open_llm_span() reads this attribute via getattr
+        self.id = trace_id
+
+    def update(self, **kwargs: object) -> None:
+        """Update the root trace span (e.g. set output after the run); SDK errors are suppressed."""
+        if self._span is not None:
+            with contextlib.suppress(Exception):
+                self._span.update(**kwargs)  # type: ignore[union-attr]
+
+    def score_trace(self, name: str, value: float) -> None:
+        """Attach a numeric score to the root trace; SDK errors are suppressed."""
+        if self._span is not None:
+            with contextlib.suppress(Exception):
+                self._span.score_trace(name=name, value=value)  # type: ignore[union-attr]
+
+
+@contextmanager
+def sample_trace(
+    client: object,
+    sample_id: str,
+    question: str,
+    expected_output: str,
+    question_type: str,
+    config_name: str,
+    run_id: str,
+    project_name: str = "chartqapro-eval",
+):  # type: ignore[return]
+    """Open a root Langfuse span for one sample; yield a _TraceHandle (None when client is None)."""
+    del project_name  # kept for API compatibility; Langfuse v4 uses project from SDK config
+    if client is None:
+        yield None
+        return
+
+    from langfuse import propagate_attributes  # imported only once a client exists
+
+    with client.start_as_current_observation(  # type: ignore[union-attr]
+        name=f"chartqapro/{sample_id}",
+        as_type="span",
+        input={"question": question, "expected_output": expected_output},
+        metadata={
+            "run_id": run_id,
+            "config": config_name,
+            "question_type": question_type,
+        },
+    ) as span:
+        with propagate_attributes(session_id=run_id):  # run_id doubles as the session id
+            trace_id = client.get_current_trace_id()  # type: ignore[union-attr]
+            handle = _TraceHandle(span=span, trace_id=trace_id)
+            try:
+                yield handle
+            finally:  # flush after every sample, whether or not the caller's body raised
+                with contextlib.suppress(Exception):
+                    client.flush()  # type: ignore[union-attr]
+
+
+def open_llm_span(
+    trace: object,
+    name: str,
+    input_data: dict,
+    model: str,
+    metadata: Optional[dict] = None,
+    parent_span_id: Optional[str] = None,
+) -> object:
+    """Start a ``generation`` observation under the trace's root span.
+
+    Returns None when ``trace`` is None, has no ``_span``, or the SDK call raises.
+    ``parent_span_id`` is ignored; it is accepted only for API compatibility.
+    """
+    del parent_span_id  # kept for API compatibility; v4 uses contextual nesting
+    root = None if trace is None else getattr(trace, "_span", None)
+    if root is None:
+        return None
+    request = {
+        "name": name,
+        "as_type": "generation",
+        "input": input_data,
+        "model": model,
+        "metadata": metadata or {},
+    }
+    try:
+        return root.start_observation(**request)  # type: ignore[union-attr]
+    except Exception:
+        return None
+
+
+def close_span(
+    span: object,
+    output: Optional[dict] = None,
+    usage: Optional[dict] = None,
+    error: Optional[str] = None,
+) -> None:
+    """Record output/usage/error on a Langfuse generation, then end it (no-op if span is None)."""
+    if span is None:
+        return
+    update_kwargs: dict = {}
+    if output is not None:
+        update_kwargs["output"] = output
+    if error:
+        update_kwargs.update(level="ERROR", status_message=error)
+    with contextlib.suppress(Exception):
+        if usage:
+            update_kwargs["usage_details"] = _normalize_usage(usage)
+        if update_kwargs:
+            span.update(**update_kwargs)  # type: ignore[union-attr]
+    with contextlib.suppress(Exception):  # separate block: end() must run even if update() failed
+        span.end()  # type: ignore[union-attr]
+
+
+def log_trace_scores(trace: object, scores: dict) -> None:
+    """Attach every numeric entry of ``scores`` to the trace; non-numeric values are skipped."""
+    if trace is None:
+        return
+    numeric = ((key, val) for key, val in scores.items() if isinstance(val, (int, float)))
+    for key, val in numeric:
+        with contextlib.suppress(Exception):
+            if hasattr(trace, "score_trace"):
+                trace.score_trace(name=key, value=float(val))  # type: ignore[union-attr]
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/mep/schema.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/mep/schema.py
index f4eba74..1ae7a3e 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/mep/schema.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/mep/schema.py
@@ -157,7 +157,7 @@ class MEP:
     verifier: Optional[MEPVerifier] = None  # Pass 2.5 — None when skipped
     timestamps: Optional[MEPTimestamps] = None
     errors: List[str] = field(default_factory=list)
-    opik_trace_id: Optional[str] = None  # set when Opik tracing is active
+    lf_trace_id: Optional[str] = None  # set when Langfuse tracing is active
 
     def to_dict(self) -> dict:
         """Return a dict representation suitable for JSON serialization."""
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
index 6940c24..e3eca3b 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
@@ -37,10 +37,10 @@
     MEPVision,
 )
 from ..mep.writer import write_mep
-from ..opik_integration.client import get_client
-from ..opik_integration.dataset import register_dataset
-from ..opik_integration.prompts import push_prompts
-from ..opik_integration.tracing import (
+from ..langfuse_integration.client import get_client
+from ..langfuse_integration.dataset import register_dataset
+from ..langfuse_integration.prompts import push_prompts
+from ..langfuse_integration.tracing import (
     log_trace_scores,
     sample_trace,
 )
@@ -114,7 +114,7 @@ def process_sample(  # noqa: PLR0915
     config: dict,
     run_id: str,
     out_dir: str,
-    opik_client=None,
+    lf_client=None,
     verifier_agent: Optional[VerifierAgent] = None,
     ocr_tool: Optional[OcrReaderTool] = None,
 ) -> str:
@@ -155,15 +155,15 @@ def process_sample(  # noqa: PLR0915
     errors: list = []
 
     with sample_trace(
-        opik_client,
+        lf_client,
         sample_id=sample.sample_id,
         question=sample.question,
         expected_output=sample.expected_output,
         question_type=sample.question_type.value,
         config_name=config_name,
         run_id=run_id,
-    ) as opik_trace:
-        opik_trace_id = getattr(opik_trace, "id", None)
+    ) as lf_trace:
+        lf_trace_id = getattr(lf_trace, "id", None)
 
         # ---- Planner ----
         plan_prompt = ""
@@ -174,7 +174,8 @@ def process_sample(  # noqa: PLR0915
 
         try:
             with timed() as pt:
-                plan_prompt, plan_parsed, plan_parse_error, plan_raw = planner.run(sample, opik_trace=opik_trace)
+                plan_prompt, plan_parsed, plan_parse_error, plan_raw = planner.run(sample, lf_trace=lf_trace)
+
             plan_ms = pt.elapsed_ms
         except Exception as exc:
             errors.append(f"planner_error: {exc}")
@@ -192,7 +193,7 @@ def process_sample(  # noqa: PLR0915
 
         if ocr_tool is not None:
             try:
-                ocr_tool.opik_trace = opik_trace
+                ocr_tool.lf_trace = lf_trace
                 with timed() as ot:
                     ocr_raw = ocr_tool._run(sample.image_path)
                 ocr_ms = ot.elapsed_ms
@@ -225,7 +226,7 @@ def process_sample(  # noqa: PLR0915
                 ) = vision_agent.run(
                     sample,
                     plan_parsed,
-                    opik_trace=opik_trace,
+                    lf_trace=lf_trace,
                     ocr_result=ocr_parsed if ocr_parsed else None,
                 )
             vision_ms = vt.elapsed_ms
@@ -251,7 +252,8 @@ def process_sample(  # noqa: PLR0915
                         verifier_parsed,
                         verifier_parse_error,
                         verifier_raw,
-                    ) = verifier_agent.run(sample, plan_parsed, vision_parsed, opik_trace=opik_trace)
+                    ) = verifier_agent.run(sample, plan_parsed, vision_parsed, lf_trace=lf_trace)
+
                 verifier_ms = vrt.elapsed_ms
                 verifier_verdict = verifier_parsed.get("verdict", "confirmed")
             except Exception as exc:
@@ -331,20 +333,20 @@ def process_sample(  # noqa: PLR0915
                 verifier_ms=verifier_ms,
             ),
             errors=errors,
-            opik_trace_id=opik_trace_id,
+            lf_trace_id=lf_trace_id,
         )
 
-        # ---- Immediately log available scores to Opik ----
+        # ---- Immediately log available scores to Langfuse ----
         log_trace_scores(
-            opik_trace,
+            lf_trace,
             {
                 "planner_parse_ok": float(not plan_parse_error),
                 "vision_parse_ok": float(not vision_parse_error),
                 "has_errors": float(bool(errors)),
             },
         )
-        if opik_trace:
-            opik_trace.end(output=vision_parsed if vision_parsed else None)
+        if lf_trace:
+            lf_trace.update(output=vision_parsed if vision_parsed else None)
 
     return write_mep(mep, out_dir)
 
@@ -434,14 +436,14 @@ def main() -> None:  # noqa: PLR0912, PLR0915
     print(f"Output dir       : {out_dir}")
     print(f"Workers          : {args.workers}")
 
-    # Opik: register dataset + version prompts at run start (no-ops if unavailable)
-    opik_client = get_client()
-    if opik_client:
-        print("Opik             : enabled")
+    # Langfuse: register dataset + version prompts at run start (no-ops if unavailable)
+    lf_client = get_client()
+    if lf_client:
+        print("Langfuse         : enabled")
         register_dataset(samples, split=args.split)
         push_prompts()
     else:
-        print("Opik             : not configured (set OPIK_URL_OVERRIDE to enable)")
+        print("Langfuse         : not configured (set LANGFUSE_PUBLIC_KEY + LANGFUSE_SECRET_KEY to enable)")
 
     # Build agents once — run() creates fresh Crew/Tool per call so this is thread-safe
     print("Initialising agents …")
@@ -480,7 +482,7 @@ def main() -> None:  # noqa: PLR0912, PLR0915
                     config,
                     run_id,
                     out_dir,
-                    opik_client,
+                    lf_client,
                     verifier,
                     ocr,
                 )
@@ -498,7 +500,7 @@ def main() -> None:  # noqa: PLR0912, PLR0915
                     config,
                     run_id,
                     out_dir,
-                    opik_client,
+                    lf_client,
                     verifier,
                     ocr,
                 ): s
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
index aa58680..41677dd 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
@@ -18,7 +18,7 @@
 from openai import OpenAI
 from pydantic import BaseModel, Field, PrivateAttr
 
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 
 
 _OCR_PROMPT = """\
@@ -77,7 +77,7 @@ class OcrReaderTool(BaseTool):
     backend: str = "gemini"
     model: str = "gemini-2.5-flash-lite"
     api_key: str = ""
-    opik_trace: Optional[Any] = None
+    lf_trace: Optional[Any] = None
 
     _traces: list = PrivateAttr(default_factory=list)
 
@@ -116,7 +116,7 @@ def _run(self, image_path: str) -> str:
         t0 = time.time()
 
         opik_span = open_llm_span(
-            self.opik_trace,
+            self.lf_trace,
             name="ocr_reader_tool",
             input_data={"image_path": image_path},
             model=self.model,
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py
index b68ffec..01c3c9c 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/vision_qa_tool.py
@@ -18,7 +18,7 @@
 from openai import OpenAI
 from pydantic import BaseModel, Field, PrivateAttr
 
-from ..opik_integration.tracing import close_span, open_llm_span
+from ..langfuse_integration.tracing import close_span, open_llm_span
 
 
 class VisionQAInput(BaseModel):
@@ -46,7 +46,7 @@ class VisionQATool(BaseTool):
     backend: str = "gemini"  # "openai" | "gemini"
     model: str = "gemini-2.5-flash-lite"
     api_key: str = ""
-    opik_trace: Optional[Any] = None  # Opik Trace object for span creation
+    lf_trace: Optional[Any] = None  # Langfuse Trace object for span creation
 
     # Private mutable trace storage (not a Pydantic field)
     _traces: list = PrivateAttr(default_factory=list)
@@ -100,8 +100,8 @@ def _run(
         start_ts = datetime.now(timezone.utc).isoformat()
         t0 = time.time()
 
-        opik_span = open_llm_span(
-            self.opik_trace,
+        lf_span = open_llm_span(
+            self.lf_trace,
             name="vision_qa_tool",
             input_data={
                 "image_path": image_path,
@@ -133,7 +133,7 @@ def _run(
         usage = provider_meta.get("usage", {})
 
         close_span(
-            opik_span,
+            lf_span,
             output={"raw_text": raw_text},
             usage=usage if usage else None,
             error=error_str,
diff --git a/pyproject.toml b/pyproject.toml
index 75916cf..6e9500a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,7 +30,6 @@ dependencies = [
 agentic-xai-eval = [
     "crewai>=1.6.1",
     "google-genai>=1.67.0",
-    "opik>=1.10.40",
     "streamlit>=1.55.0",
 ]
 dev = [
@@ -182,4 +181,4 @@ requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-where = ["implementations/agentic_vqa_eval/src"]
+where = ["implementations/agentic_vqa_eval/src"]
\ No newline at end of file

From baaac24304d613b464f4b8721a01f08f51a212e8 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Wed, 18 Mar 2026 18:36:04 -0400
Subject: [PATCH 2/9] Refactor observability integration: remove Opik
 references, streamline Langfuse integration, and tidy up code formatting

---
 .../agents/planner_agent.py                   |   4 +-
 .../eval/error_taxonomy.py                    |   2 +-
 .../eval/eval_outputs.py                      |   8 +-
 .../agentic_chartqapro_eval/eval/eval_topk.py |   6 +-
 .../langfuse_integration/__init__.py          |   2 +-
 .../langfuse_integration/client.py            |  33 ++-
 .../langfuse_integration/dataset.py           |  44 +++-
 .../langfuse_integration/ingest.py            |   8 +-
 .../langfuse_integration/prompts.py           |   4 +-
 .../langfuse_integration/tracing.py           | 126 +++++++--
 .../opik_integration/__init__.py              |   1 -
 .../opik_integration/client.py                |  74 ------
 .../opik_integration/dataset.py               |  85 -------
 .../opik_integration/ingest.py                | 240 ------------------
 .../opik_integration/prompts.py               |  80 ------
 .../opik_integration/tracing.py               | 177 -------------
 .../runner/run_generate_meps.py               |  14 +-
 .../mechanistic_interpretability/README.md    |   1 -
 pyproject.toml                                |   2 +-
 19 files changed, 175 insertions(+), 736 deletions(-)
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py
 delete mode 100644 implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
index 71d554f..5e34591 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
@@ -137,9 +137,7 @@ def __init__(
         self.api_key = api_key
         self._llm = _build_llm(backend, model, api_key)
 
-    def run(
-        self, sample: PerceivedSample, lf_trace: Any = None
-    ) -> Tuple[str, dict, bool, str]:
+    def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict, bool, str]:
         """
         Execute the planning phase for a new question.
 
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
index 78b357b..554a248 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/error_taxonomy.py
@@ -23,8 +23,8 @@
 from google import genai
 from openai import OpenAI
 
-from ..mep.writer import iter_meps
 from ..langfuse_integration.client import get_client
+from ..mep.writer import iter_meps
 from ..utils.json_strict import parse_strict
 
 
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
index bb1a412..711998e 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_outputs.py
@@ -16,8 +16,8 @@
 
 from dotenv import load_dotenv
 
-from ..mep.writer import iter_meps
 from ..langfuse_integration.client import get_client
+from ..mep.writer import iter_meps
 from .judge import judge_mep
 
 
@@ -145,11 +145,7 @@ def evaluate_mep(
             score_keys = ["answer_accuracy", "latency_sec"] + (
                 [f"judge_{k}" for k in judge_scores] if use_judge else []
             )
-            scores = {
-                k: metrics[k]
-                for k in score_keys
-                if isinstance(metrics.get(k), (int, float))
-            }
+            scores = {k: metrics[k] for k in score_keys if isinstance(metrics.get(k), (int, float))}
             for k, v in scores.items():
                 with contextlib.suppress(Exception):
                     client.create_score(trace_id=lf_trace_id, name=k, value=float(v))
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
index 9a51a7e..99df7b6 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/eval_topk.py
@@ -232,9 +232,7 @@ def main() -> None:
                 cands = result["topk_candidates"]
                 h1 = result.get("hit_at_1", 0)
                 h3 = result.get(f"hit_at_{args.k}", 0)
-                print(
-                    f"  {sid}  exp={exp!r}  candidates={cands}  hit@1={h1}  hit@{args.k}={h3}"
-                )
+                print(f"  {sid}  exp={exp!r}  candidates={cands}  hit@1={h1}  hit@{args.k}={h3}")
 
                 lf_trace_id = mep.get("lf_trace_id")
                 if lf_client and lf_trace_id:
@@ -247,7 +245,7 @@ def main() -> None:
                                     name=key,
                                     value=float(result[key]),
                                 )
-                                
+
                 count += 1
             except Exception as exc:
                 print(f"  Error: {exc}")
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
index 32d60f4..83aa60e 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/__init__.py
@@ -1 +1 @@
-"""Langfuse observability integration — tracing, prompt versioning, dataset registration."""
+"""Langfuse observability integration."""
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
index 5acfa2e..c725ce9 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
@@ -5,6 +5,10 @@
 """
 
 import os
+from contextlib import suppress
+
+from dotenv import load_dotenv
+from langfuse import Langfuse
 
 
 _client = None
@@ -12,19 +16,26 @@
 
 
 def get_client():
-    """Return a configured langfuse.Langfuse() instance, or None if unavailable."""
+    """
+    Initialize and return a globally cached Langfuse client.
+
+    Reads ``LANGFUSE_PUBLIC_KEY`` and ``LANGFUSE_SECRET_KEY`` (plus an optional
+    ``LANGFUSE_HOST`` / ``LANGFUSE_BASE_URL``) from the environment.
+
+    Returns
+    -------
+    Langfuse or None
+        An active client, or None if configuration is missing or invalid.
+    """
     global _client, _initialised  # noqa: PLW0603
     if _initialised:
         return _client
 
     _initialised = True
 
-    try:
-        from dotenv import load_dotenv
-
+    # Load environment variables from .env file
+    with suppress(Exception):
         load_dotenv()
-    except ImportError:
-        pass
 
     public_key = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
     secret_key = os.environ.get("LANGFUSE_SECRET_KEY", "")
@@ -33,8 +44,6 @@ def get_client():
         return None
 
     try:
-        from langfuse import Langfuse
-
         kwargs: dict = {"public_key": public_key, "secret_key": secret_key}
         # Accept LANGFUSE_HOST or LANGFUSE_BASE_URL (both are common)
         host = os.environ.get("LANGFUSE_HOST") or os.environ.get("LANGFUSE_BASE_URL", "")
@@ -50,7 +59,13 @@ def get_client():
 
 
 def reset_client() -> None:
-    """Force re-initialisation on next call (useful for tests)."""
+    """
+    Clear the cached client and reset initialization state.
+
+    Returns
+    -------
+    None
+    """
     global _client, _initialised  # noqa: PLW0603
     _client = None
     _initialised = False
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
index ab0f1b7..d0d0c47 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/dataset.py
@@ -8,6 +8,7 @@
 import argparse
 from typing import Optional
 
+from ..datasets.chartqapro_loader import load_chartqapro
 from .client import get_client
 
 
@@ -16,9 +17,24 @@ def register_dataset(
     dataset_name: str = "ChartQAPro",
     split: str = "test",
 ) -> Optional[str]:
-    """Insert PerceivedSamples into a Langfuse Dataset named ``{dataset_name}_{split}``.
+    """
+    Upload a collection of samples as a Langfuse Dataset.
+
+    Allows for versioned dataset management and evaluation in the Langfuse UI.
 
-    Returns the dataset name, or None if Langfuse is unavailable.
+    Parameters
+    ----------
+    samples : list of PerceivedSample
+        The data samples to register.
+    dataset_name : str, default 'ChartQAPro'
+        The base name for the dataset.
+    split : str, default 'test'
+        The split identifier (e.g., 'train', 'val').
+
+    Returns
+    -------
+    str or None
+        The name of the created dataset if successful, else None.
     """
     client = get_client()
     if client is None:
@@ -27,11 +43,11 @@ def register_dataset(
     name = f"{dataset_name}_{split}"
     try:
         client.create_dataset(name=name)
-        for s in samples:
+        [
             client.create_dataset_item(
                 dataset_name=name,
                 input={
-                    "source_id": s.sample_id,
+                    "source_id": s.sample_id,  # stored as a data field; Langfuse assigns its own item id
                     "question": s.question,
                     "question_type": s.question_type.value,
                     "image_path": s.image_path or "",
@@ -39,6 +55,8 @@ def register_dataset(
                 },
                 expected_output=s.expected_output,
             )
+            for s in samples
+        ]
         print(f"[langfuse] Registered {len(samples)} samples → dataset '{name}'")
         return name
     except Exception as exc:
@@ -47,21 +65,21 @@ def register_dataset(
 
 
 def main() -> None:
-    """Register ChartQAPro dataset samples in Langfuse."""
-    parser = argparse.ArgumentParser(
-        description="Register ChartQAPro samples as Langfuse dataset"
-    )
+    """
+    Command-line interface for registering ChartQAPro datasets.
+
+    Returns
+    -------
+    None
+    """
+    parser = argparse.ArgumentParser(description="Register ChartQAPro samples as Langfuse dataset")
     parser.add_argument("--split", default="test")
     parser.add_argument("--n", type=int, default=25)
     parser.add_argument("--image_dir", default="data/chartqapro_images")
     parser.add_argument("--cache_dir", default=None)
     args = parser.parse_args()
 
-    from ..datasets.chartqapro_loader import load_chartqapro
-
-    samples = load_chartqapro(
-        split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir
-    )
+    samples = load_chartqapro(split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir)
     register_dataset(samples, split=args.split)
 
 
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
index 2ec4283..c6d153b 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/ingest.py
@@ -154,17 +154,13 @@ def ingest_dir(
 def main() -> None:
     """Parse CLI arguments and ingest MEP files into Langfuse."""
     parser = argparse.ArgumentParser(description="Ingest existing MEPs into Langfuse")
-    parser.add_argument(
-        "--mep_dir", required=True, help="Directory containing MEP JSON files"
-    )
+    parser.add_argument("--mep_dir", required=True, help="Directory containing MEP JSON files")
     parser.add_argument(
         "--metrics_file",
         default=None,
         help="Optional metrics.jsonl for feedback scores",
     )
-    parser.add_argument(
-        "--project", default="chartqapro-eval", help="Langfuse project name (metadata)"
-    )
+    parser.add_argument("--project", default="chartqapro-eval", help="Langfuse project name (metadata)")
     args = parser.parse_args()
 
     ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project)
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
index ec81d02..35aece4 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/prompts.py
@@ -64,9 +64,7 @@ def push_prompts(
 
 def main() -> None:
     """Parse CLI arguments and push prompt files to Langfuse Prompt Management."""
-    parser = argparse.ArgumentParser(
-        description="Push prompt files to Langfuse Prompt Management"
-    )
+    parser = argparse.ArgumentParser(description="Push prompt files to Langfuse Prompt Management")
     parser.add_argument("--planner", default=None, help="Path to planner.txt")
     parser.add_argument("--vision", default=None, help="Path to vision.txt")
     args = parser.parse_args()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
index 7a8a84c..546c99c 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
@@ -8,6 +8,8 @@
 from contextlib import contextmanager
 from typing import Optional
 
+from langfuse import propagate_attributes
+
 
 def _normalize_usage(usage: dict) -> dict:
     """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys."""
@@ -65,32 +67,58 @@ def sample_trace(
     run_id: str,
     project_name: str = "chartqapro-eval",
 ):  # type: ignore[return]
-    """Open a Langfuse trace for one sample; yield a _TraceHandle (or None)."""
+    """
+    Context manager to create a Langfuse trace for a single sample.
+
+    Parameters
+    ----------
+    client : object
+        The Langfuse client. If None, the context manager yields None.
+    sample_id : str
+        Unique identifier for the sample.
+    question : str
+        The input prompt text.
+    expected_output : str
+        The ground truth answer.
+    question_type : str
+        The category of the question.
+    config_name : str
+        The evaluation configuration used.
+    run_id : str
+        The unique ID of the pipeline run.
+    project_name : str, default 'chartqapro-eval'
+        Langfuse project identifier (ignored in v4; use SDK config).
+
+    Yields
+    ------
+    trace_handle : _TraceHandle or None
+        Handle holding the span and its trace id; None when ``client`` is None.
+    """
     del project_name  # kept for API compatibility; Langfuse v4 uses project from SDK config
     if client is None:
         yield None
         return
 
-    from langfuse import propagate_attributes
-
-    with client.start_as_current_observation(  # type: ignore[union-attr]
-        name=f"chartqapro/{sample_id}",
-        as_type="span",
-        input={"question": question, "expected_output": expected_output},
-        metadata={
-            "run_id": run_id,
-            "config": config_name,
-            "question_type": question_type,
-        },
-    ) as span:
-        with propagate_attributes(session_id=run_id):
-            trace_id = client.get_current_trace_id()  # type: ignore[union-attr]
-            handle = _TraceHandle(span=span, trace_id=trace_id)
-            try:
-                yield handle
-            finally:
-                with contextlib.suppress(Exception):
-                    client.flush()  # type: ignore[union-attr]
+    with (
+        client.start_as_current_observation(  # type: ignore[union-attr]
+            name=f"chartqapro/{sample_id}",
+            as_type="span",
+            input={"question": question, "expected_output": expected_output},
+            metadata={
+                "run_id": run_id,
+                "config": config_name,
+                "question_type": question_type,
+            },
+        ) as span,
+        propagate_attributes(session_id=run_id),
+    ):
+        trace_id = client.get_current_trace_id()  # type: ignore[union-attr]
+        handle = _TraceHandle(span=span, trace_id=trace_id)
+        try:
+            yield handle
+        finally:
+            with contextlib.suppress(Exception):
+                client.flush()  # type: ignore[union-attr]
 
 
 def open_llm_span(
@@ -101,10 +129,31 @@ def open_llm_span(
     metadata: Optional[dict] = None,
     parent_span_id: Optional[str] = None,
 ) -> object:
-    """Create a Langfuse generation on the trace span (or return None).
+    """
+    Begin a Langfuse generation observation on the given trace span.
 
     ``parent_span_id`` is accepted for API compatibility but is unused in v4 —
     nesting is handled by calling ``start_observation`` on the parent span.
+
+    Parameters
+    ----------
+    trace : object
+        The parent trace or span.
+    name : str
+        Logical name for the operation.
+    input_data : dict
+        Model inputs.
+    model : str
+        Model identifier.
+    metadata : dict, optional
+        Additional context keys.
+    parent_span_id : str, optional
+        Explicit parent linkage (ignored in v4; nesting is contextual).
+
+    Returns
+    -------
+    object or None
+        The started generation, or None when ``trace`` is None.
     """
     del parent_span_id  # kept for API compatibility; v4 uses contextual nesting
     if trace is None:
@@ -129,7 +178,23 @@ def close_span(
     usage: Optional[dict] = None,
     error: Optional[str] = None,
 ) -> None:
-    """End a Langfuse generation (no-op if span is None)."""
+    """Log results and terminate an active span.
+
+    Parameters
+    ----------
+    span : object
+        The span to close.
+    output : dict, optional
+        The model output to log.
+    usage : dict, optional
+        The provider usage dict (e.g. OpenAI or Gemini keys).
+    error : str, optional
+        An error message to log (if any).
+
+    Returns
+    -------
+    None
+    """
     if span is None:
         return
     with contextlib.suppress(Exception):
@@ -147,7 +212,20 @@ def close_span(
 
 
 def log_trace_scores(trace: object, scores: dict) -> None:
-    """Log a dict of {metric_name: float} as scores on the trace."""
+    """
+    Attach quantitative feedback scores to a trace.
+
+    Parameters
+    ----------
+    trace : object
+        The trace to update.
+    scores : dict
+        Mapping of metric names to numeric values.
+
+    Returns
+    -------
+    None
+    """
     if trace is None:
         return
     for name, value in scores.items():
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py
deleted file mode 100644
index 9bf02f2..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-"""Opik observability integration — tracing, prompt versioning, dataset registration."""
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py
deleted file mode 100644
index 2eebaf7..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/client.py
+++ /dev/null
@@ -1,74 +0,0 @@
-"""Opik client singleton with graceful degradation.
-
-Returns None when OPIK_URL_OVERRIDE / OPIK_API_KEY is not set or opik is not
-installed, so every caller can guard with ``if client:``.
-"""
-
-import os
-from contextlib import suppress
-
-import opik
-from dotenv import load_dotenv
-
-
-_client = None
-_initialised = False
-
-
-def get_client():
-    """
-    Initialize and return a globally cached Opik client.
-
-    Retrieves configuration from environment variables and configures
-    the SDK for local or cloud usage.
-
-    Returns
-    -------
-    Opik or None
-        An active client, or None if configuration is missing or invalid.
-    """
-    global _client, _initialised  # noqa: PLW0603
-    if _initialised:
-        return _client
-
-    _initialised = True
-
-    # Load environment variables from .env file
-    with suppress(Exception):
-        load_dotenv()
-
-    url = os.environ.get("OPIK_URL_OVERRIDE", "")
-    api_key = os.environ.get("OPIK_API_KEY", "")
-
-    if not url and not api_key:
-        return None
-
-    try:
-        if url:
-            # Opik SDK expects the base URL without /api suffix
-            base_url = url.rstrip("/")
-            if base_url.endswith("/api"):
-                base_url = base_url[:-4]
-            opik.configure(url=base_url, use_local=True, force=True, automatic_approvals=True)
-        else:
-            opik.configure(api_key=api_key, force=True, automatic_approvals=True)
-
-        _client = opik.Opik()
-    except Exception as exc:
-        print(f"[opik] client init failed: {exc}")
-        _client = None
-
-    return _client
-
-
-def reset_client() -> None:
-    """
-    Clear the cached Opik client and force re-initialization.
-
-    Returns
-    -------
-    None
-    """
-    global _client, _initialised  # noqa: PLW0603
-    _client = None
-    _initialised = False
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py
deleted file mode 100644
index d4ac507..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/dataset.py
+++ /dev/null
@@ -1,85 +0,0 @@
-"""Register ChartQAPro samples as an Opik Dataset.
-
-Usage:
-    uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.dataset \
-        --split test --n 25
-"""
-
-import argparse
-from typing import Optional
-
-from ..datasets.chartqapro_loader import load_chartqapro
-from .client import get_client
-
-
-def register_dataset(
-    samples,
-    dataset_name: str = "ChartQAPro",
-    split: str = "test",
-) -> Optional[object]:
-    """
-    Upload a collection of samples as an Opik Dataset.
-
-    Allows for versioned dataset management and evaluation in the Opik UI.
-
-    Parameters
-    ----------
-    samples : list of PerceivedSample
-        The data samples to register.
-    dataset_name : str, default 'ChartQAPro'
-        The base name for the dataset.
-    split : str, default 'test'
-        The split identifier (e.g., 'train', 'val').
-
-    Returns
-    -------
-    Dataset or None
-        The Opik Dataset object if successful.
-    """
-    client = get_client()
-    if client is None:
-        return None
-
-    name = f"{dataset_name}_{split}"
-    try:
-        dataset = client.get_or_create_dataset(name=name)
-        items = [
-            {
-                "source_id": s.sample_id,  # stored as data field; Opik auto-generates UUID v7 id
-                "question": s.question,
-                "expected_output": s.expected_output,
-                "question_type": s.question_type.value,
-                "image_path": s.image_path or "",
-                "choices": s.choices or [],
-            }
-            for s in samples
-        ]
-        dataset.insert(items)
-        print(f"[opik] Registered {len(items)} samples → dataset '{name}'")
-        return dataset
-    except Exception as exc:
-        print(f"[opik] Dataset registration failed: {exc}")
-        return None
-
-
-def main() -> None:
-    """
-    Command-line interface for registering ChartQAPro datasets.
-
-    Returns
-    -------
-    None
-    """
-    parser = argparse.ArgumentParser(description="Register ChartQAPro samples as Opik dataset")
-    parser.add_argument("--split", default="test")
-    parser.add_argument("--n", type=int, default=25)
-    parser.add_argument("--image_dir", default="data/chartqapro_images")
-    parser.add_argument("--cache_dir", default=None)
-    args = parser.parse_args()
-
-    samples = load_chartqapro(split=args.split, n=args.n, image_dir=args.image_dir, cache_dir=args.cache_dir)
-    register_dataset(samples, split=args.split)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py
deleted file mode 100644
index d8baf39..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/ingest.py
+++ /dev/null
@@ -1,240 +0,0 @@
-"""Retroactive ingestion: convert existing MEP JSON files to Opik Traces.
-
-This lets you visualise runs that completed before Opik was wired in.
-
-Usage:
-    uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.ingest \
-        --mep_dir meps/openai_openai/chartqapro/test \
-        [--metrics_file metrics.jsonl]
-"""
-
-import argparse
-import contextlib
-import json
-from datetime import datetime, timedelta
-from pathlib import Path
-from typing import Optional
-
-from .client import get_client
-
-
-def _parse_ts(iso: Optional[str]) -> Optional[datetime]:
-    if not iso:
-        return None
-    try:
-        return datetime.fromisoformat(iso)
-    except ValueError:
-        return None
-
-
-def ingest_mep(
-    mep: dict,
-    client,
-    metrics: Optional[dict] = None,
-    project_name: str = "chartqapro-eval",
-) -> None:
-    """
-    Convert a single MEP JSON record into a retroactive Opik trace.
-
-    Parameters
-    ----------
-    mep : dict
-        The raw MEP record.
-    client : object
-        The Opik client.
-    metrics : dict, optional
-        Pre-computed metrics for the sample.
-    project_name : str, default 'chartqapro-eval'
-        Target project.
-
-    Returns
-    -------
-    None
-    """
-    sample = mep.get("sample", {})
-    plan = mep.get("plan", {})
-    vision = mep.get("vision", {})
-    timestamps = mep.get("timestamps", {})
-    config = mep.get("config", {})
-
-    sample_id = sample.get("sample_id", "unknown")
-    config_name = config.get("config_name", "unknown")
-    question_type = sample.get("question_type", "standard")
-    question = sample.get("question", "")
-    expected = sample.get("expected_output", "")
-    vision_parsed = vision.get("parsed", {})
-
-    start_time = _parse_ts(timestamps.get("start"))
-    end_time = _parse_ts(timestamps.get("end"))
-    planner_ms = timestamps.get("planner_ms") or 0
-
-    trace = client.trace(
-        name=f"chartqapro/{sample_id}",
-        start_time=start_time,
-        end_time=end_time,
-        input={"question": question, "expected_output": expected},
-        output=vision_parsed if vision_parsed else None,
-        tags=[config_name, question_type, "chartqapro", "retroactive"],
-        metadata={
-            "run_id": mep.get("run_id", ""),
-            "config": config_name,
-            "question_type": question_type,
-            "schema_version": mep.get("schema_version", ""),
-            "has_errors": bool(mep.get("errors")),
-        },
-        project_name=project_name,
-    )
-
-    # Planner span — estimate its time window from the start
-    if plan.get("prompt"):
-        p_start = start_time
-        p_end = None
-        if start_time and planner_ms:
-            p_end = start_time + timedelta(milliseconds=planner_ms)
-        planner_span = trace.span(
-            name="planner",
-            type="llm",
-            start_time=p_start,
-            end_time=p_end,
-            input={"prompt": plan.get("prompt", "")},
-            output={
-                "plan": plan.get("parsed", {}),
-                "parse_error": plan.get("parse_error", False),
-            },
-            model=config.get("planner_model", ""),
-            metadata={"backend": config.get("planner_backend", "")},
-        )
-        planner_span.end()
-
-    # Vision tool spans — one per ToolTrace entry
-    for tt in vision.get("tool_trace", []):
-        ts_start = _parse_ts(tt.get("start_ts"))
-        ts_end = _parse_ts(tt.get("end_ts"))
-        usage = tt.get("provider_metadata", {}).get("usage", {})
-        tool_span = trace.span(
-            name="vision_qa_tool",
-            type="llm",
-            start_time=ts_start,
-            end_time=ts_end,
-            input={
-                "question": question,
-                "plan_steps": plan.get("parsed", {}).get("steps", []),
-            },
-            output=vision_parsed if vision_parsed else None,
-            model=tt.get("model", config.get("vision_model", "")),
-            usage=usage if usage else None,
-            metadata={
-                "backend": tt.get("backend", config.get("vision_backend", "")),
-                "elapsed_ms": tt.get("elapsed_ms"),
-            },
-        )
-        tool_span.end()
-
-    trace.end()
-
-    # Log feedback scores from the matching metrics row
-    if metrics:
-        scores_to_log = {}
-        for key in [
-            "answer_accuracy",
-            "judge_explanation_quality",
-            "judge_hallucination_rate",
-            "judge_plan_coverage",
-            "judge_plan_adherence",
-            "judge_faithfulness_alignment",
-        ]:
-            if key in metrics and isinstance(metrics[key], (int, float)):
-                scores_to_log[key] = float(metrics[key])
-        for name, value in scores_to_log.items():
-            with contextlib.suppress(Exception):
-                trace.log_feedback_score(name=name, value=value)
-
-
-def ingest_dir(
-    mep_dir: str,
-    metrics_file: Optional[str] = None,
-    project_name: str = "chartqapro-eval",
-) -> int:
-    """
-    Bulk ingest all MEP files from a local directory into Opik.
-
-    Parameters
-    ----------
-    mep_dir : str
-        Path to the folder containing JSON results.
-    metrics_file : str, optional
-        Path to a .jsonl file with metrics data.
-    project_name : str, default 'chartqapro-eval'
-        Opik project identifier.
-
-    Returns
-    -------
-    int
-        The total number of successfully ingested records.
-    """
-    client = get_client()
-    if client is None:
-        print("[opik] No client — set OPIK_URL_OVERRIDE or OPIK_API_KEY")
-        return 0
-
-    # Build sample_id → metrics lookup if provided
-    metrics_by_id: dict = {}
-    if metrics_file and Path(metrics_file).exists():
-        with open(metrics_file) as f:
-            for raw_line in f:
-                line = raw_line.strip()
-                if line:
-                    row = json.loads(line)
-                    metrics_by_id[row.get("sample_id", "")] = row
-
-    mep_path = Path(mep_dir)
-    mep_files = list(mep_path.glob("*.json"))
-    if not mep_files:
-        print(f"[opik] No MEP JSON files found in {mep_dir}")
-        return 0
-
-    count = 0
-    for fpath in sorted(mep_files):
-        try:
-            mep = json.loads(fpath.read_text())
-            sample_id = mep.get("sample", {}).get("sample_id", "")
-            ingest_mep(
-                mep,
-                client,
-                metrics=metrics_by_id.get(sample_id),
-                project_name=project_name,
-            )
-            count += 1
-            print(f"  ingested {sample_id}")
-        except Exception as exc:
-            print(f"  ERROR {fpath.name}: {exc}")
-
-    print(f"[opik] Ingested {count}/{len(mep_files)} MEPs from {mep_dir}")
-    with contextlib.suppress(Exception):
-        client.flush()
-    return count
-
-
-def main() -> None:
-    """
-    Command-line interface for retroactive ingestion into Opik.
-
-    Returns
-    -------
-    None
-    """
-    parser = argparse.ArgumentParser(description="Ingest existing MEPs into Opik")
-    parser.add_argument("--mep_dir", required=True, help="Directory containing MEP JSON files")
-    parser.add_argument(
-        "--metrics_file",
-        default=None,
-        help="Optional metrics.jsonl for feedback scores",
-    )
-    parser.add_argument("--project", default="chartqapro-eval", help="Opik project name")
-    args = parser.parse_args()
-
-    ingest_dir(args.mep_dir, args.metrics_file, project_name=args.project)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py
deleted file mode 100644
index 44944c8..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/prompts.py
+++ /dev/null
@@ -1,80 +0,0 @@
-"""Versioned prompt loading via Opik Prompt Library.
-
-Usage:
-    # Load prompt (falls back to file if Opik unavailable)
-    text = get_prompt("planner_prompt", PLANNER_PROMPT_PATH)
-
-    # Push current prompt files to Opik (run once before a new experiment)
-    uv run --env-file .env -m agentic_chartqapro_eval.opik_integration.prompts
-"""
-
-import argparse
-from pathlib import Path
-from typing import Optional
-
-from .client import get_client
-
-
-# Prompt names as stored in the Opik Prompt Library
-PLANNER_PROMPT_NAME = "chartqapro_planner"
-VISION_PROMPT_NAME = "chartqapro_vision"
-
-
-def get_prompt(name: str, fallback_path: Path) -> str:
-    """Return the latest versioned prompt from Opik, or read from file."""
-    client = get_client()
-    if client:
-        try:
-            prompt = client.get_prompt(name=name)
-            if prompt:
-                return prompt.format()
-        except Exception:
-            pass
-    return fallback_path.read_text()
-
-
-def push_prompts(
-    planner_path: Optional[Path] = None,
-    vision_path: Optional[Path] = None,
-) -> None:
-    """Upload current planner.txt and vision.txt to Opik Prompt Library."""
-    client = get_client()
-    if client is None:
-        print("[opik] No client — skipping prompt push")
-        return
-
-    # Resolve default paths relative to the agents/prompts directory
-    agents_dir = Path(__file__).parents[1] / "agents" / "prompts"
-    planner_path = planner_path or (agents_dir / "planner.txt")
-    vision_path = vision_path or (agents_dir / "vision.txt")
-
-    for name, path in [
-        (PLANNER_PROMPT_NAME, planner_path),
-        (VISION_PROMPT_NAME, vision_path),
-    ]:
-        if not path.exists():
-            print(f"[opik] Prompt file not found: {path}")
-            continue
-        text = path.read_text()
-        try:
-            prompt = client.create_prompt(name=name, prompt=text)
-            print(f"[opik] Pushed prompt '{name}' (commit={prompt.commit})")
-        except Exception as exc:
-            print(f"[opik] Failed to push prompt '{name}': {exc}")
-
-
-def main() -> None:
-    """Parse CLI arguments and push prompt files to the Opik Prompt Library."""
-    parser = argparse.ArgumentParser(description="Push prompt files to Opik Prompt Library")
-    parser.add_argument("--planner", default=None, help="Path to planner.txt")
-    parser.add_argument("--vision", default=None, help="Path to vision.txt")
-    args = parser.parse_args()
-
-    push_prompts(
-        planner_path=Path(args.planner) if args.planner else None,
-        vision_path=Path(args.vision) if args.vision else None,
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py
deleted file mode 100644
index 92d84b4..0000000
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/opik_integration/tracing.py
+++ /dev/null
@@ -1,177 +0,0 @@
-"""Lightweight wrappers around opik Trace/Span for the MEP pipeline.
-
-All helpers accept ``None`` as the client/trace and become no-ops, so the
-rest of the codebase can call them unconditionally.
-"""
-
-import contextlib
-from contextlib import contextmanager
-from datetime import datetime, timezone
-from typing import Optional
-
-from opik.types import ErrorInfoDict
-
-
-def _now() -> datetime:
-    return datetime.now(timezone.utc)
-
-
-@contextmanager
-def sample_trace(
-    client,
-    sample_id: str,
-    question: str,
-    expected_output: str,
-    question_type: str,
-    config_name: str,
-    run_id: str,
-    project_name: str = "chartqapro-eval",
-):
-    """
-    Context manager to open and automatically close an Opik trace.
-
-    Parameters
-    ----------
-    client : object
-        The Opik client. If None, the context manager yields None.
-    sample_id : str
-        Unique identifier for the sample.
-    question : str
-        The input prompt text.
-    expected_output : str
-        The ground truth answer.
-    question_type : str
-        The category of the question.
-    config_name : str
-        The evaluation configuration used.
-    run_id : str
-        The unique ID of the pipeline run.
-    project_name : str, default 'chartqapro-eval'
-        Opik project identifier.
-
-    Yields
-    ------
-    trace : object or None
-        The initialized trace object.
-    """
-    if client is None:
-        yield None
-        return
-
-    trace = client.trace(
-        name=f"chartqapro/{sample_id}",
-        input={"question": question, "expected_output": expected_output},
-        tags=[config_name, question_type, "chartqapro"],
-        metadata={
-            "run_id": run_id,
-            "config": config_name,
-            "question_type": question_type,
-        },
-        project_name=project_name,
-    )
-    try:
-        yield trace
-    finally:
-        trace.end()
-
-
-def open_llm_span(
-    trace,
-    name: str,
-    input_data: dict,
-    model: str,
-    metadata: Optional[dict] = None,
-    parent_span_id: Optional[str] = None,
-):
-    """
-    Begin a new LLM-type span within an active trace.
-
-    Parameters
-    ----------
-    trace : object
-        The parent trace or span.
-    name : str
-        Logical name for the operation.
-    input_data : dict
-        Model inputs.
-    model : str
-        Model identifier.
-    metadata : dict, optional
-        Additional context keys.
-    parent_span_id : str, optional
-        Explicit parent linkage.
-
-    Returns
-    -------
-    object or None
-        The active span object.
-    """
-    if trace is None:
-        return None
-    return trace.span(
-        name=name,
-        type="llm",
-        input=input_data,
-        model=model,
-        metadata=metadata or {},
-        parent_span_id=parent_span_id,
-    )
-
-
-def close_span(
-    span,
-    output: Optional[dict] = None,
-    usage: Optional[dict] = None,
-    error: Optional[str] = None,
-) -> None:
-    """
-    Log results and terminate an active span.
-
-    Parameters
-    ----------
-    span : object
-        The span to close.
-    output : dict, optional
-        The result of the operation.
-    usage : dict, optional
-        Token usage statistics.
-    error : str, optional
-        Error message if the span failed.
-
-    Returns
-    -------
-    None
-    """
-    if span is None:
-        return
-    kwargs: dict = {}
-    if output is not None:
-        kwargs["output"] = output
-    if usage:
-        kwargs["usage"] = usage
-    if error:
-        kwargs["error_info"] = ErrorInfoDict(message=error)
-    span.end(**kwargs)
-
-
-def log_trace_scores(trace, scores: dict) -> None:
-    """
-    Attach quantitative feedback scores to a trace.
-
-    Parameters
-    ----------
-    trace : object
-        The trace to update.
-    scores : dict
-        Mapping of metric names to numeric values.
-
-    Returns
-    -------
-    None
-    """
-    if trace is None:
-        return
-    for name, value in scores.items():
-        if isinstance(value, (int, float)):
-            with contextlib.suppress(Exception):
-                trace.log_feedback_score(name=name, value=float(value))
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
index e3eca3b..bad9e28 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
@@ -25,6 +25,13 @@
 from ..agents.vision_agent import VisionAgent
 from ..datasets.chartqapro_loader import load_chartqapro
 from ..datasets.perceived_sample import PerceivedSample
+from ..langfuse_integration.client import get_client
+from ..langfuse_integration.dataset import register_dataset
+from ..langfuse_integration.prompts import push_prompts
+from ..langfuse_integration.tracing import (
+    log_trace_scores,
+    sample_trace,
+)
 from ..mep.schema import (
     MEP,
     ImageRef,
@@ -37,13 +44,6 @@
     MEPVision,
 )
 from ..mep.writer import write_mep
-from ..langfuse_integration.client import get_client
-from ..langfuse_integration.dataset import register_dataset
-from ..langfuse_integration.prompts import push_prompts
-from ..langfuse_integration.tracing import (
-    log_trace_scores,
-    sample_trace,
-)
 from ..tools.ocr_reader_tool import OcrReaderTool
 from ..utils.hashing import sha256_file
 from ..utils.json_strict import parse_strict
diff --git a/implementations/mechanistic_interpretability/README.md b/implementations/mechanistic_interpretability/README.md
index 4a5b18d..9b04b2b 100644
--- a/implementations/mechanistic_interpretability/README.md
+++ b/implementations/mechanistic_interpretability/README.md
@@ -113,4 +113,3 @@ Pointers for the main tools and ideas used here:
 - **Activation patching in VLMs**: Neo et al., 2024 — `https://arxiv.org/abs/2401.15947`
 - **Logit lens for VLMs (MMNeuron)**: `https://arxiv.org/abs/2406.11193`
 - **VLM interpretability survey (ICLR blog, 2025)**: `https://d2jud02ci9yv69.cloudfront.net/2025-04-28-vlm-understanding-29/blog/vlm-understanding/`
-
diff --git a/pyproject.toml b/pyproject.toml
index 6e9500a..265a883 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -181,4 +181,4 @@ requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [tool.setuptools.packages.find]
-where = ["implementations/agentic_vqa_eval/src"]
\ No newline at end of file
+where = ["implementations/agentic_vqa_eval/src"]

From 1df04a804532acaf838a6ce425fe1f0cfc613c12 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Wed, 18 Mar 2026 19:56:41 -0400
Subject: [PATCH 3/9] Update README and notebooks from opik to langfuse.

---
 implementations/agentic_vqa_eval/README.md    |  34 ++--
 .../agentic_vqa_eval/analysis.ipynb           |   8 +-
 .../agentic_vqa_eval/run_pipeline.ipynb       | 138 +++++---------
 uv.lock                                       | 175 ------------------
 4 files changed, 69 insertions(+), 286 deletions(-)

diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md
index d28a821..353fe8f 100644
--- a/implementations/agentic_vqa_eval/README.md
+++ b/implementations/agentic_vqa_eval/README.md
@@ -242,10 +242,12 @@ The `.env` file lives at the **repo root**. `load_dotenv()` searches upward from
 
 ### 3. Generate MEPs (run the agentic pipeline)
 
+> **Note:** All `uv run` commands below use `$(git rev-parse --show-toplevel)` so they work from any directory in the repo — it resolves the repo root for `--env-file`, while `--directory` ensures outputs (`meps/`, `output/`) are written inside `implementations/agentic_vqa_eval/`.
+
 Run on 25 test samples using GPT-4o for planner, vision, and verifier:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test \
     --n 25 \
     --config gemini_gemini \
@@ -257,7 +259,7 @@ MEPs are written to `meps/gemini_gemini/chartqapro/test/<sample_id>.json`.
 
 The **VerifierAgent (Pass 2.5)** runs automatically after the VisionAgent on every sample. To skip it (faster, lower cost):
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini --no_verifier
 ```
 
@@ -265,7 +267,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
 
 **Model overrides** (e.g. to test different models without changing config):
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini \
     --planner_model gemini-2.5-flash-lite \
     --vision_model gemini-2.5-flash-lite \
@@ -280,14 +282,14 @@ OCR is **enabled by default** and uses the same vision backend and model as the
 
 To run with OCR using a cheaper model (recommended — OCR is simpler than full VQA):
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini \
     --ocr_model gemini-2.5-flash-lite
 ```
 
 To disable OCR entirely (matches the original pipeline behaviour, faster and lower cost):
 ```bash
-uv run --env-file .env -magentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini --no_ocr
 ```
 
@@ -298,7 +300,7 @@ When OCR is skipped, `"ocr": null` appears in the MEP and `"ocr_ms": 0.0` in tim
 ### 4. Evaluate outputs (Pass 1 — accuracy + judge)
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --out output/metrics.jsonl \
     --no_judge          # omit this flag to enable LLM judge (costs API calls)
@@ -315,7 +317,7 @@ The `predicted` column always reflects the **final answer** — the verifier's o
 ### 5. Evaluate traces (Pass 2 — latency and replayability)
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_traces \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --out output/trace_metrics.jsonl
 ```
@@ -325,7 +327,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_traces \
 Re-queries the VLM for each MEP asking for the 3 most likely candidate answers:
 
 ```bash
-uv run --env-file .env -magentic_chartqapro_eval.eval.eval_topk \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_topk \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --out output/topk_metrics.jsonl \
     --backend gemini \
@@ -338,7 +340,7 @@ This pass does **not** modify existing MEPs or `metrics.jsonl`.
 ### 7. Summarize results
 
 ```bash
-uv run --env-file .env -magentic_chartqapro_eval.eval.summarize \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.summarize \
     --metrics output/metrics.jsonl \
     --out output/summary.csv
 ```
@@ -348,7 +350,7 @@ uv run --env-file .env -magentic_chartqapro_eval.eval.summarize \
 This pass asks **why** the agent was wrong, not just **that** it was wrong. A VLM is given the original chart image alongside the wrong answer, the correct answer, the agent's explanation, and the inspection plan — so it can make a *visual* diagnosis of the failure mode.
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.error_taxonomy \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --metrics_file output/metrics.jsonl \
     --out output/taxonomy.jsonl
@@ -408,7 +410,7 @@ for sid in revised:
 Generates a single portable HTML file with summary cards, accuracy tables, verifier stats, failure taxonomy breakdown, and a per-sample results table:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.report \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.report \
     --metrics output/metrics.jsonl \
     --taxonomy output/taxonomy.jsonl \
     --out output/report.html
@@ -515,13 +517,13 @@ The framework auto-detects these variables. If they are absent, all Langfuse cal
 Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used.
 
 ```bash
-uv run --env-file .env -m -m agentic_chartqapro_eval.langfuse_integration.prompts
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.prompts
 ```
 
 ### 4. Register the dataset
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.dataset \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.dataset \
     --split test --n 25
 ```
 
@@ -536,7 +538,7 @@ No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are
 - stores the `lf_trace_id` in the MEP for later score attachment
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
     --split test --n 25 --config gemini_gemini --workers 4 --out meps/
 ```
 
@@ -545,7 +547,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps \
 After running `eval_outputs.py`, accuracy and judge scores are automatically written back to the Langfuse traces:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --out metrics.jsonl
 ```
@@ -555,7 +557,7 @@ uv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs \
 If you have MEPs from runs before Langfuse was configured, import them without re-running the pipeline:
 
 ```bash
-uv run --env-file .env -m agentic_chartqapro_eval.langfuse_integration.ingest \
+uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.langfuse_integration.ingest \
     --mep_dir meps/gemini_gemini/chartqapro/test \
     --metrics_file metrics.jsonl   # optional: attaches scores if available
 ```
diff --git a/implementations/agentic_vqa_eval/analysis.ipynb b/implementations/agentic_vqa_eval/analysis.ipynb
index d9537aa..4e47127 100644
--- a/implementations/agentic_vqa_eval/analysis.ipynb
+++ b/implementations/agentic_vqa_eval/analysis.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "id": "7fb27b941602401d91542211134fc71a",
    "metadata": {},
-   "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first:\n```bash\nuv run --env-file .env -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file .env -m agentic_chartqapro_eval.eval.eval_outputs  --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file .env -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
+   "source": "# ChartQAPro Evaluation — Analysis Walkthrough\n\nThis notebook walks through the full evaluation artifact stack produced by the\nagentic ChartQAPro framework. By the end you will be able to:\n\n- Load and inspect **MEPs** (Model Evaluation Packets) directly\n- Plot **accuracy by question type** from `metrics.jsonl`\n- Visualise the **verifier revision rate** and its effect on accuracy\n- Chart the **failure taxonomy** breakdown from `taxonomy.jsonl`\n- Browse individual samples — question, plan, vision answer, verifier verdict, chart image\n\n**Prerequisites:** Run these commands first (from any directory in the repo):\n```bash\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.runner.run_generate_meps --n 25 --config gemini_gemini\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.eval_outputs  --mep_dir meps/gemini_gemini/chartqapro/test --out output/metrics.jsonl\nuv run --env-file \"$(git rev-parse --show-toplevel)/.env\" --directory \"$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval\" -m agentic_chartqapro_eval.eval.error_taxonomy --mep_dir meps/gemini_gemini/chartqapro/test --metrics_file output/metrics.jsonl --out output/taxonomy.jsonl\n```"
   },
   {
    "cell_type": "markdown",
@@ -272,7 +272,11 @@
     "\n",
     "    wrong = tax_df[tax_df[\"failure_type\"] != \"correct\"]\n",
     "    print(f\"\\nTotal wrong: {len(wrong)} / {len(tax_df)}\")\n",
-    "    print(f\"Most common failure: {counts[counts.index != 'correct'].idxmax()}\")"
+    "    failure_counts = counts[counts.index != \"correct\"]\n",
+    "    if failure_counts.empty:\n",
+    "        print(\"Most common failure: none (all samples correct)\")\n",
+    "    else:\n",
+    "        print(f\"Most common failure: {failure_counts.idxmax()}\")"
    ]
   },
   {
diff --git a/implementations/agentic_vqa_eval/run_pipeline.ipynb b/implementations/agentic_vqa_eval/run_pipeline.ipynb
index 294edfe..d40163e 100644
--- a/implementations/agentic_vqa_eval/run_pipeline.ipynb
+++ b/implementations/agentic_vqa_eval/run_pipeline.ipynb
@@ -16,7 +16,7 @@
     "|---|---|\n",
     "| 1 — Configuration | All tunable parameters in one place |\n",
     "| 2 — Environment | Check API keys, install path, imports |\n",
-    "| 2.5 — Opik health check | Verify Opik stack is reachable and API-responsive before running |\n",
+    "| 2.5 — Langfuse health check | Verify Langfuse credentials are configured before running |\n",
     "| 3 — Load dataset | Pull samples from HuggingFace |\n",
     "| 4 — Instantiate agents | Build Planner, OCR, Vision, Verifier |\n",
     "| 5 — Run pipeline | Generate MEPs (Plan → OCR → Vision → Verify) |\n",
@@ -119,7 +119,7 @@
     "    val = os.environ.get(var, \"\")\n",
     "    needed = needed_for in CONFIG\n",
     "    if val and not val.startswith(\"your_\"):\n",
-    "        print(f\"  ok  {var}  ({val[:12]}...)\")\n",
+    "        print(f\"  ok  {var}  ({val[:3]}...)\")\n",
     "    elif needed:\n",
     "        print(f\"  MISSING  {var}  <- required for {CONFIG}\")\n",
     "        missing.append(var)\n",
@@ -141,8 +141,8 @@
     "from agentic_chartqapro_eval.eval.eval_outputs import evaluate_mep  # noqa: E402\n",
     "from agentic_chartqapro_eval.eval.eval_traces import evaluate_trace  # noqa: E402\n",
     "from agentic_chartqapro_eval.eval.summarize import summarize, write_csv  # noqa: E402\n",
+    "from agentic_chartqapro_eval.langfuse_integration.client import get_client  # noqa: E402\n",
     "from agentic_chartqapro_eval.mep.writer import iter_meps  # noqa: E402\n",
-    "from agentic_chartqapro_eval.opik_integration.client import get_client  # noqa: E402\n",
     "from agentic_chartqapro_eval.runner.run_generate_meps import (  # noqa: E402\n",
     "    BACKEND_CONFIGS,\n",
     "    process_sample,\n",
@@ -159,19 +159,17 @@
    "id": "cell-opik-hdr",
    "metadata": {},
    "source": [
-    "## 2.5 — Opik Health Check\n",
+    "## 2.5 — Langfuse Health Check\n",
     "\n",
-    "Verifies that the self-hosted Opik stack is **fully operational** before the pipeline runs.\n",
-    "Three checks are run in sequence:\n",
+    "Verifies that Langfuse credentials are configured before the pipeline runs.\n",
     "\n",
     "| Check | What it tests |\n",
     "|---|---|\n",
-    "| HTTP reachable | TCP connection to `OPIK_URL_OVERRIDE` succeeds within 5 s |\n",
-    "| Client init | `opik.Opik()` initialises without error |\n",
-    "| API read test | A lightweight `search_traces` call returns a valid response |\n",
+    "| Env vars present | `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set in `.env` |\n",
+    "| Client init | `Langfuse()` initialises without error |\n",
     "\n",
-    "If `OPIK_URL_OVERRIDE` is not set the cell prints a skip notice and continues — Opik is optional.\n",
-    "If any check fails the pipeline still runs; only tracing is affected."
+    "If the keys are absent the cell prints a skip notice and continues — Langfuse is optional.\n",
+    "The pipeline produces identical MEPs with or without it; tracing is purely additive."
    ]
   },
   {
@@ -181,107 +179,61 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "import urllib.error\n",
-    "import urllib.request\n",
-    "\n",
-    "# Force re-initialisation so re-running this cell after starting Docker works correctly\n",
-    "from agentic_chartqapro_eval.opik_integration.client import reset_client\n",
+    "from agentic_chartqapro_eval.langfuse_integration.client import reset_client\n",
     "\n",
     "\n",
+    "# Force re-initialisation so re-running this cell picks up any .env changes\n",
     "reset_client()\n",
     "\n",
-    "OPIK_URL = os.environ.get(\"OPIK_URL_OVERRIDE\", \"\")\n",
+    "lf_public = os.environ.get(\"LANGFUSE_PUBLIC_KEY\", \"\")\n",
+    "lf_secret = os.environ.get(\"LANGFUSE_SECRET_KEY\", \"\")\n",
     "\n",
-    "if not OPIK_URL:\n",
-    "    print(\"[skip] OPIK_URL_OVERRIDE is not set.\")\n",
-    "    print(\"       Opik tracing is disabled. Pipeline will run fine without it.\")\n",
+    "if not lf_public or not lf_secret:\n",
+    "    print(\"[skip] LANGFUSE_PUBLIC_KEY / LANGFUSE_SECRET_KEY are not set.\")\n",
+    "    print(\"       Langfuse tracing is disabled. Pipeline will run fine without it.\")\n",
     "    print()\n",
-    "    print(\"To enable Opik tracing:\")\n",
-    "    print(\"  1. Start the Docker stack:\")\n",
-    "    print(\"       cd /path/to/opik/deployment/docker-compose\")\n",
-    "    print(\"       docker compose --profile opik up -d\")\n",
-    "    print(\"  2. Add to .env:  OPIK_URL_OVERRIDE=http://localhost:5173/api\")\n",
-    "    print(\"  3. Re-run this cell.\")\n",
+    "    print(\"To enable Langfuse tracing, add to .env:\")\n",
+    "    print(\"  LANGFUSE_PUBLIC_KEY=pk-lf-...\")\n",
+    "    print(\"  LANGFUSE_SECRET_KEY=sk-lf-...\")\n",
+    "    print(\"  # LANGFUSE_HOST=https://cloud.langfuse.com  (default; change for self-hosted)\")\n",
     "else:\n",
     "    results = {}\n",
     "\n",
-    "    # -- Check 1: HTTP reachability (any response = server is up) --\n",
-    "    try:\n",
-    "        with urllib.request.urlopen(OPIK_URL, timeout=5) as r:\n",
-    "            results[\"http\"] = (\"ok\", f\"HTTP {r.status}\")\n",
-    "    except urllib.error.HTTPError as e:\n",
-    "        # HTTPError means server responded -- it is up, just returned a non-200\n",
-    "        results[\"http\"] = (\"ok\", f\"HTTP {e.code} (server responded)\")\n",
-    "    except Exception as e:\n",
-    "        results[\"http\"] = (\"fail\", str(e))\n",
+    "    # -- Check 1: Env vars present --\n",
+    "    results[\"env\"] = (\"ok\", f\"pk={lf_public[:3]}...\")\n",
     "\n",
-    "    # -- Check 2: Opik Python client initialises --\n",
-    "    _opik_hc = None\n",
+    "    # -- Check 2: Client initialises --\n",
     "    try:\n",
-    "        from agentic_chartqapro_eval.opik_integration.client import get_client\n",
-    "\n",
-    "        _opik_hc = get_client()\n",
-    "        if _opik_hc is not None:\n",
-    "            results[\"client\"] = (\"ok\", \"opik.Opik() ready\")\n",
+    "        _lf_hc = get_client()\n",
+    "        if _lf_hc is not None:\n",
+    "            results[\"client\"] = (\"ok\", \"Langfuse() ready\")\n",
     "        else:\n",
     "            results[\"client\"] = (\"fail\", \"get_client() returned None\")\n",
     "    except Exception as e:\n",
     "        results[\"client\"] = (\"fail\", str(e))\n",
     "\n",
-    "    # -- Check 3: API actually responds to a lightweight read --\n",
-    "    if results.get(\"client\", (\"\",))[0] == \"ok\" and _opik_hc is not None:\n",
-    "        try:\n",
-    "            traces = _opik_hc.search_traces(max_results=1)\n",
-    "            results[\"api\"] = (\"ok\", f\"search_traces returned {len(traces)} result(s)\")\n",
-    "        except Exception as e:\n",
-    "            err_str = str(e)\n",
-    "            hint = \"\"\n",
-    "            if \"readonly\" in err_str.lower() or \"500\" in err_str:\n",
-    "                hint = \" [ClickHouse replica may be read-only -- run SYSTEM RESTORE REPLICA]\"\n",
-    "            results[\"api\"] = (\"fail\", err_str[:120] + hint)\n",
-    "    else:\n",
-    "        results[\"api\"] = (\"skip\", \"client unavailable\")\n",
-    "\n",
     "    # -- Report --\n",
-    "    print(f\"Opik URL : {OPIK_URL}\")\n",
-    "    print()\n",
     "    labels = [\n",
-    "        (\"http\", \"HTTP reachable \"),\n",
-    "        (\"client\", \"Client init    \"),\n",
-    "        (\"api\", \"API read test  \"),\n",
+    "        (\"env\", \"Env vars present\"),\n",
+    "        (\"client\", \"Client init     \"),\n",
     "    ]\n",
     "    all_ok = True\n",
     "    for key, label in labels:\n",
     "        status, detail = results.get(key, (\"skip\", \"\"))\n",
-    "        if status == \"ok\":\n",
-    "            marker = \"✓ OK  \"\n",
-    "        elif status == \"skip\":\n",
-    "            marker = \"⊘ skip\"\n",
-    "        else:\n",
-    "            marker = \"✗ FAIL\"\n",
+    "        marker = \"✓ OK  \" if status == \"ok\" else (\"⊘ skip\" if status == \"skip\" else \"✗ FAIL\")\n",
+    "        if status not in (\"ok\", \"skip\"):\n",
     "            all_ok = False\n",
     "        print(f\"  {marker}  {label}  {detail}\")\n",
     "\n",
     "    print()\n",
     "    if all_ok:\n",
-    "        dashboard_url = OPIK_URL.rstrip(\"/\").removesuffix(\"/api\")\n",
-    "        print(\"✓ Opik is fully operational.\")\n",
-    "        print(f\"Dashboard : {dashboard_url}\")\n",
+    "        lf_host = os.environ.get(\"LANGFUSE_HOST\") or os.environ.get(\"LANGFUSE_BASE_URL\") or \"https://cloud.langfuse.com\"\n",
+    "        print(\"✓ Langfuse is configured.\")\n",
+    "        print(f\"Host      : {lf_host}\")\n",
     "        print(\"Traces and scores will be recorded automatically during the pipeline run.\")\n",
     "    else:\n",
-    "        print(\"⚠ WARNING: One or more Opik checks failed.\")\n",
-    "        print(\"The pipeline will still run; Opik tracing may not work correctly.\")\n",
-    "        if results.get(\"http\", (\"\",))[0] == \"fail\":\n",
-    "            print()\n",
-    "            print(\"  Docker stack appears to be down. To start it:\")\n",
-    "            print(\"    cd /path/to/opik/deployment/docker-compose\")\n",
-    "            print(\"    docker compose --profile opik up -d\")\n",
-    "        if results.get(\"api\", (\"\",))[0] == \"fail\":\n",
-    "            print()\n",
-    "            print(\"  API is reachable but not responding correctly.\")\n",
-    "            print(\"  Check ClickHouse replica state:\")\n",
-    "            print(\"    docker exec opik-clickhouse-1 clickhouse-client --query \\\\\")\n",
-    "            print(\"      \\\"SELECT database,table,is_readonly FROM system.replicas WHERE database='opik'\\\"\")"
+    "        print(\"⚠ WARNING: Langfuse client failed to initialise.\")\n",
+    "        print(\"The pipeline will still run; tracing will be skipped.\")"
    ]
   },
   {
@@ -376,10 +328,10 @@
     "else:\n",
     "    print(\"OcrReaderTool  : disabled (USE_OCR=False)\")\n",
     "\n",
-    "# Opik observability (no-op if OPIK_URL_OVERRIDE not set)\n",
-    "opik_client = get_client()\n",
-    "opik_status = \"enabled\" if opik_client else \"not configured\"\n",
-    "print(f\"Opik           : {opik_status}\")"
+    "# Langfuse observability (no-op if keys not set)\n",
+    "lf_client = get_client()\n",
+    "lf_status = \"enabled\" if lf_client else \"not configured\"\n",
+    "print(f\"Langfuse       : {lf_status}\")"
    ]
   },
   {
@@ -421,7 +373,7 @@
     "        config,\n",
     "        RUN_ID,\n",
     "        OUT_DIR,\n",
-    "        opik_client=opik_client,\n",
+    "        lf_client=lf_client,\n",
     "        verifier_agent=verifier,\n",
     "        ocr_tool=ocr,\n",
     "    )\n",
@@ -459,8 +411,8 @@
     "## 6 — Inspect First MEP\n",
     "\n",
     "MEPs are self-contained JSON files. Every field you see here is what the agent actually\n",
-    "produced — no post-processing. The `opik_trace_id` links this MEP back to the live trace\n",
-    "in the Opik dashboard if Opik is configured."
+    "produced — no post-processing. The `lf_trace_id` links this MEP back to the live trace\n",
+    "in the Langfuse dashboard if Langfuse is configured."
    ]
   },
   {
@@ -501,8 +453,8 @@
     "    print(\"Timestamps (ms):\")\n",
     "    for k in [\"planner_ms\", \"ocr_ms\", \"vision_ms\", \"verifier_ms\"]:\n",
     "        print(f\"  {k:<16} {ts.get(k, 0):.0f}\")\n",
-    "    if mep.get(\"opik_trace_id\"):\n",
-    "        print(f\"Opik trace ID: {mep['opik_trace_id']}\")\n",
+    "    if mep.get(\"lf_trace_id\"):\n",
+    "        print(f\"Langfuse trace ID: {mep['lf_trace_id']}\")\n",
     "    print(\"=\" * 64)\n",
     "\n",
     "    img_path = s.get(\"image_ref\", {}).get(\"path\", \"\")\n",
@@ -609,7 +561,7 @@
     "                config,\n",
     "                RUN_ID_NO_OCR,\n",
     "                OUT_DIR_NO_OCR,\n",
-    "                opik_client=opik_client,\n",
+    "                lf_client=lf_client,\n",
     "                verifier_agent=verifier,\n",
     "                ocr_tool=None,  # <-- OCR disabled\n",
     "            )\n",
diff --git a/uv.lock b/uv.lock
index 9bcb45a..db5369c 100644
--- a/uv.lock
+++ b/uv.lock
@@ -561,24 +561,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/18/a1/128e3676fb9b4fd965a93554e5e07045975ee6bd6e9fdb536cdffa32e99e/boto3-1.42.70-py3-none-any.whl", hash = "sha256:18a108c4d5df89a200b3949de0d39c0879b100c455e3229ea38275dd392db0f4", size = 140554, upload-time = "2026-03-17T19:43:20.406Z" },
 ]
 
-[[package]]
-name = "boto3-stubs"
-version = "1.42.69"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "botocore-stubs" },
-    { name = "types-s3transfer" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/1e/ba/b282b7ab3626a25a6896c2f31adc95324b3e5f50056923d274a35c5eaf0c/boto3_stubs-1.42.69.tar.gz", hash = "sha256:52ccd645a34d2b4e97af8f44dbaffbb854a1de52610e9502c284bfb24e6d8962", size = 101397, upload-time = "2026-03-16T20:58:58.538Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/6a/78/83ef6f549d88425618ce66d4b273ea46e379aefdf0e9e49bf4f9bfa01cda/boto3_stubs-1.42.69-py3-none-any.whl", hash = "sha256:021360b519ac54822eb00f125b0c4292ad2a1869ae8e1d0c6c097db99215d41b", size = 70010, upload-time = "2026-03-16T20:58:51.184Z" },
-]
-
-[package.optional-dependencies]
-bedrock-runtime = [
-    { name = "mypy-boto3-bedrock-runtime" },
-]
-
 [[package]]
 name = "botocore"
 version = "1.42.70"
@@ -593,18 +575,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/fb/51/08f32aea872253173f513ba68122f4300966290677c8e59887b4ffd5d957/botocore-1.42.70-py3-none-any.whl", hash = "sha256:54ed9d25f05f810efd22b0dfda0bb9178df3ad8952b2e4359e05156c9321bd3c", size = 14671393, upload-time = "2026-03-17T19:43:06.777Z" },
 ]
 
-[[package]]
-name = "botocore-stubs"
-version = "1.42.41"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "types-awscrt" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/0c/a8/a26608ff39e3a5866c6c79eda10133490205cbddd45074190becece3ff2a/botocore_stubs-1.42.41.tar.gz", hash = "sha256:dbeac2f744df6b814ce83ec3f3777b299a015cbea57a2efc41c33b8c38265825", size = 42411, upload-time = "2026-02-03T20:46:14.479Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/32/76/cab7af7f16c0b09347f2ebe7ffda7101132f786acb767666dce43055faab/botocore_stubs-1.42.41-py3-none-any.whl", hash = "sha256:9423110fb0e391834bd2ed44ae5f879d8cb370a444703d966d30842ce2bcb5f0", size = 66759, upload-time = "2026-02-03T20:46:13.02Z" },
-]
-
 [[package]]
 name = "build"
 version = "1.4.0"
@@ -1294,25 +1264,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/90/2b/0817a2b257fe88725c25589d89aec060581aabf668707a8d03b2e9e0cb2a/fastjsonschema-2.21.1-py3-none-any.whl", hash = "sha256:c9e5b7e908310918cf494a434eeb31384dd84a98b57a30bcb1f535015b554667", size = 23924, upload-time = "2024-12-02T10:55:07.599Z" },
 ]
 
-[[package]]
-name = "fastuuid"
-version = "0.14.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/c3/7d/d9daedf0f2ebcacd20d599928f8913e9d2aea1d56d2d355a93bfa2b611d7/fastuuid-0.14.0.tar.gz", hash = "sha256:178947fc2f995b38497a74172adee64fdeb8b7ec18f2a5934d037641ba265d26", size = 18232, upload-time = "2025-10-19T22:19:22.402Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/02/a2/e78fcc5df65467f0d207661b7ef86c5b7ac62eea337c0c0fcedbeee6fb13/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:77e94728324b63660ebf8adb27055e92d2e4611645bf12ed9d88d30486471d0a", size = 510164, upload-time = "2025-10-19T22:31:45.635Z" },
-    { url = "https://files.pythonhosted.org/packages/2b/b3/c846f933f22f581f558ee63f81f29fa924acd971ce903dab1a9b6701816e/fastuuid-0.14.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:caa1f14d2102cb8d353096bc6ef6c13b2c81f347e6ab9d6fbd48b9dea41c153d", size = 261837, upload-time = "2025-10-19T22:38:38.53Z" },
-    { url = "https://files.pythonhosted.org/packages/54/ea/682551030f8c4fa9a769d9825570ad28c0c71e30cf34020b85c1f7ee7382/fastuuid-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:d23ef06f9e67163be38cece704170486715b177f6baae338110983f99a72c070", size = 251370, upload-time = "2025-10-19T22:40:26.07Z" },
-    { url = "https://files.pythonhosted.org/packages/14/dd/5927f0a523d8e6a76b70968e6004966ee7df30322f5fc9b6cdfb0276646a/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c9ec605ace243b6dbe3bd27ebdd5d33b00d8d1d3f580b39fdd15cd96fd71796", size = 277766, upload-time = "2025-10-19T22:37:23.779Z" },
-    { url = "https://files.pythonhosted.org/packages/16/6e/c0fb547eef61293153348f12e0f75a06abb322664b34a1573a7760501336/fastuuid-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:808527f2407f58a76c916d6aa15d58692a4a019fdf8d4c32ac7ff303b7d7af09", size = 278105, upload-time = "2025-10-19T22:26:56.821Z" },
-    { url = "https://files.pythonhosted.org/packages/2d/b1/b9c75e03b768f61cf2e84ee193dc18601aeaf89a4684b20f2f0e9f52b62c/fastuuid-0.14.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2fb3c0d7fef6674bbeacdd6dbd386924a7b60b26de849266d1ff6602937675c8", size = 301564, upload-time = "2025-10-19T22:30:31.604Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/fa/f7395fdac07c7a54f18f801744573707321ca0cee082e638e36452355a9d/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:ab3f5d36e4393e628a4df337c2c039069344db5f4b9d2a3c9cea48284f1dd741", size = 459659, upload-time = "2025-10-19T22:31:32.341Z" },
-    { url = "https://files.pythonhosted.org/packages/66/49/c9fd06a4a0b1f0f048aacb6599e7d96e5d6bc6fa680ed0d46bf111929d1b/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:b9a0ca4f03b7e0b01425281ffd44e99d360e15c895f1907ca105854ed85e2057", size = 478430, upload-time = "2025-10-19T22:26:22.962Z" },
-    { url = "https://files.pythonhosted.org/packages/be/9c/909e8c95b494e8e140e8be6165d5fc3f61fdc46198c1554df7b3e1764471/fastuuid-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:3acdf655684cc09e60fb7e4cf524e8f42ea760031945aa8086c7eae2eeeabeb8", size = 450894, upload-time = "2025-10-19T22:27:01.647Z" },
-    { url = "https://files.pythonhosted.org/packages/90/eb/d29d17521976e673c55ef7f210d4cdd72091a9ec6755d0fd4710d9b3c871/fastuuid-0.14.0-cp312-cp312-win32.whl", hash = "sha256:9579618be6280700ae36ac42c3efd157049fe4dd40ca49b021280481c78c3176", size = 154374, upload-time = "2025-10-19T22:29:19.879Z" },
-    { url = "https://files.pythonhosted.org/packages/cc/fc/f5c799a6ea6d877faec0472d0b27c079b47c86b1cdc577720a5386483b36/fastuuid-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:d9e4332dc4ba054434a9594cbfaf7823b57993d7d8e7267831c3e059857cf397", size = 156550, upload-time = "2025-10-19T22:27:49.658Z" },
-]
-
 [[package]]
 name = "filelock"
 version = "3.20.3"
@@ -2084,7 +2035,6 @@ dependencies = [
 agentic-xai-eval = [
     { name = "crewai" },
     { name = "google-genai" },
-    { name = "opik" },
     { name = "streamlit" },
 ]
 dev = [
@@ -2163,7 +2113,6 @@ requires-dist = [
 agentic-xai-eval = [
     { name = "crewai", specifier = ">=1.6.1" },
     { name = "google-genai", specifier = ">=1.67.0" },
-    { name = "opik", specifier = ">=1.10.40" },
     { name = "streamlit", specifier = ">=1.55.0" },
 ]
 dev = [
@@ -2857,30 +2806,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/53/84/8a89614b2e7eeeaf0a68a4046d6cfaea4544c8619ea02595ebeec9b2bae3/license_expression-30.4.1-py3-none-any.whl", hash = "sha256:679646bc3261a17690494a3e1cada446e5ee342dbd87dcfa4a0c24cc5dce13ee", size = 111457, upload-time = "2025-01-14T05:11:38.658Z" },
 ]
 
-[[package]]
-name = "litellm"
-version = "1.82.2"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "aiohttp" },
-    { name = "click" },
-    { name = "fastuuid" },
-    { name = "httpx" },
-    { name = "importlib-metadata" },
-    { name = "jinja2" },
-    { name = "jsonschema" },
-    { name = "openai" },
-    { name = "pydantic" },
-    { name = "python-dotenv" },
-    { name = "tiktoken" },
-    { name = "tokenizers", version = "0.21.4", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-28-interpretability-llms-agents-xai-refresher'" },
-    { name = "tokenizers", version = "0.22.2", source = { registry = "https://pypi.org/simple" }, marker = "extra == 'group-28-interpretability-llms-agents-mechanistic-interp' or extra == 'group-28-interpretability-llms-agents-preference-alignment' or extra != 'group-28-interpretability-llms-agents-xai-refresher'" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/60/12/010a86643f12ac0b004032d5927c260094299a84ed38b5ed20a8f8c7e3c4/litellm-1.82.2.tar.gz", hash = "sha256:f5f4c4049f344a88bf80b2e421bb927807687c99624515d7ff4152d533ec9dcb", size = 17353218, upload-time = "2026-03-13T21:24:24.5Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/96/e4/87e3ca82a8bf6e6bfffb42a539a1350dd6ced1b7169397bd439ba56fde10/litellm-1.82.2-py3-none-any.whl", hash = "sha256:641ed024774fa3d5b4dd9347f0efb1e31fa422fba2a6500aabedee085d1194cb", size = 15524224, upload-time = "2026-03-13T21:24:21.288Z" },
-]
-
 [[package]]
 name = "llvmlite"
 version = "0.46.0"
@@ -3182,15 +3107,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/6c/28/dd72947e59a6a8c856448a5e74da6201cb5502ddff644fbc790e4bd40b9a/multiprocess-0.70.18-py39-none-any.whl", hash = "sha256:e78ca805a72b1b810c690b6b4cc32579eba34f403094bbbae962b7b5bf9dfcb8", size = 133478, upload-time = "2025-04-17T03:11:26.253Z" },
 ]
 
-[[package]]
-name = "mypy-boto3-bedrock-runtime"
-version = "1.42.42"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/46/bb/65dc1b2c5796a6ab5f60bdb57343bd6c3ecb82251c580eca415c8548333e/mypy_boto3_bedrock_runtime-1.42.42.tar.gz", hash = "sha256:3a4088218478b6fbbc26055c03c95bee4fc04624a801090b3cce3037e8275c8d", size = 29840, upload-time = "2026-02-04T20:53:05.999Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/00/43/7ea062f2228f47b5779dcfa14dab48d6e29f979b35d1a5102b0ba80b9c1b/mypy_boto3_bedrock_runtime-1.42.42-py3-none-any.whl", hash = "sha256:b2d16eae22607d0685f90796b3a0afc78c0b09d45872e00eafd634a31dd9358f", size = 36077, upload-time = "2026-02-04T20:53:01.768Z" },
-]
-
 [[package]]
 name = "namex"
 version = "0.1.0"
@@ -3871,32 +3787,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b2/37/cc6a55e448deaa9b27377d087da8615a3416d8ad523d5960b78dbeadd02a/opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2", size = 231621, upload-time = "2026-03-04T14:17:19.33Z" },
 ]
 
-[[package]]
-name = "opik"
-version = "1.10.40"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "boto3-stubs", extra = ["bedrock-runtime"] },
-    { name = "click" },
-    { name = "httpx" },
-    { name = "jinja2" },
-    { name = "litellm" },
-    { name = "openai" },
-    { name = "pydantic" },
-    { name = "pydantic-settings" },
-    { name = "pytest" },
-    { name = "rapidfuzz" },
-    { name = "rich" },
-    { name = "sentry-sdk" },
-    { name = "tenacity" },
-    { name = "tqdm" },
-    { name = "uuid6" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/ae/a4/3b7c5c705b57776a3808c58a985aa7864e77c5eea7ef16780d6792fc3e1c/opik-1.10.40.tar.gz", hash = "sha256:aee1cd8ffdb2d3f7a0a15276626c4d5e7a904722fd9fad8ec5a9fa679310f7e9", size = 777468, upload-time = "2026-03-16T13:37:18.978Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/c5/64/6a9f7231115055f60052a7b8e4f3d4182dd9c097f916ac1682fe093ddff4/opik-1.10.40-py3-none-any.whl", hash = "sha256:e446551b7603ce9b34b09eb8179e74bbae005e3ce589bc8bf613f6711771b24b", size = 1315663, upload-time = "2026-03-16T13:37:17.339Z" },
-]
-
 [[package]]
 name = "opt-einsum"
 version = "3.4.0"
@@ -4835,25 +4725,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c4/43/80f67e0336cb2fc725f8e06f7fe35c1d0fe946f4d2b8b2175e797e07349e/qwen_vl_utils-0.0.14-py3-none-any.whl", hash = "sha256:5e28657bfd031e56bd447c5901b58ddfc3835285ed100f4c56580e0ade054e96", size = 8120, upload-time = "2025-09-23T09:38:56.297Z" },
 ]
 
-[[package]]
-name = "rapidfuzz"
-version = "3.14.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d3/28/9d808fe62375b9aab5ba92fa9b29371297b067c2790b2d7cda648b1e2f8d/rapidfuzz-3.14.3.tar.gz", hash = "sha256:2491937177868bc4b1e469087601d53f925e8d270ccc21e07404b4b5814b7b5f", size = 57863900, upload-time = "2025-11-01T11:54:52.321Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/fa/8e/3c215e860b458cfbedb3ed73bc72e98eb7e0ed72f6b48099604a7a3260c2/rapidfuzz-3.14.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:685c93ea961d135893b5984a5a9851637d23767feabe414ec974f43babbd8226", size = 1945306, upload-time = "2025-11-01T11:53:06.452Z" },
-    { url = "https://files.pythonhosted.org/packages/36/d9/31b33512015c899f4a6e6af64df8dfe8acddf4c8b40a4b3e0e6e1bcd00e5/rapidfuzz-3.14.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fa7c8f26f009f8c673fbfb443792f0cf8cf50c4e18121ff1e285b5e08a94fbdb", size = 1390788, upload-time = "2025-11-01T11:53:08.721Z" },
-    { url = "https://files.pythonhosted.org/packages/a9/67/2ee6f8de6e2081ccd560a571d9c9063184fe467f484a17fa90311a7f4a2e/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57f878330c8d361b2ce76cebb8e3e1dc827293b6abf404e67d53260d27b5d941", size = 1374580, upload-time = "2025-11-01T11:53:10.164Z" },
-    { url = "https://files.pythonhosted.org/packages/30/83/80d22997acd928eda7deadc19ccd15883904622396d6571e935993e0453a/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6c5f545f454871e6af05753a0172849c82feaf0f521c5ca62ba09e1b382d6382", size = 3154947, upload-time = "2025-11-01T11:53:12.093Z" },
-    { url = "https://files.pythonhosted.org/packages/5b/cf/9f49831085a16384695f9fb096b99662f589e30b89b4a589a1ebc1a19d34/rapidfuzz-3.14.3-cp312-cp312-manylinux_2_31_armv7l.whl", hash = "sha256:07aa0b5d8863e3151e05026a28e0d924accf0a7a3b605da978f0359bb804df43", size = 1223872, upload-time = "2025-11-01T11:53:13.664Z" },
-    { url = "https://files.pythonhosted.org/packages/c8/0f/41ee8034e744b871c2e071ef0d360686f5ccfe5659f4fd96c3ec406b3c8b/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:73b07566bc7e010e7b5bd490fb04bb312e820970180df6b5655e9e6224c137db", size = 2392512, upload-time = "2025-11-01T11:53:15.109Z" },
-    { url = "https://files.pythonhosted.org/packages/da/86/280038b6b0c2ccec54fb957c732ad6b41cc1fd03b288d76545b9cf98343f/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:6de00eb84c71476af7d3110cf25d8fe7c792d7f5fa86764ef0b4ca97e78ca3ed", size = 2521398, upload-time = "2025-11-01T11:53:17.146Z" },
-    { url = "https://files.pythonhosted.org/packages/fa/7b/05c26f939607dca0006505e3216248ae2de631e39ef94dd63dbbf0860021/rapidfuzz-3.14.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d7843a1abf0091773a530636fdd2a49a41bcae22f9910b86b4f903e76ddc82dc", size = 4259416, upload-time = "2025-11-01T11:53:19.34Z" },
-    { url = "https://files.pythonhosted.org/packages/40/eb/9e3af4103d91788f81111af1b54a28de347cdbed8eaa6c91d5e98a889aab/rapidfuzz-3.14.3-cp312-cp312-win32.whl", hash = "sha256:dea97ac3ca18cd3ba8f3d04b5c1fe4aa60e58e8d9b7793d3bd595fdb04128d7a", size = 1709527, upload-time = "2025-11-01T11:53:20.949Z" },
-    { url = "https://files.pythonhosted.org/packages/b8/63/d06ecce90e2cf1747e29aeab9f823d21e5877a4c51b79720b2d3be7848f8/rapidfuzz-3.14.3-cp312-cp312-win_amd64.whl", hash = "sha256:b5100fd6bcee4d27f28f4e0a1c6b5127bc8ba7c2a9959cad9eab0bf4a7ab3329", size = 1538989, upload-time = "2025-11-01T11:53:22.428Z" },
-    { url = "https://files.pythonhosted.org/packages/fc/6d/beee32dcda64af8128aab3ace2ccb33d797ed58c434c6419eea015fec779/rapidfuzz-3.14.3-cp312-cp312-win_arm64.whl", hash = "sha256:4e49c9e992bc5fc873bd0fff7ef16a4405130ec42f2ce3d2b735ba5d3d4eb70f", size = 811161, upload-time = "2025-11-01T11:53:23.811Z" },
-]
-
 [[package]]
 name = "referencing"
 version = "0.36.2"
@@ -5541,25 +5412,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1a/e4/e804505f87627cd8cdae9c010c47c4485fd8c1ce31a7dd0ab7fcc4707377/tifffile-2026.3.3-py3-none-any.whl", hash = "sha256:e8be15c94273113d31ecb7aa3a39822189dd11c4967e3cc88c178f1ad2fd1170", size = 243960, upload-time = "2026-03-03T19:14:35.808Z" },
 ]
 
-[[package]]
-name = "tiktoken"
-version = "0.12.0"
-source = { registry = "https://pypi.org/simple" }
-dependencies = [
-    { name = "regex" },
-    { name = "requests" },
-]
-sdist = { url = "https://files.pythonhosted.org/packages/7d/ab/4d017d0f76ec3171d469d80fc03dfbb4e48a4bcaddaa831b31d526f05edc/tiktoken-0.12.0.tar.gz", hash = "sha256:b18ba7ee2b093863978fcb14f74b3707cdc8d4d4d3836853ce7ec60772139931", size = 37806, upload-time = "2025-10-06T20:22:45.419Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a4/85/be65d39d6b647c79800fd9d29241d081d4eeb06271f383bb87200d74cf76/tiktoken-0.12.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b97f74aca0d78a1ff21b8cd9e9925714c15a9236d6ceacf5c7327c117e6e21e8", size = 1050728, upload-time = "2025-10-06T20:21:52.756Z" },
-    { url = "https://files.pythonhosted.org/packages/4a/42/6573e9129bc55c9bf7300b3a35bef2c6b9117018acca0dc760ac2d93dffe/tiktoken-0.12.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2b90f5ad190a4bb7c3eb30c5fa32e1e182ca1ca79f05e49b448438c3e225a49b", size = 994049, upload-time = "2025-10-06T20:21:53.782Z" },
-    { url = "https://files.pythonhosted.org/packages/66/c5/ed88504d2f4a5fd6856990b230b56d85a777feab84e6129af0822f5d0f70/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:65b26c7a780e2139e73acc193e5c63ac754021f160df919add909c1492c0fb37", size = 1129008, upload-time = "2025-10-06T20:21:54.832Z" },
-    { url = "https://files.pythonhosted.org/packages/f4/90/3dae6cc5436137ebd38944d396b5849e167896fc2073da643a49f372dc4f/tiktoken-0.12.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:edde1ec917dfd21c1f2f8046b86348b0f54a2c0547f68149d8600859598769ad", size = 1152665, upload-time = "2025-10-06T20:21:56.129Z" },
-    { url = "https://files.pythonhosted.org/packages/a3/fe/26df24ce53ffde419a42f5f53d755b995c9318908288c17ec3f3448313a3/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:35a2f8ddd3824608b3d650a000c1ef71f730d0c56486845705a8248da00f9fe5", size = 1194230, upload-time = "2025-10-06T20:21:57.546Z" },
-    { url = "https://files.pythonhosted.org/packages/20/cc/b064cae1a0e9fac84b0d2c46b89f4e57051a5f41324e385d10225a984c24/tiktoken-0.12.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83d16643edb7fa2c99eff2ab7733508aae1eebb03d5dfc46f5565862810f24e3", size = 1254688, upload-time = "2025-10-06T20:21:58.619Z" },
-    { url = "https://files.pythonhosted.org/packages/81/10/b8523105c590c5b8349f2587e2fdfe51a69544bd5a76295fc20f2374f470/tiktoken-0.12.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffc5288f34a8bc02e1ea7047b8d041104791d2ddbf42d1e5fa07822cbffe16bd", size = 878694, upload-time = "2025-10-06T20:21:59.876Z" },
-]
-
 [[package]]
 name = "timm"
 version = "1.0.25"
@@ -6050,15 +5902,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" },
 ]
 
-[[package]]
-name = "types-awscrt"
-version = "0.31.3"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/76/26/0aa563e229c269c528a3b8c709fc671ac2a5c564732fab0852ac6ee006cf/types_awscrt-0.31.3.tar.gz", hash = "sha256:09d3eaf00231e0f47e101bd9867e430873bc57040050e2a3bd8305cb4fc30865", size = 18178, upload-time = "2026-03-08T02:31:14.569Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3e/e5/47a573bbbd0a790f8f9fe452f7188ea72b212d21c9be57d5fc0cbc442075/types_awscrt-0.31.3-py3-none-any.whl", hash = "sha256:e5ce65a00a2ab4f35eacc1e3d700d792338d56e4823ee7b4dbe017f94cfc4458", size = 43340, upload-time = "2026-03-08T02:31:13.38Z" },
-]
-
 [[package]]
 name = "types-python-dateutil"
 version = "2.9.0.20250516"
@@ -6077,15 +5920,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/bb/9bc26fcf5155bd25efeca35f8ba6bffb8b3c9da2baac8bf40067606418f3/types_regex-2026.2.28.20260301-py3-none-any.whl", hash = "sha256:7da7a1fe67528238176a5844fd435ca90617cf605341308686afbc579fdea5c0", size = 11130, upload-time = "2026-03-01T04:11:11.454Z" },
 ]
 
-[[package]]
-name = "types-s3transfer"
-version = "0.16.0"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/fe/64/42689150509eb3e6e82b33ee3d89045de1592488842ddf23c56957786d05/types_s3transfer-0.16.0.tar.gz", hash = "sha256:b4636472024c5e2b62278c5b759661efeb52a81851cde5f092f24100b1ecb443", size = 13557, upload-time = "2025-12-08T08:13:09.928Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/98/27/e88220fe6274eccd3bdf95d9382918716d312f6f6cef6a46332d1ee2feff/types_s3transfer-0.16.0-py3-none-any.whl", hash = "sha256:1c0cd111ecf6e21437cb410f5cddb631bfb2263b77ad973e79b9c6d0cb24e0ef", size = 19247, upload-time = "2025-12-08T08:13:08.426Z" },
-]
-
 [[package]]
 name = "typing-extensions"
 version = "4.15.0"
@@ -6217,15 +6051,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/39/08/aaaad47bc4e9dc8c725e68f9d04865dbcb2052843ff09c97b08904852d84/urllib3-2.6.3-py3-none-any.whl", hash = "sha256:bf272323e553dfb2e87d9bfd225ca7b0f467b919d7bbd355436d3fd37cb0acd4", size = 131584, upload-time = "2026-01-07T16:24:42.685Z" },
 ]
 
-[[package]]
-name = "uuid6"
-version = "2025.0.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/ca/b7/4c0f736ca824b3a25b15e8213d1bcfc15f8ac2ae48d1b445b310892dc4da/uuid6-2025.0.1.tar.gz", hash = "sha256:cd0af94fa428675a44e32c5319ec5a3485225ba2179eefcf4c3f205ae30a81bd", size = 13932, upload-time = "2025-07-04T18:30:35.186Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/3d/b2/93faaab7962e2aa8d6e174afb6f76be2ca0ce89fde14d3af835acebcaa59/uuid6-2025.0.1-py3-none-any.whl", hash = "sha256:80530ce4d02a93cdf82e7122ca0da3ebbbc269790ec1cb902481fa3e9cc9ff99", size = 6979, upload-time = "2025-07-04T18:30:34.001Z" },
-]
-
 [[package]]
 name = "uv"
 version = "0.10.10"

From 40d7e95faa12e7576ba2cf98e9587fe2ea850605 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Wed, 18 Mar 2026 20:09:26 -0400
Subject: [PATCH 4/9] Update MEP directory path in dashboard.py for consistency
 with new structure

---
 .../src/agentic_chartqapro_eval/eval/dashboard.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py
index 5bcdbee..7ea936a 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/eval/dashboard.py
@@ -88,7 +88,7 @@ def load_meps(mep_dir: str) -> dict:
 
 mep_dir_input = st.sidebar.text_input(
     "MEP directory",
-    value="meps/openai_openai/chartqapro/test",
+    value="meps/gemini_gemini/chartqapro/test",
     help="Directory containing .json MEP files",
 )
 metrics_input = st.sidebar.text_input("metrics.jsonl", value="output/metrics.jsonl", help="Output of eval_outputs.py")

From 4928ce473f5f3967fc72265c039a6b7c81ca5d06 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Thu, 19 Mar 2026 11:11:39 -0400
Subject: [PATCH 5/9] Rename Opik references to Langfuse in agentic VQA
 evaluation agents and update tracing parameters for observability.

---
 .../src/agentic_chartqapro_eval/agents/planner_agent.py       | 2 +-
 .../src/agentic_chartqapro_eval/agents/verifier_agent.py      | 2 +-
 .../src/agentic_chartqapro_eval/agents/vision_agent.py        | 4 ++--
 .../src/agentic_chartqapro_eval/runner/run_generate_meps.py   | 4 ++--
 .../src/agentic_chartqapro_eval/tools/ocr_reader_tool.py      | 4 ++--
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
index 5e34591..42abad4 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/planner_agent.py
@@ -148,7 +148,7 @@ def run(self, sample: PerceivedSample, lf_trace: Any = None) -> Tuple[str, dict,
         ----------
         sample : PerceivedSample
             The question and context to plan for.
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             Observability object for logging.
 
         Returns
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
index fabd702..a182f10 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/verifier_agent.py
@@ -216,7 +216,7 @@ def run(
             The inspection plan used by the previous agent.
         vision_parsed : dict
             The draft answer and explanation to audit.
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             Tracing object for observability.
 
         Returns
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
index 4832d66..28bee45 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/agents/vision_agent.py
@@ -189,7 +189,7 @@ def _build_tool(self, lf_trace: Any = None) -> VisionQATool:
 
         Parameters
         ----------
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             A tracing object for observability.
 
         Returns
@@ -228,7 +228,7 @@ def run(
             The question and image to analyze.
         plan : dict
             The inspection procedure to follow.
-        opik_trace : Any, optional
+        lf_trace : Any, optional
             Trace object for execution tracking.
         ocr_result : dict, optional
             Ground-truth OCR data for grounding.
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
index bad9e28..ae97c5e 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/runner/run_generate_meps.py
@@ -138,8 +138,8 @@ def process_sample(  # noqa: PLR0915
         Unique identifier for the current evaluation run.
     out_dir : str
         Directory where the resulting MEP JSON should be saved.
-    opik_client : object, optional
-        The Opik client for tracing.
+    lf_client : object, optional
+        The Langfuse client for tracing and observability.
     verifier_agent : VerifierAgent, optional
         The agent for pass 2.5 verification.
     ocr_tool : OcrReaderTool, optional
diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
index 41677dd..2655b12 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/tools/ocr_reader_tool.py
@@ -115,7 +115,7 @@ def _run(self, image_path: str) -> str:
         start_ts = datetime.now(timezone.utc).isoformat()
         t0 = time.time()
 
-        opik_span = open_llm_span(
+        lf_span = open_llm_span(
             self.lf_trace,
             name="ocr_reader_tool",
             input_data={"image_path": image_path},
@@ -155,7 +155,7 @@ def _run(self, image_path: str) -> str:
         usage = provider_meta.get("usage", {})
 
         close_span(
-            opik_span,
+            lf_span,
             output={"raw_text": raw_text},
             usage=usage if usage else None,
             error=error_str,

From 01f6051fbfd361fab5fb106fe5233444fdcda577 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Thu, 19 Mar 2026 11:31:24 -0400
Subject: [PATCH 6/9] Add integration test instructions to README for API key
 validation

---
 README.md                                  | 12 +++
 implementations/agentic_vqa_eval/README.md | 88 +++++++++++++++++-----
 2 files changed, 83 insertions(+), 17 deletions(-)

diff --git a/README.md b/README.md
index 8eb159a..3f81971 100644
--- a/README.md
+++ b/README.md
@@ -72,6 +72,18 @@ recent research, with fully reproducible notebooks and evaluation pipelines.
    uv run jupyter lab
    ```
 
+5. Run integration tests to validate that your API keys are set up correctly:
+
+   ```bash
+   uv run --env-file .env pytest -sv tests/tool_tests/test_integration.py
+   ```
+
+   > **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root):
+   >
+   > ```bash
+   > onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force
+   > ```
+
 ## License
 
 This project is licensed under the terms of the [LICENSE](LICENSE.md) file in the root directory.
diff --git a/implementations/agentic_vqa_eval/README.md b/implementations/agentic_vqa_eval/README.md
index 353fe8f..5541679 100644
--- a/implementations/agentic_vqa_eval/README.md
+++ b/implementations/agentic_vqa_eval/README.md
@@ -12,6 +12,8 @@ The core contribution is the **Model Evaluation Packet (MEP)** — a portable JS
 
 ## Architecture Overview
 
+<details>
+<summary>Show pipeline diagram</summary>
 
 ```
 ┌──────────────────────────────────────────────────────────┐
@@ -112,6 +114,8 @@ The core contribution is the **Model Evaluation Packet (MEP)** — a portable JS
                (summary.csv by config × question_type)
 ```
 
+</details>
+
 ### Explainability at Four Levels
 
 This framework produces explainability signals at four distinct levels:
@@ -143,6 +147,9 @@ This framework produces explainability signals at four distinct levels:
 
 ## Package Dependencies
 
+<details>
+<summary>Show dependencies table</summary>
+
 | Package | Version | Purpose |
 |---|---|---|
 | `crewai` | 1.10.1 | Multi-agent framework: Agent, Task, Crew, LLM, BaseTool |
@@ -159,10 +166,15 @@ This framework produces explainability signals at four distinct levels:
 | `streamlit` | ≥1.32 | Interactive evaluation dashboard |
 | `jupyter` / `ipykernel` | latest | Analysis notebook |
 
+</details>
+
 ---
 
 ## Internal Package Structure
 
+<details>
+<summary>Show package tree</summary>
+
 ```
 src/agentic_chartqapro_eval/
 ├── utils/
@@ -211,16 +223,18 @@ src/agentic_chartqapro_eval/
     └── ingest.py           — Retroactively import existing MEP files into Langfuse
 ```
 
+</details>
+
 ---
 
 ## Getting Started
 
 ### 1. Install dependencies
 
-From the **root of the repository**, install the `ref6-agentic-xai-eval` dependency group using `uv`:
+From the **root of the repository**, install the `agentic-xai-eval` dependency group using `uv`:
 
 ```bash
-uv sync --group ref6-agentic-xai-eval
+uv sync --group agentic-xai-eval
 source .venv/bin/activate
 ```
 
@@ -240,7 +254,19 @@ cp .env.example .env
 
 The `.env` file lives at the **repo root**. `load_dotenv()` searches upward from the working directory, so it is found automatically regardless of which subdirectory you run commands from.
 
-### 3. Generate MEPs (run the agentic pipeline)
+### 3. Run integration tests to validate your API keys
+
+```bash
+uv run --env-file .env pytest -sv tests/test_integration.py
+```
+
+> **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root):
+>
+> ```bash
+> onboard --bootcamp-name "llm-interpretability-bootcamp" --output-dir "." --test-script "./aieng-llm-interp/tests/test_integration.py" --env-example "./.env.example" --test-marker "integration_test" --force
+> ```
+
+### 4. Generate MEPs (run the agentic pipeline)
 
 > **Note:** All `uv run` commands below use `$(git rev-parse --show-toplevel)` so they work from any directory in the repo — it resolves the repo root for `--env-file`, while `--directory` ensures outputs (`meps/`, `output/`) are written inside `implementations/agentic_vqa_eval/`.
 
@@ -297,7 +323,7 @@ When OCR is skipped, `"ocr": null` appears in the MEP and `"ocr_ms": 0.0` in tim
 
 **Context injection:** The VisionAgent uses a single shared prompt template (`agents/prompts/vision.txt`) that contains an `{ocr_block}` placeholder. When OCR ran successfully, this block is populated with the structured OCR fields (chart type, title, axis labels, legend). When OCR is skipped or produced no output, `{ocr_block}` renders as an empty string — the prompt is otherwise identical. This is a useful example of conditional context injection: the same template handles both modes without branching at the prompt level.
 
-### 4. Evaluate outputs (Pass 1 — accuracy + judge)
+### 5. Evaluate outputs (Pass 1 — accuracy + judge)
 
 ```bash
 uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_outputs \
@@ -314,7 +340,7 @@ When the verifier ran, two extra columns are present:
 
 The `predicted` column always reflects the **final answer** — the verifier's output when it ran, or the vision agent's output when skipped. This means accuracy scores automatically capture any corrections made by the verifier.
 
-### 5. Evaluate traces (Pass 2 — latency and replayability)
+### 6. Evaluate traces (Pass 2 — latency and replayability)
 
 ```bash
 uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.eval_traces \
@@ -322,7 +348,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev
     --out output/trace_metrics.jsonl
 ```
 
-### 6. Run Top-K evaluation (hit@1/2/3)
+### 7. Run Top-K evaluation (hit@1/2/3)
 
 Re-queries the VLM for each MEP asking for the 3 most likely candidate answers:
 
@@ -337,7 +363,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev
 
 This pass does **not** modify existing MEPs or `metrics.jsonl`.
 
-### 7. Summarize results
+### 8. Summarize results
 
 ```bash
 uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.eval.summarize \
@@ -345,7 +371,7 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev
     --out output/summary.csv
 ```
 
-### 8. Failure taxonomy (Pass 4 — VLM-based diagnosis)
+### 9. Failure taxonomy (Pass 4 — VLM-based diagnosis)
 
 This pass asks **why** the agent was wrong, not just **that** it was wrong. A VLM is given the original chart image alongside the wrong answer, the correct answer, the agent's explanation, and the inspection plan — so it can make a *visual* diagnosis of the failure mode.
 
@@ -360,6 +386,9 @@ Each line in `taxonomy.jsonl` contains a `failure_type` (one of the categories b
 
 **Failure categories:**
 
+<details>
+<summary>Show failure categories</summary>
+
 | Category | Description |
 |---|---|
 | `correct` | Model got it right — no VLM call made |
@@ -372,6 +401,8 @@ Each line in `taxonomy.jsonl` contains a `failure_type` (one of the categories b
 | `extraction_error` | Could not locate the relevant data in the chart at all |
 | `other` | Does not fit any category above |
 
+</details>
+
 **Why VLM instead of text-only LLM?**
 A text-only judge can only read the agent's description of what it saw. A VLM can independently verify whether the axis labels were actually ambiguous, whether the cited data point actually appears in the image, or whether the legend entries are genuinely confusing — producing a grounded diagnosis rather than a guess.
 
@@ -403,7 +434,7 @@ for sid in revised:
 "
 ```
 
-### 9. Visualization & Reporting
+### 10. Visualization & Reporting
 
 #### HTML report (no extra dependencies)
 
@@ -459,6 +490,11 @@ Pre-built cells walk through: loading MEPs, accuracy by question type, verifier
 
 Langfuse is an open-source LLM observability platform that adds a live visualization and experiment-comparison layer on top of the MEP artifacts. MEPs remain the portable ground truth; Langfuse is purely additive.
 
+> **Optional:** If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops and the pipeline runs exactly as before.
+
+<details>
+<summary>Show Langfuse setup and usage</summary>
+
 ### What Langfuse gives you
 
 | Feature | Detail |
@@ -510,8 +546,6 @@ LANGFUSE_SECRET_KEY=sk-lf-...
 # LANGFUSE_HOST=https://cloud.langfuse.com  # default; change for self-hosted
 ```
 
-The framework auto-detects these variables. If they are absent, all Langfuse calls are silent no-ops and the pipeline runs exactly as before.
-
 ### 3. Push prompt versions to Langfuse
 
 Run once before starting experiments. This creates versioned entries for `planner.txt` and `vision.txt` in Langfuse Prompt Management so every future experiment links to the exact prompt version used.
@@ -531,11 +565,7 @@ This creates a dataset named `ChartQAPro_test` in Langfuse containing one item p
 
 ### 5. Live tracing (automatic on new runs)
 
-No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically:
-- registers the dataset and versions the prompts at run start
-- opens a Langfuse trace per sample
-- creates `planner` and `vision_qa_tool` child generations with inputs, outputs, and token usage
-- stores the `lf_trace_id` in the MEP for later score attachment
+No extra flags needed. When `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are set, the pipeline automatically registers the dataset and versions the prompts at run start, opens a Langfuse trace per sample, creates child generations with inputs/outputs/token usage, and stores the `lf_trace_id` in the MEP for later score attachment.
 
 ```bash
 uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev-parse --show-toplevel)/implementations/agentic_vqa_eval" -m agentic_chartqapro_eval.runner.run_generate_meps \
@@ -562,10 +592,15 @@ uv run --env-file "$(git rev-parse --show-toplevel)/.env" --directory "$(git rev
     --metrics_file metrics.jsonl   # optional: attaches scores if available
 ```
 
+</details>
+
 ---
 
 ## MEP Schema
 
+<details>
+<summary>Show full MEP schema and field reference</summary>
+
 Each MEP file is a self-contained JSON evaluation artifact:
 
 ```json
@@ -620,17 +655,20 @@ Each MEP file is a self-contained JSON evaluation artifact:
   },
   "timestamps": { "planner_ms": 2185, "ocr_ms": 1243, "vision_ms": 5684, "verifier_ms": 3712 },
   "errors": [],
-  "lf_trace_id": "abc123..."   // present when Langfuse tracing is active
+  "lf_trace_id": "abc123..."
 }
 ```
 
 `ocr` is `null` when `--no_ocr` is passed. When present, `ocr.parsed` contains: `chart_type`, `title`, `x_axis`, `y_axis`, `legend`, `data_labels`, `annotations`.
 
 `verifier` is `null` when `--no_verifier` was passed. When present, `verifier.verdict` is one of:
+
 - `"confirmed"` — second model agreed with the draft answer
 - `"revised"` — second model caught an error and corrected the answer
 - `"skipped"` — verifier ran but fell back due to missing image or error
 
+</details>
+
 ---
 
 ## Resources
@@ -646,33 +684,46 @@ Each MEP file is a self-contained JSON evaluation artifact:
 
 ## FAQ
 
+<details>
+<summary>Show all FAQs</summary>
+
 ### 1. What is the purpose of the MEP schema?
+
 The Model Evaluation Packet (MEP) schema is designed to provide a comprehensive, portable, and reproducible trace of the evaluation process. It captures all relevant details, including the inspection plan, tool calls, timestamps, and errors, enabling post-hoc analysis and comparison across models.
 
 ### 2. Can I use a different dataset with this framework?
+
 Yes, the framework is modular and supports other datasets as long as they are compatible with the expected input format (question, chart image, expected answer). You may need to implement a custom dataset loader in `src/agentic_chartqapro_eval/datasets/`.
 
 ### 3. How do I add a new vision or planner backend?
+
 To add a new backend, you need to:
+
 - Implement the corresponding tool or agent in `src/agentic_chartqapro_eval/tools/` or `src/agentic_chartqapro_eval/agents/`.
 - Update the configuration options in `run_generate_meps.py` to include the new backend.
 
 ### 4. What happens if the VisionAgent produces malformed JSON?
+
 The framework uses the `json_repair` library to attempt to fix malformed JSON outputs. If repair fails, the error is logged in the MEP under the `errors` field.
 
 ### 5. How can I customize the evaluation rubric?
+
 The evaluation rubric is defined in `src/agentic_chartqapro_eval/eval/judge.py`. You can modify the scoring dimensions or add new ones by editing the `judge` function.
 
 ### 6. Is it possible to run the framework without API calls?
+
 Yes, you can use pre-generated MEPs for evaluation by skipping the generation step. This is useful for offline analysis or when API usage is restricted.
 
 ### 7. How do I handle large datasets efficiently?
+
 For large datasets, consider:
+
 - Using the `--n` flag to process a subset of samples.
 - Increasing the `--workers` count to parallelize processing.
 - Running the pipeline on a machine with sufficient memory and disk space.
 
 ### 8. Where can I find more examples or tutorials?
+
 Refer to the Resources section for links to documentation, datasets, and related research papers. Additional examples may be added in future updates.
 
 ### 9. How does the VerifierAgent differ from the LLM judge?
@@ -690,4 +741,7 @@ They serve different purposes and run at different times:
 The verifier improves the pipeline's answer quality; the judge measures the pipeline's reasoning quality.
 
 ### 10. Do I need Langfuse to run the framework?
+
 No. Langfuse is entirely optional. If `LANGFUSE_PUBLIC_KEY` and `LANGFUSE_SECRET_KEY` are not set in `.env`, all Langfuse calls are silent no-ops. The pipeline produces the same MEPs, `metrics.jsonl`, and `summary.csv` as before.
+
+</details>

From 335b468dafd55e614d7ace3e6079e60ad6ac43ae Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Thu, 19 Mar 2026 11:42:54 -0400
Subject: [PATCH 7/9] Update Langfuse integration to require version 4 and add
 fallback for missing attributes propagation

---
 .../langfuse_integration/tracing.py           | 24 +++++++++++++++++--
 pyproject.toml                                |  2 +-
 uv.lock                                       |  2 +-
 3 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
index 546c99c..89d5f47 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/tracing.py
@@ -8,11 +8,31 @@
 from contextlib import contextmanager
 from typing import Optional
 
-from langfuse import propagate_attributes
+
+try:
+    from langfuse import propagate_attributes  # requires langfuse>=4
+except Exception:
+
+    @contextmanager  # type: ignore[misc]
+    def propagate_attributes(**_: object):  # type: ignore[misc]
+        """Fallback no-op context manager if langfuse v4 is not available."""
+        yield
 
 
 def _normalize_usage(usage: dict) -> dict:
-    """Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys."""
+    """
+    Map provider usage dicts (OpenAI/Gemini) to Langfuse v4 usage_details keys.
+
+    Parameters
+    ----------
+    usage : dict
+        The raw usage dict from the provider.
+
+    Returns
+    -------
+    dict
+        Normalized usage details for Langfuse.
+    """
     normalized: dict = {}
     # OpenAI keys
     if "prompt_tokens" in usage:
diff --git a/pyproject.toml b/pyproject.toml
index 265a883..3851f9d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -23,7 +23,7 @@ dependencies = [
     "python-dotenv>=1.2.2",
     "scikit-learn>=1.5.2",
     "transformers>=4.47.0",
-    "langfuse>=3.10.3",
+    "langfuse>=4",
 ]
 
 [dependency-groups]
diff --git a/uv.lock b/uv.lock
index db5369c..8ec741e 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2099,7 +2099,7 @@ requires-dist = [
     { name = "ipywidgets", specifier = ">=8.1.7" },
     { name = "jupyter", specifier = ">=1.1.1" },
     { name = "jupyterlab", specifier = ">=4.4.8" },
-    { name = "langfuse", specifier = ">=3.10.3" },
+    { name = "langfuse", specifier = ">=4" },
     { name = "matplotlib", specifier = ">=3.10.5" },
     { name = "numpy", specifier = ">=1.26,<2.0" },
     { name = "openai", specifier = ">=2.8.0" },

From 103570945446e38b8dbb1232364035c06bfd22f8 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Thu, 19 Mar 2026 12:33:39 -0400
Subject: [PATCH 8/9] Fix integration test command path in README for
 consistency

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3f81971..53a0e41 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,7 @@ recent research, with fully reproducible notebooks and evaluation pipelines.
 5. Run integration tests to validate that your API keys are set up correctly:
 
    ```bash
-   uv run --env-file .env pytest -sv tests/tool_tests/test_integration.py
+   uv run --env-file .env pytest -sv tests/test_integration.py
    ```
 
    > **Note:** If your `.env` file is incomplete or needs to be updated, you can re-run onboarding manually from inside your Coder workspace (from the repo root):

From 926c0888b7d6a434b1d8ad7503bde24a1248b076 Mon Sep 17 00:00:00 2001
From: Aravind N <aravindn1308@outlook.com>
Date: Thu, 19 Mar 2026 17:45:43 -0400
Subject: [PATCH 9/9] Add Google GenAI and OpenAI instrumentation support in
 Langfuse integration

---
 .../langfuse_integration/client.py            | 23 ++++++
 pyproject.toml                                |  2 +
 uv.lock                                       | 78 +++++++++++++++++++
 3 files changed, 103 insertions(+)

diff --git a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
index c725ce9..aefdf1b 100644
--- a/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
+++ b/implementations/agentic_vqa_eval/src/agentic_chartqapro_eval/langfuse_integration/client.py
@@ -11,6 +11,21 @@
 from langfuse import Langfuse
 
 
+try:
+    from openinference.instrumentation.google_genai import GoogleGenAIInstrumentor
+
+    _google_instrumentor = GoogleGenAIInstrumentor()
+except Exception:
+    _google_instrumentor = None  # type: ignore[assignment]
+
+try:
+    from openinference.instrumentation.openai import OpenAIInstrumentor
+
+    _openai_instrumentor = OpenAIInstrumentor()
+except Exception:
+    _openai_instrumentor = None  # type: ignore[assignment]
+
+
 _client = None
 _initialised = False
 
@@ -51,6 +66,14 @@ def get_client():
             kwargs["host"] = host
 
         _client = Langfuse(**kwargs)
+        # Activate OTel auto-instrumentation so provider SDK calls (Google GenAI,
+        # OpenAI) are captured as detailed child spans inside Langfuse traces.
+        if _google_instrumentor is not None:
+            with suppress(Exception):
+                _google_instrumentor.instrument()
+        if _openai_instrumentor is not None:
+            with suppress(Exception):
+                _openai_instrumentor.instrument()
     except Exception as exc:
         print(f"[langfuse] client init failed: {exc}")
         _client = None
diff --git a/pyproject.toml b/pyproject.toml
index 3851f9d..3b66108 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -30,6 +30,8 @@ dependencies = [
 agentic-xai-eval = [
     "crewai>=1.6.1",
     "google-genai>=1.67.0",
+    "openinference-instrumentation-google-genai>=0.1.0",
+    "openinference-instrumentation-openai>=0.1.0",
     "streamlit>=1.55.0",
 ]
 dev = [
diff --git a/uv.lock b/uv.lock
index 8ec741e..8e4c60b 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2035,6 +2035,8 @@ dependencies = [
 agentic-xai-eval = [
     { name = "crewai" },
     { name = "google-genai" },
+    { name = "openinference-instrumentation-google-genai" },
+    { name = "openinference-instrumentation-openai" },
     { name = "streamlit" },
 ]
 dev = [
@@ -2113,6 +2115,8 @@ requires-dist = [
 agentic-xai-eval = [
     { name = "crewai", specifier = ">=1.6.1" },
     { name = "google-genai", specifier = ">=1.67.0" },
+    { name = "openinference-instrumentation-google-genai", specifier = ">=0.1.0" },
+    { name = "openinference-instrumentation-openai", specifier = ">=0.1.0" },
     { name = "streamlit", specifier = ">=1.55.0" },
 ]
 dev = [
@@ -3675,6 +3679,65 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a4/7d/f1c30a92854540bf789e9cd5dde7ef49bbe63f855b85a2e6b3db8135c591/opencv_python-4.11.0.86-cp37-abi3-win_amd64.whl", hash = "sha256:085ad9b77c18853ea66283e98affefe2de8cc4c1f43eda4c100cf9b2721142ec", size = 39488044, upload-time = "2025-01-16T13:52:21.928Z" },
 ]
 
+[[package]]
+name = "openinference-instrumentation"
+version = "0.1.46"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "openinference-semantic-conventions" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-sdk" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/64/8d/9b76b43e8b2ee2ccf1fe15b21c924095f9c0e4839919bcd4951b1c99c2ab/openinference_instrumentation-0.1.46.tar.gz", hash = "sha256:0b520002a1c682c525dcab49005c209bfd71611e8e4e4933b49779d5e899e6db", size = 23937, upload-time = "2026-03-04T10:13:48.883Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/25/d1/f6668492152a4180492044313e2dc427fbc237904f6bb1629abd030e3469/openinference_instrumentation-0.1.46-py3-none-any.whl", hash = "sha256:f7b63ccd5f93ce82e4e40035c9faa6b021984cbe06ad791f4cf033551533bc48", size = 30124, upload-time = "2026-03-04T10:13:47.613Z" },
+]
+
+[[package]]
+name = "openinference-instrumentation-google-genai"
+version = "0.1.13"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "openinference-instrumentation" },
+    { name = "openinference-semantic-conventions" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/7f/b0/91785c0d8740c488c7d9c0789985cccafbd4dad53266242a19d511603feb/openinference_instrumentation_google_genai-0.1.13.tar.gz", hash = "sha256:088a7300264486a41db2ab44b08848aaac788d0b6a3d61ff12d66b9b3b0703fb", size = 55136, upload-time = "2026-03-11T04:45:48.223Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/72/38/9c5f8ff01cce2e93b92abd6779c6d5bad8ddc399d26a5debc7ed9a32782c/openinference_instrumentation_google_genai-0.1.13-py3-none-any.whl", hash = "sha256:b14485015a4603accba17f77636501b68bc163e95bc9cf65ffb64caf60544cfc", size = 29135, upload-time = "2026-03-11T04:45:45.471Z" },
+]
+
+[[package]]
+name = "openinference-instrumentation-openai"
+version = "0.1.42"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "openinference-instrumentation" },
+    { name = "openinference-semantic-conventions" },
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-instrumentation" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "typing-extensions" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/f1/e4/cf114f6fedc90dde6e1d4062e55686542f8b7636a4d3340b81a49b1a09a8/openinference_instrumentation_openai-0.1.42.tar.gz", hash = "sha256:6f6b340292ab7dd7dc2e9a944958f7f812108efaafbfbcaa3f7ba205744ad1ce", size = 22839, upload-time = "2026-03-11T04:45:51.37Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/c5/88/eaaa4840bf1ed8ff8c0927cd6ad5653ee0cfac14bfcb4e1e8f06fb0be9e8/openinference_instrumentation_openai-0.1.42-py3-none-any.whl", hash = "sha256:e7ff7b98612102d4a3e342842d3dd231709ff51abdc4b193e5df09e9afcfac0f", size = 30333, upload-time = "2026-03-11T04:45:48.535Z" },
+]
+
+[[package]]
+name = "openinference-semantic-conventions"
+version = "0.1.28"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/df/32/c79bf8bd3ea5a00e492449b31ca600bbc2a8e88a301e42c872af925a156c/openinference_semantic_conventions-0.1.28.tar.gz", hash = "sha256:6388465174e8ab3f27ebc6a9e9bb2e1b804d30caefb57234e16db874da1c6a7b", size = 12893, upload-time = "2026-03-11T04:45:46.543Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/04/40/34b570462c3ce250277254bb0cca655eb39b64c0dffe63cd7751f103f8d6/openinference_semantic_conventions-0.1.28-py3-none-any.whl", hash = "sha256:a2fed5bb167aa56c1c7448cdb7a8d775f989339ba1f8b04a7b45d4f8388cccfb", size = 10522, upload-time = "2026-03-11T04:45:45.423Z" },
+]
+
 [[package]]
 name = "openpyxl"
 version = "3.1.5"
@@ -3748,6 +3811,21 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/a0/3a/8865d6754e61c9fb170cdd530a124a53769ee5f740236064816eb0ca7301/opentelemetry_exporter_otlp_proto_http-1.40.0-py3-none-any.whl", hash = "sha256:a8d1dab28f504c5d96577d6509f80a8150e44e8f45f82cdbe0e34c99ab040069", size = 19960, upload-time = "2026-03-04T14:17:07.153Z" },
 ]
 
+[[package]]
+name = "opentelemetry-instrumentation"
+version = "0.61b0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "opentelemetry-api" },
+    { name = "opentelemetry-semantic-conventions" },
+    { name = "packaging" },
+    { name = "wrapt" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/da/37/6bf8e66bfcee5d3c6515b79cb2ee9ad05fe573c20f7ceb288d0e7eeec28c/opentelemetry_instrumentation-0.61b0.tar.gz", hash = "sha256:cb21b48db738c9de196eba6b805b4ff9de3b7f187e4bbf9a466fa170514f1fc7", size = 32606, upload-time = "2026-03-04T14:20:16.825Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/d8/3e/f6f10f178b6316de67f0dfdbbb699a24fbe8917cf1743c1595fb9dcdd461/opentelemetry_instrumentation-0.61b0-py3-none-any.whl", hash = "sha256:92a93a280e69788e8f88391247cc530fd81f16f2b011979d4d6398f805cfbc63", size = 33448, upload-time = "2026-03-04T14:19:02.447Z" },
+]
+
 [[package]]
 name = "opentelemetry-proto"
 version = "1.40.0"