Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 108 additions & 10 deletions benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import cv2
import urllib.request
from libs.config.settings import settings
from ultralytics import YOLO # <-- FIX: Yeh line missing thi, ab model load ho jayega!
from ultralytics import YOLO # <-- FIX: Loaded successfully

class PipelineBenchmark:
def __init__(self, redis_url=settings.REDIS_URL):
Expand Down Expand Up @@ -46,14 +46,24 @@ def monitor_memory(self):
pass
time.sleep(0.05)

def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320):
def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320, device=None):
# 1. Reset run state and metrics at start (CodeRabbit State Fix)
self._stop_memory_monitor = False
self.peak_ram = 0
for key in self.metrics:
self.metrics[key].clear()

print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark using model: {model_path}...")
# Dynamic device determination
if device is None:
if model_path.endswith(".engine"):
device = "cuda"
else:
device = "cuda" if torch.cuda.is_available() else "cpu"

print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark...")
print(f" Model: {model_path}")
print(f" Device: {device.upper()}")
print(f" Frames: {num_frames}")

# Cross-Machine Reproducibility Check: Video download automation fallback
if not os.path.exists(video_source) and video_source == "data/sample_videos/sample.mp4":
Expand Down Expand Up @@ -83,8 +93,10 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide
model = YOLO(model_path, task='detect')
# Warmup frames setup
fake_tensor = torch.rand(1, 3, img_size, img_size)
if "cuda" in device.lower():
fake_tensor = fake_tensor.cuda()
for _ in range(5):
model.predict(fake_tensor, verbose=False, device='cpu', imgsz=img_size)
model.predict(fake_tensor, verbose=False, device=device, imgsz=img_size)
use_real_model = True
print("✨ Real YOLO model successfully loaded into the benchmark pipeline!")
except Exception as e:
Expand Down Expand Up @@ -116,11 +128,13 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide

if frame_to_process is None:
frame_to_process = torch.rand(1, 3, img_size, img_size)
if "cuda" in device.lower():
frame_to_process = frame_to_process.cuda()

# 1. Measure Detection Speed
t0 = time.time()
if use_real_model:
model.predict(frame_to_process, verbose=False, device='cpu', imgsz=img_size)
model.predict(frame_to_process, verbose=False, device=device, imgsz=img_size)
else:
time.sleep(0.015)
self.metrics["detection_times"].append(time.time() - t0)
Expand Down Expand Up @@ -164,7 +178,7 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide
cap.release()

total_duration = time.time() - start_total
self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream")
return self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream")

def generate_report(self, total_duration, model_used, source_used):
avg_det_time = np.mean(self.metrics["detection_times"])
Expand All @@ -177,7 +191,6 @@ def generate_report(self, total_duration, model_used, source_used):

os.makedirs("docs/benchmarks", exist_ok=True)

# Dynamic logic for Mermaid timelines (CodeRabbit Dynamic Timeline Fix)
det_ms = avg_det_time * 1000
track_ms = avg_track
redis_ms = avg_redis
Expand Down Expand Up @@ -227,10 +240,95 @@ def generate_report(self, total_duration, model_used, source_used):
print("\n🏆 Benchmark ran successfully!")
print(f"📊 Workload Source Verified: {source_used}")
print("📁 Report generated at: docs/benchmarks/pipeline_benchmark.md")

return {
"model": model_used,
"fps": fps,
"latency_ms": det_ms,
"e2e_ms": avg_e2e,
"ram_mb": self.peak_ram
}

def run_comparative_benchmark(benchrunner, models, num_frames=100):
"""Runs performance benchmarking across multiple model formats and outputs a consolidated report."""
results = []
print("\n🔍 Initiating Cross-Format Model Benchmark Comparison...")

for label, path in models.items():
if os.path.exists(path):
try:
# Decide device automatically based on model suffix
device = "cuda" if path.endswith(".engine") or torch.cuda.is_available() else "cpu"
res = benchrunner.run_full_pipeline_benchmark(model_path=path, num_frames=num_frames, device=device)
res["format"] = label
results.append(res)
except Exception as e:
print(f"❌ Failed to run benchmark for {label} ({path}): {e}")
else:
print(f"⚠️ Skipping comparison for format '{label}' since file was not found at '{path}'.")
Comment on lines +257 to +268
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Device picker can request CUDA when CUDA is unavailable.

device = "cuda" if path.endswith(".engine") or torch.cuda.is_available() else "cpu"

Operator precedence is (path.endswith(".engine")) or (torch.cuda.is_available()), so any .engine path forces device="cuda" even on a CPU-only host. That conflicts with the PR's "graceful fallback" promise — YOLO("yolov8n.engine", task="detect").predict(..., device="cuda") will raise on a machine without CUDA rather than skip the format. Note that, unlike Detector, this comparative runner uses YOLO directly and does not benefit from the loader's fallback logic, so the guard needs to live here.

🛡️ Proposed fix
-                # Decide device automatically based on model suffix
-                device = "cuda" if path.endswith(".engine") or torch.cuda.is_available() else "cpu"
-                res = benchrunner.run_full_pipeline_benchmark(model_path=path, num_frames=num_frames, device=device)
+                # Skip .engine on hosts without CUDA; otherwise pick CUDA when available
+                if path.endswith(".engine") and not torch.cuda.is_available():
+                    print(f"⚠️ Skipping '{label}' — '{path}' requires CUDA, which is unavailable on this host.")
+                    continue
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                res = benchrunner.run_full_pipeline_benchmark(model_path=path, num_frames=num_frames, device=device)
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@benchmark.py` around lines 257 - 268, The device selection logic in
benchmark.py sets device="cuda" whenever path.endswith(".engine") is true, even
if CUDA isn't available; update the condition so CUDA is chosen only when the
model is an .engine file AND torch.cuda.is_available() is true (otherwise fall
back to "cpu") before calling benchrunner.run_full_pipeline_benchmark; locate
the device assignment near the loop over models and adjust the boolean
expression that sets device accordingly.


if not results:
print("❌ No models were successfully benchmarked.")
return

# Generate Markdown Table Comparison
table_rows = []
for r in results:
table_rows.append(
f"| **{r['format']}** | `{r['model']}` | {r['fps']:.2f} | {r['latency_ms']:.2f} ms | {r['e2e_ms']:.2f} ms | {r['ram_mb']:.1f} MB |"
)

comparison_md = (
f"# Consolidated Model Format Comparison Report\n\n"
f"Generated automatically on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
f"This report compares the performance of Eagle core detection using various model formats on the current hardware.\n\n"
f"## Performance Summary\n\n"
f"| Model Format | Model Path | Throughput (FPS) | Detection Latency | E2E Latency | Peak RAM Usage |\n"
f"| :--- | :--- | :--- | :--- | :--- | :--- |\n"
+ "\n".join(table_rows) + "\n\n"
f"### Hardware / Environmental Diagnostics\n"
f"- **CUDA Available:** `{torch.cuda.is_available()}`\n"
f"- **Active GPU:** `{torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None (CPU)'}`\n\n"
f"### Summary Analysis\n"
f"- **TensorRT (.engine)** provides compiled CUDA-kernel optimization for the absolute lowest possible latency and highest FPS throughput on NVIDIA devices.\n"
f"- **ONNX (.onnx)** formats offer standardized execution via ONNX Runtime with substantial speedups compared to raw PyTorch CPU inference.\n"
f"- **PyTorch (.pt)** files serve as the robust development standard and baseline framework.\n"
)

os.makedirs("docs/benchmarks", exist_ok=True)
report_path = "docs/benchmarks/comparison_report.md"
with open(report_path, "w", encoding="utf-8") as f:
f.write(comparison_md)

print("\n==========================================================================")
print("🏆 Consolidated Cross-Format Comparison Complete!")
print(f"📁 Comparison report generated at: {report_path}")
print("==========================================================================")


if __name__ == "__main__":
int8_path = "yolov8n_int8_openvino_model"
import argparse
parser = argparse.ArgumentParser(description="Run Eagle performance benchmarks")
parser.add_argument("--model", type=str, default=None, help="Path to a specific model to benchmark")
parser.add_argument("--compare", action="store_true", default=True, help="Run comparisons across formats (.pt, .onnx, .engine)")
parser.add_argument("--frames", type=int, default=100, help="Number of frames to benchmark")
args = parser.parse_args()

REDIS_ENV_URL = os.getenv("REDIS_URL", settings.REDIS_URL)

benchrunner = PipelineBenchmark(redis_url=REDIS_ENV_URL)
benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=100)

if args.model:
# Benchmark specific model
benchrunner.run_full_pipeline_benchmark(model_path=args.model, num_frames=args.frames)
elif args.compare:
# Cross-format comparison candidate paths
candidate_models = {
"PyTorch (.pt)": "yolov8n.pt",
"ONNX (.onnx)": "yolov8n.onnx",
"TensorRT (.engine)": "yolov8n.engine"
}
run_comparative_benchmark(benchrunner, candidate_models, num_frames=args.frames)
else:
# Default single run
int8_path = "yolov8n_int8_openvino_model"
benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=args.frames)
Comment on lines 309 to +334
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical | ⚡ Quick win

--compare default makes the else branch unreachable and removes opt-out.

parser.add_argument("--compare", action="store_true", default=True, ...) with store_true and default=True means args.compare is always truthy regardless of whether the flag is passed. As a result:

  • The else branch at L331-333 (single OpenVINO run) is dead code.
  • Users have no way to disable comparative mode without supplying --model, which contradicts a typical opt-in --compare semantic.

Either default to False (idiomatic for store_true) or model the choices as a mutually exclusive group.

🐛 Proposed fix
-    parser.add_argument("--compare", action="store_true", default=True, help="Run comparisons across formats (.pt, .onnx, .engine)")
+    parser.add_argument("--compare", action="store_true", help="Run comparisons across formats (.pt, .onnx, .engine)")

Or, if compare-by-default is the intent, drop the unreachable else branch and document that behavior explicitly.

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@benchmark.py` around lines 309 - 334, The --compare flag is defined with
store_true but default=True so args.compare is always truthy, making the else
branch unreachable; update the argparse setup so --compare is opt-in (set
default=False) or use a mutually exclusive group between --model and --compare,
then adjust the control flow that calls run_comparative_benchmark(benchrunner,
candidate_models, num_frames=...) and
benchrunner.run_full_pipeline_benchmark(model_path=..., num_frames=...)
accordingly (references: parser.add_argument("--compare", ...), args.compare,
run_comparative_benchmark, benchrunner.run_full_pipeline_benchmark,
candidate_models).

101 changes: 101 additions & 0 deletions docs/tensorrt_conversion.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# TensorRT Compilation & Optimization Guide

This guide covers installing dependencies, converting models into high-performance TensorRT `.engine` formats, and running optimized inference using Eagle’s smart automatic fallback protocol.

---

## 🚀 Why TensorRT?

NVIDIA TensorRT is a high-performance deep learning inference library that optimizes neural network models for deployment on NVIDIA GPUs and Jetson hardware. Utilizing `.engine` formats provides:

* **Up to 5x Faster Inference**: Highly optimized CUDA kernels tailored directly to your GPU.
* **Low Latency & High FPS**: Crucial for real-time surveillance and anomaly detection.
* **FP16 Half-Precision Optimization**: Reduces memory footprint and doubles processing speed with negligible accuracy loss.
* **Dynamic Batching & Memory Efficiency**: Conserves scarce GPU memory (VRAM).

---

## 🛠️ 1. Installation & Setup

To compile and execute `.engine` models, your host machine requires the CUDA Toolkit, cuDNN, TensorRT, and the PyCUDA Python bindings.

### Step A: Install NVIDIA Drivers & CUDA Toolkit
1. Download and install compatible **NVIDIA GPU Drivers** from [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx).
2. Download and install **CUDA Toolkit 11.8 or 12.x** from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive).
3. Ensure CUDA is added to your environment `PATH` variables. Verify by running:
```bash
nvcc --version
```

### Step B: Install cuDNN
1. Download **cuDNN** (matching your CUDA version) from the [cuDNN Download Portal](https://developer.nvidia.com/cudnn).
2. Copy cuDNN headers and libraries into your local CUDA Toolkit directory.

### Step C: Install TensorRT
1. Download **NVIDIA TensorRT** matching your CUDA version from [TensorRT Portal](https://developer.nvidia.com/tensorrt).
2. Follow the installation guide to unzip and add TensorRT binaries to your system library path.
3. Install the TensorRT Python wheel matching your Python version (found in the `python/` directory of the TensorRT package):
```bash
pip install tensorrt
```

### Step D: Install PyCUDA
PyCUDA is required for low-level memory copies (DMA transfer coordination) on NVIDIA GPUs.
```bash
pip install pycuda
```

---

## 📦 2. Model Conversion Using `export_tensorrt.py`

We have provided a streamlined conversion script in `scripts/export_tensorrt.py` to automate the compilation of `.pt` or `.onnx` models into `.engine` format.

### Basic Compilation (Recommended FP16)
To compile a PyTorch YOLOv8 baseline model using optimized **FP16 half-precision**, run:
```bash
python scripts/export_tensorrt.py --model yolov8n.pt --fp16
```
This automatically compiles the model and saves a newly optimized `yolov8n.engine` file in the same directory!

### Command Parameters

| Flag | Type | Default | Description |
| :--- | :--- | :--- | :--- |
| `--model` | `str` | `yolov8n.pt` | Path to the source `.pt` or `.onnx` model file to compile. |
| `--fp16` | `bool` | `True` | Enables FP16 half-precision optimization (highly recommended). |
| `--int8` | `bool` | `False` | Enables INT8 quantization (requires calibrating dataset). |
| `--imgsz` | `int` | `640` | Resolution width/height of input frames (default: 640). |
| `--device` | `str` | `cuda:0` | GPU device on which to run the compilation (default: `cuda:0`). |

Comment on lines +61 to +69
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

Docs reflect the misleading --fp16 default; update alongside CLI fix.

The table documents --fp16 as a bool defaulting to True, which matches the current (broken) CLI behavior where the flag cannot be disabled. Once the CLI is corrected to use BooleanOptionalAction (see review on scripts/export_tensorrt.py), please document the --no-fp16 opt-out here as well. Also, per markdownlint MD058, surround the table with blank lines (already mostly present — verify there is a blank line directly before Line 62 and after Line 68).

🧰 Tools
🪛 markdownlint-cli2 (0.22.1)

[warning] 62-62: Tables should be surrounded by blank lines

(MD058, blanks-around-tables)

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@docs/tensorrt_conversion.md` around lines 61 - 69, Update the docs to reflect
the corrected CLI behavior for the --fp16 flag: document that --fp16 remains
enabled by default but can be disabled with the generated --no-fp16 opt-out
(mirror the BooleanOptionalAction change made in scripts/export_tensorrt.py) and
ensure the table lists `--fp16` (bool, Default: True) and `--no-fp16` as the
opt-out; also add a blank line immediately before and after the Markdown table
to satisfy MD058.

---

## 🧠 3. Smart Automatic Fallback Execution

You **do not need** to modify your application logic or worry about crashing on non-GPU/non-TensorRT machines. The system implements a **smart fallback routing layer**:

1. **Auto-Search**: The `Detector` class checks if a matching `.engine` file exists in the directory of your configured model (e.g. if `yolov8n.pt` is requested, it looks for `yolov8n.engine`).
2. **Auto-Promote**: If the `.engine` model is present and CUDA/TensorRT drivers are available, the detector automatically loads the optimized TensorRT engine for accelerated performance.
3. **Resilient Fallback**: If the `.engine` file is missing, corrupted, compiled on a different GPU, or if TensorRT is not supported on the host system, the code:
- Prints a non-blocking warning log: `Failed to load TensorRT engine. Triggering automatic fallback...`
- Automatically loads the baseline `.pt` or `.onnx` file and continues normal execution without interruption.

---

## 📊 4. Running the Performance Benchmarks

To measure the latency and FPS throughput improvements, we have upgraded `benchmark.py` to test and compare multiple formats.

### Run Multi-Format Comparative Benchmark:
```bash
python benchmark.py --compare
```

This runs a simulated video pipeline processing frames across `.pt`, `.onnx`, and `.engine` files, and generates a unified report under:
`docs/benchmarks/comparison_report.md`

### Benchmark a Single Specific Model:
```bash
python benchmark.py --model yolov8n.engine
```
Report generated under:
`docs/benchmarks/pipeline_benchmark.md`
Loading
Loading