-
Notifications
You must be signed in to change notification settings - Fork 57
feat: add TensorRT .engine support with auto-fallback and benchmarking #98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,7 @@ | |
| import cv2 | ||
| import urllib.request | ||
| from libs.config.settings import settings | ||
| from ultralytics import YOLO # <-- FIX: Yeh line missing thi, ab model load ho jayega! | ||
| from ultralytics import YOLO # <-- FIX: Loaded successfully | ||
|
|
||
| class PipelineBenchmark: | ||
| def __init__(self, redis_url=settings.REDIS_URL): | ||
|
|
@@ -46,14 +46,24 @@ def monitor_memory(self): | |
| pass | ||
| time.sleep(0.05) | ||
|
|
||
| def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320): | ||
| def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320, device=None): | ||
| # 1. Reset run state and metrics at start (CodeRabbit State Fix) | ||
| self._stop_memory_monitor = False | ||
| self.peak_ram = 0 | ||
| for key in self.metrics: | ||
| self.metrics[key].clear() | ||
|
|
||
| print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark using model: {model_path}...") | ||
| # Dynamic device determination | ||
| if device is None: | ||
| if model_path.endswith(".engine"): | ||
| device = "cuda" | ||
| else: | ||
| device = "cuda" if torch.cuda.is_available() else "cpu" | ||
|
|
||
| print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark...") | ||
| print(f" Model: {model_path}") | ||
| print(f" Device: {device.upper()}") | ||
| print(f" Frames: {num_frames}") | ||
|
|
||
| # Cross-Machine Reproducibility Check: Video download automation fallback | ||
| if not os.path.exists(video_source) and video_source == "data/sample_videos/sample.mp4": | ||
|
|
@@ -83,8 +93,10 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide | |
| model = YOLO(model_path, task='detect') | ||
| # Warmup frames setup | ||
| fake_tensor = torch.rand(1, 3, img_size, img_size) | ||
| if "cuda" in device.lower(): | ||
| fake_tensor = fake_tensor.cuda() | ||
| for _ in range(5): | ||
| model.predict(fake_tensor, verbose=False, device='cpu', imgsz=img_size) | ||
| model.predict(fake_tensor, verbose=False, device=device, imgsz=img_size) | ||
| use_real_model = True | ||
| print("✨ Real YOLO model successfully loaded into the benchmark pipeline!") | ||
| except Exception as e: | ||
|
|
@@ -116,11 +128,13 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide | |
|
|
||
| if frame_to_process is None: | ||
| frame_to_process = torch.rand(1, 3, img_size, img_size) | ||
| if "cuda" in device.lower(): | ||
| frame_to_process = frame_to_process.cuda() | ||
|
|
||
| # 1. Measure Detection Speed | ||
| t0 = time.time() | ||
| if use_real_model: | ||
| model.predict(frame_to_process, verbose=False, device='cpu', imgsz=img_size) | ||
| model.predict(frame_to_process, verbose=False, device=device, imgsz=img_size) | ||
| else: | ||
| time.sleep(0.015) | ||
| self.metrics["detection_times"].append(time.time() - t0) | ||
|
|
@@ -164,7 +178,7 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide | |
| cap.release() | ||
|
|
||
| total_duration = time.time() - start_total | ||
| self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream") | ||
| return self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream") | ||
|
|
||
| def generate_report(self, total_duration, model_used, source_used): | ||
| avg_det_time = np.mean(self.metrics["detection_times"]) | ||
|
|
@@ -177,7 +191,6 @@ def generate_report(self, total_duration, model_used, source_used): | |
|
|
||
| os.makedirs("docs/benchmarks", exist_ok=True) | ||
|
|
||
| # Dynamic logic for Mermaid timelines (CodeRabbit Dynamic Timeline Fix) | ||
| det_ms = avg_det_time * 1000 | ||
| track_ms = avg_track | ||
| redis_ms = avg_redis | ||
|
|
@@ -227,10 +240,95 @@ def generate_report(self, total_duration, model_used, source_used): | |
| print("\n🏆 Benchmark ran successfully!") | ||
| print(f"📊 Workload Source Verified: {source_used}") | ||
| print("📁 Report generated at: docs/benchmarks/pipeline_benchmark.md") | ||
|
|
||
| return { | ||
| "model": model_used, | ||
| "fps": fps, | ||
| "latency_ms": det_ms, | ||
| "e2e_ms": avg_e2e, | ||
| "ram_mb": self.peak_ram | ||
| } | ||
|
|
||
def run_comparative_benchmark(benchrunner, models, num_frames=100):
    """Benchmark several model formats and write a consolidated Markdown report.

    Args:
        benchrunner: Object exposing ``run_full_pipeline_benchmark(model_path=...,
            num_frames=..., device=...)`` that returns a dict with the keys
            ``model``, ``fps``, ``latency_ms``, ``e2e_ms`` and ``ram_mb``.
        models: Mapping of human-readable format label -> model path.
        num_frames: Number of frames to push through each benchmark run.

    Returns:
        The list of per-format result dicts (each extended with a ``format``
        key). The list is empty, and no report is written, when no model could
        be benchmarked.
    """
    results = []
    # Query once: the answer cannot change between formats in a single run.
    cuda_available = torch.cuda.is_available()
    print("\n🔍 Initiating Cross-Format Model Benchmark Comparison...")

    for label, path in models.items():
        if not os.path.exists(path):
            print(f"⚠️ Skipping comparison for format '{label}' since file was not found at '{path}'.")
            continue

        # A TensorRT engine can only run on a CUDA device. Requesting "cuda" on
        # a CPU-only host would make the model load fail, and the benchmark
        # would then time its simulated-model fallback and report those numbers
        # under the TensorRT label. Skip the format instead.
        if path.endswith(".engine") and not cuda_available:
            print(f"⚠️ Skipping comparison for format '{label}' since CUDA is not available for TensorRT engine '{path}'.")
            continue

        device = "cuda" if cuda_available else "cpu"
        try:
            res = benchrunner.run_full_pipeline_benchmark(model_path=path, num_frames=num_frames, device=device)
            res["format"] = label
            results.append(res)
        except Exception as e:
            # Best effort: one failing format must not abort the whole comparison.
            print(f"❌ Failed to run benchmark for {label} ({path}): {e}")

    if not results:
        print("❌ No models were successfully benchmarked.")
        return results

    # Generate Markdown Table Comparison
    table_rows = [
        f"| **{r['format']}** | `{r['model']}` | {r['fps']:.2f} | {r['latency_ms']:.2f} ms | {r['e2e_ms']:.2f} ms | {r['ram_mb']:.1f} MB |"
        for r in results
    ]
    active_gpu = torch.cuda.get_device_name(0) if cuda_available else "None (CPU)"

    comparison_md = (
        f"# Consolidated Model Format Comparison Report\n\n"
        f"Generated automatically on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
        f"This report compares the performance of Eagle core detection using various model formats on the current hardware.\n\n"
        f"## Performance Summary\n\n"
        f"| Model Format | Model Path | Throughput (FPS) | Detection Latency | E2E Latency | Peak RAM Usage |\n"
        f"| :--- | :--- | :--- | :--- | :--- | :--- |\n"
        + "\n".join(table_rows) + "\n\n"
        f"### Hardware / Environmental Diagnostics\n"
        f"- **CUDA Available:** `{cuda_available}`\n"
        f"- **Active GPU:** `{active_gpu}`\n\n"
        f"### Summary Analysis\n"
        f"- **TensorRT (.engine)** provides compiled CUDA-kernel optimization for the absolute lowest possible latency and highest FPS throughput on NVIDIA devices.\n"
        f"- **ONNX (.onnx)** formats offer standardized execution via ONNX Runtime with substantial speedups compared to raw PyTorch CPU inference.\n"
        f"- **PyTorch (.pt)** files serve as the robust development standard and baseline framework.\n"
    )

    os.makedirs("docs/benchmarks", exist_ok=True)
    report_path = "docs/benchmarks/comparison_report.md"
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(comparison_md)

    print("\n==========================================================================")
    print("🏆 Consolidated Cross-Format Comparison Complete!")
    print(f"📁 Comparison report generated at: {report_path}")
    print("==========================================================================")

    return results
|
|
||
|
|
||
if __name__ == "__main__":
    # Command-line entry point: benchmark one model, compare formats, or fall
    # back to the default INT8 OpenVINO run when no option is given.
    import argparse

    parser = argparse.ArgumentParser(description="Run Eagle performance benchmarks")
    parser.add_argument("--model", type=str, default=None, help="Path to a specific model to benchmark")
    # store_true already defaults to False. Forcing default=True made the flag
    # always on, which left the default single-run branch below unreachable.
    parser.add_argument("--compare", action="store_true", help="Run comparisons across formats (.pt, .onnx, .engine)")
    parser.add_argument("--frames", type=int, default=100, help="Number of frames to benchmark")
    args = parser.parse_args()

    # The environment variable wins over the configured default Redis URL.
    REDIS_ENV_URL = os.getenv("REDIS_URL", settings.REDIS_URL)

    benchrunner = PipelineBenchmark(redis_url=REDIS_ENV_URL)

    if args.model:
        # Benchmark specific model
        benchrunner.run_full_pipeline_benchmark(model_path=args.model, num_frames=args.frames)
    elif args.compare:
        # Cross-format comparison candidate paths
        candidate_models = {
            "PyTorch (.pt)": "yolov8n.pt",
            "ONNX (.onnx)": "yolov8n.onnx",
            "TensorRT (.engine)": "yolov8n.engine",
        }
        run_comparative_benchmark(benchrunner, candidate_models, num_frames=args.frames)
    else:
        # Default single run
        int8_path = "yolov8n_int8_openvino_model"
        benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=args.frames)
|
Comment on lines
309
to
+334
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Either default to `False`:

🐛 Proposed fix
- parser.add_argument("--compare", action="store_true", default=True, help="Run comparisons across formats (.pt, .onnx, .engine)")
+ parser.add_argument("--compare", action="store_true", help="Run comparisons across formats (.pt, .onnx, .engine)")

Or, if compare-by-default is the intent, drop the unreachable `else` branch.

🤖 Prompt for AI Agents |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,101 @@ | ||
| # TensorRT Compilation & Optimization Guide | ||
|
|
||
| This guide covers installing dependencies, converting models into high-performance TensorRT `.engine` formats, and running optimized inference using Eagle’s smart automatic fallback protocol. | ||
|
|
||
| --- | ||
|
|
||
| ## 🚀 Why TensorRT? | ||
|
|
||
| NVIDIA TensorRT is a high-performance deep learning inference library that optimizes neural network models for deployment on NVIDIA GPUs and Jetson hardware. Utilizing `.engine` formats provides: | ||
|
|
||
| * **Up to 5x Faster Inference**: Highly optimized CUDA kernels tailored directly to your GPU. | ||
| * **Low Latency & High FPS**: Crucial for real-time surveillance and anomaly detection. | ||
| * **FP16 Half-Precision Optimization**: Reduces memory footprint and doubles processing speed with negligible accuracy loss. | ||
| * **Dynamic Batching & Memory Efficiency**: Conserves scarce GPU memory (VRAM). | ||
|
|
||
| --- | ||
|
|
||
| ## 🛠️ 1. Installation & Setup | ||
|
|
||
| To compile and execute `.engine` models, your host machine requires the CUDA Toolkit, cuDNN, TensorRT, and PyCUDA python APIs. | ||
|
|
||
| ### Step A: Install NVIDIA Drivers & CUDA Toolkit | ||
| 1. Download and install compatible **NVIDIA GPU Drivers** from [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx). | ||
| 2. Download and install **CUDA Toolkit 11.8 or 12.x** from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). | ||
| 3. Ensure CUDA is added to your environment `PATH` variables. Verify by running: | ||
| ```bash | ||
| nvcc --version | ||
| ``` | ||
|
|
||
| ### Step B: Install cuDNN | ||
| 1. Download **cuDNN** (matching your CUDA version) from the [cuDNN Download Portal](https://developer.nvidia.com/cudnn). | ||
| 2. Copy cuDNN headers and libraries into your local CUDA Toolkit directory. | ||
|
|
||
| ### Step C: Install TensorRT | ||
| 1. Download **NVIDIA TensorRT** matching your CUDA version from [TensorRT Portal](https://developer.nvidia.com/tensorrt). | ||
| 2. Follow the installation guide to unzip and add TensorRT binaries to your system library path. | ||
| 3. Install the TensorRT Python wheel matching your Python version (found in the `python/` directory of the TensorRT package): | ||
| ```bash | ||
| pip install tensorrt | ||
| ``` | ||
|
|
||
| ### Step D: Install PyCUDA | ||
| PyCUDA is required for low-level memory copies (DMA transfer coordination) on NVIDIA GPUs. | ||
| ```bash | ||
| pip install pycuda | ||
| ``` | ||
|
|
||
| --- | ||
|
|
||
| ## 📦 2. Model Conversion Using `export_tensorrt.py` | ||
|
|
||
| We have provided a streamlined conversion script in `scripts/export_tensorrt.py` to automate the compilation of `.pt` or `.onnx` models into `.engine` format. | ||
|
|
||
| ### Basic Compilation (Recommended FP16) | ||
| To compile a PyTorch YOLOv8 baseline model using optimized **FP16 half-precision**, run: | ||
| ```bash | ||
| python scripts/export_tensorrt.py --model yolov8n.pt --fp16 | ||
| ``` | ||
| This automatically compiles the model and saves a newly optimized `yolov8n.engine` file in the same directory! | ||
|
|
||
| ### Command Parameters | ||
| | Flag | Type | Default | Description | | ||
| | :--- | :--- | :--- | :--- | | ||
| | `--model` | `str` | `yolov8n.pt` | Path to the source `.pt` or `.onnx` model file to compile. | | ||
| | `--fp16` | `bool` | `True` | Enables FP16 half-precision optimization (highly recommended). | | ||
| | `--int8` | `bool` | `False` | Enables INT8 quantization (requires calibrating dataset). | | ||
| | `--imgsz` | `int` | `640` | Resolution width/height of input frames (default: 640). | | ||
| | `--device` | `str` | `cuda:0` | GPU device on which to run the compilation (default: `cuda:0`). | | ||
|
|
||
|
Comment on lines
+61
to
+69
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Docs reflect the misleading The table documents 🧰 Tools🪛 markdownlint-cli2 (0.22.1)[warning] 62-62: Tables should be surrounded by blank lines (MD058, blanks-around-tables) 🤖 Prompt for AI Agents |
||
| --- | ||
|
|
||
| ## 🧠 3. Smart Automatic Fallback Execution | ||
|
|
||
| You **do not need** to modify your application logic or worry about crashing on non-GPU/non-TensorRT machines. The system implements a **smart fallback routing layer**: | ||
|
|
||
| 1. **Auto-Search**: The `Detector` class checks if a matching `.engine` file exists in the directory of your configured model (e.g. if `yolov8n.pt` is requested, it looks for `yolov8n.engine`). | ||
| 2. **Auto-Promote**: If the `.engine` model is present and CUDA/TensorRT drivers are available, the detector automatically loads the optimized TensorRT engine for accelerated performance. | ||
| 3. **Resilient Fallback**: If the `.engine` file is missing, corrupted, compiled on a different GPU, or if TensorRT is not supported on the host system, the code: | ||
| - Prints a non-blocking warning log: `Failed to load TensorRT engine. Triggering automatic fallback...` | ||
| - Automatically loads the baseline `.pt` or `.onnx` file and continues normal execution without interruption. | ||
|
|
||
| --- | ||
|
|
||
| ## 📊 4. Running the Performance Benchmarks | ||
|
|
||
| To measure the latency and FPS throughput improvements, we have upgraded `benchmark.py` to test and compare multiple formats. | ||
|
|
||
| ### Run Multi-Format Comparative Benchmark: | ||
| ```bash | ||
| python benchmark.py --compare | ||
| ``` | ||
|
|
||
| This runs a simulated video pipeline processing frames across `.pt`, `.onnx`, and `.engine` files, and generates a unified report under: | ||
| `docs/benchmarks/comparison_report.md` | ||
|
|
||
| ### Benchmark a Single Specific Model: | ||
| ```bash | ||
| python benchmark.py --model yolov8n.engine | ||
| ``` | ||
| Report generated under: | ||
| `docs/benchmarks/pipeline_benchmark.md` | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Device picker can request CUDA when CUDA is unavailable.
Operator precedence is `(path.endswith(".engine")) or (torch.cuda.is_available())`, so any `.engine` path forces `device="cuda"` even on a CPU-only host. That conflicts with the PR's "graceful fallback" promise — `YOLO("yolov8n.engine", task="detect").predict(..., device="cuda")` will raise on a machine without CUDA rather than skip the format. Note that, unlike `Detector`, this comparative runner uses `YOLO` directly and does not benefit from the loader's fallback logic, so the guard needs to live here.

🛡️ Proposed fix
🤖 Prompt for AI Agents