diff --git a/benchmark.py b/benchmark.py index 8a734c8..a579e83 100644 --- a/benchmark.py +++ b/benchmark.py @@ -8,7 +8,7 @@ import cv2 import urllib.request from libs.config.settings import settings -from ultralytics import YOLO # <-- FIX: Yeh line missing thi, ab model load ho jayega! +from ultralytics import YOLO # <-- FIX: Loaded successfully class PipelineBenchmark: def __init__(self, redis_url=settings.REDIS_URL): @@ -46,14 +46,24 @@ def monitor_memory(self): pass time.sleep(0.05) - def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320): + def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320, device=None): # 1. Reset run state and metrics at start (CodeRabbit State Fix) self._stop_memory_monitor = False self.peak_ram = 0 for key in self.metrics: self.metrics[key].clear() - print(f"\nπŸš€ Starting End-to-End Pipeline Performance Benchmark using model: {model_path}...") + # Dynamic device determination + if device is None: + if model_path.endswith(".engine"): + device = "cuda" + else: + device = "cuda" if torch.cuda.is_available() else "cpu" + + print(f"\nπŸš€ Starting End-to-End Pipeline Performance Benchmark...") + print(f" Model: {model_path}") + print(f" Device: {device.upper()}") + print(f" Frames: {num_frames}") # Cross-Machine Reproducibility Check: Video download automation fallback if not os.path.exists(video_source) and video_source == "data/sample_videos/sample.mp4": @@ -83,8 +93,10 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide model = YOLO(model_path, task='detect') # Warmup frames setup fake_tensor = torch.rand(1, 3, img_size, img_size) + if "cuda" in device.lower(): + fake_tensor = fake_tensor.cuda() for _ in range(5): - model.predict(fake_tensor, verbose=False, device='cpu', imgsz=img_size) + model.predict(fake_tensor, verbose=False, device=device, imgsz=img_size) 
use_real_model = True print("✨ Real YOLO model successfully loaded into the benchmark pipeline!") except Exception as e: @@ -116,11 +128,13 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide if frame_to_process is None: frame_to_process = torch.rand(1, 3, img_size, img_size) + if "cuda" in device.lower(): + frame_to_process = frame_to_process.cuda() # 1. Measure Detection Speed t0 = time.time() if use_real_model: - model.predict(frame_to_process, verbose=False, device='cpu', imgsz=img_size) + model.predict(frame_to_process, verbose=False, device=device, imgsz=img_size) else: time.sleep(0.015) self.metrics["detection_times"].append(time.time() - t0) @@ -164,7 +178,7 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide cap.release() total_duration = time.time() - start_total - self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream") + return self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream") def generate_report(self, total_duration, model_used, source_used): avg_det_time = np.mean(self.metrics["detection_times"]) @@ -177,7 +191,6 @@ def generate_report(self, total_duration, model_used, source_used): os.makedirs("docs/benchmarks", exist_ok=True) - # Dynamic logic for Mermaid timelines (CodeRabbit Dynamic Timeline Fix) det_ms = avg_det_time * 1000 track_ms = avg_track redis_ms = avg_redis @@ -227,10 +240,95 @@ def generate_report(self, total_duration, model_used, source_used): print("\nπŸ† Benchmark ran successfully!") print(f"πŸ“Š Workload Source Verified: {source_used}") print("πŸ“ Report generated at: docs/benchmarks/pipeline_benchmark.md") + + return { + "model": model_used, + "fps": fps, + "latency_ms": det_ms, + "e2e_ms": avg_e2e, + "ram_mb": self.peak_ram + } + +def run_comparative_benchmark(benchrunner, models, num_frames=100): + """Runs performance benchmarking across 
def run_comparative_benchmark(benchrunner, models, num_frames=100,
                              report_path="docs/benchmarks/comparison_report.md"):
    """Benchmark several model formats and write one consolidated Markdown report.

    Args:
        benchrunner: Object exposing ``run_full_pipeline_benchmark(model_path=...,
            num_frames=..., device=...)`` returning a dict with the keys
            ``model``, ``fps``, ``latency_ms``, ``e2e_ms`` and ``ram_mb``.
        models: Mapping of a human-readable format label to a model path.
        num_frames: Number of frames forwarded to every benchmark run.
        report_path: Where the Markdown report is written; missing parent
            directories are created.

    Returns:
        A list with one result dict per successfully benchmarked model, each
        extended with a ``format`` key. The list is empty (and no report is
        written) when no model could be benchmarked.
    """
    cuda_available = torch.cuda.is_available()
    results = []
    print("\n🔍 Initiating Cross-Format Model Benchmark Comparison...")

    for label, path in models.items():
        if not os.path.exists(path):
            print(f"⚠️ Skipping comparison for format '{label}' since file was not found at '{path}'.")
            continue

        # A TensorRT engine can only execute on CUDA; forcing device="cuda" on a
        # CPU-only host is a guaranteed failure, so skip it with a clear reason.
        if str(path).endswith(".engine") and not cuda_available:
            print(f"⚠️ Skipping comparison for format '{label}' since CUDA is not available for '{path}'.")
            continue

        device = "cuda" if cuda_available else "cpu"
        try:
            res = benchrunner.run_full_pipeline_benchmark(
                model_path=path, num_frames=num_frames, device=device
            )
            # Copy so the runner's own result object is not mutated.
            results.append(dict(res, format=label))
        except Exception as e:  # one broken model must not abort the whole comparison
            print(f"❌ Failed to run benchmark for {label} ({path}): {e}")

    if not results:
        print("❌ No models were successfully benchmarked.")
        return results

    # Generate Markdown Table Comparison
    table_rows = [
        f"| **{r['format']}** | `{r['model']}` | {r['fps']:.2f} | {r['latency_ms']:.2f} ms "
        f"| {r['e2e_ms']:.2f} ms | {r['ram_mb']:.1f} MB |"
        for r in results
    ]
    gpu_name = torch.cuda.get_device_name(0) if cuda_available else "None (CPU)"

    comparison_md = "".join([
        "# Consolidated Model Format Comparison Report\n\n",
        f"Generated automatically on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n",
        "This report compares the performance of Eagle core detection using various model formats on the current hardware.\n\n",
        "## Performance Summary\n\n",
        "| Model Format | Model Path | Throughput (FPS) | Detection Latency | E2E Latency | Peak RAM Usage |\n",
        "| :--- | :--- | :--- | :--- | :--- | :--- |\n",
        "\n".join(table_rows) + "\n\n",
        "### Hardware / Environmental Diagnostics\n",
        f"- **CUDA Available:** `{cuda_available}`\n",
        f"- **Active GPU:** `{gpu_name}`\n\n",
        "### Summary Analysis\n",
        "- **TensorRT (.engine)** provides compiled CUDA-kernel optimization for the absolute lowest possible latency and highest FPS throughput on NVIDIA devices.\n",
        "- **ONNX (.onnx)** formats offer standardized execution via ONNX Runtime with substantial speedups compared to raw PyTorch CPU inference.\n",
        "- **PyTorch (.pt)** files serve as the robust development standard and baseline framework.\n",
    ])

    os.makedirs(os.path.dirname(report_path) or ".", exist_ok=True)
    with open(report_path, "w", encoding="utf-8") as f:
        f.write(comparison_md)

    print("\n==========================================================================")
    print("🏆 Consolidated Cross-Format Comparison Complete!")
    print(f"📝 Comparison report generated at: {report_path}")
    print("==========================================================================")
    return results


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Run Eagle performance benchmarks")
    parser.add_argument("--model", type=str, default=None, help="Path to a specific model to benchmark")
    # Comparison stays the default (as before); --no-compare makes the single
    # default run reachable, which a lone store_true/default=True flag never allowed.
    parser.add_argument("--compare", dest="compare", action="store_true", default=True,
                        help="Run comparisons across formats (.pt, .onnx, .engine) [default]")
    parser.add_argument("--no-compare", dest="compare", action="store_false",
                        help="Skip the cross-format comparison and run the default single model")
    parser.add_argument("--frames", type=int, default=100, help="Number of frames to benchmark")
    args = parser.parse_args()

    REDIS_ENV_URL = os.getenv("REDIS_URL", settings.REDIS_URL)
    # The runner must exist before any branch below uses it.
    benchrunner = PipelineBenchmark(redis_url=REDIS_ENV_URL)

    if args.model:
        # Benchmark specific model
        benchrunner.run_full_pipeline_benchmark(model_path=args.model, num_frames=args.frames)
    elif args.compare:
        # Cross-format comparison candidate paths
        candidate_models = {
            "PyTorch (.pt)": "yolov8n.pt",
            "ONNX (.onnx)": "yolov8n.onnx",
            "TensorRT (.engine)": "yolov8n.engine",
        }
        run_comparative_benchmark(benchrunner, candidate_models, num_frames=args.frames)
    else:
        # Default single run
        int8_path = "yolov8n_int8_openvino_model"
        benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=args.frames)
num_frames=args.frames) \ No newline at end of file diff --git a/docs/tensorrt_conversion.md b/docs/tensorrt_conversion.md new file mode 100644 index 0000000..4a67b17 --- /dev/null +++ b/docs/tensorrt_conversion.md @@ -0,0 +1,101 @@ +# TensorRT Compilation & Optimization Guide + +This guide covers installing dependencies, converting models into the high-performance TensorRT `.engine` format, and running optimized inference using Eagle’s smart automatic fallback protocol. + +--- + +## 🚀 Why TensorRT? + +NVIDIA TensorRT is a high-performance deep learning inference library that optimizes neural network models for deployment on NVIDIA GPUs and Jetson hardware. Using the `.engine` format provides: + +* **Up to 5x Faster Inference**: Highly optimized CUDA kernels tailored directly to your GPU. +* **Low Latency & High FPS**: Crucial for real-time surveillance and anomaly detection. +* **FP16 Half-Precision Optimization**: Reduces memory footprint and can roughly double processing speed with negligible accuracy loss. +* **Dynamic Batching & Memory Efficiency**: Keeps GPU memory (VRAM) usage within tight bounds. + +--- + +## 🛠️ 1. Installation & Setup + +To compile and execute `.engine` models, your host machine requires the CUDA Toolkit, cuDNN, TensorRT, and the PyCUDA Python API. + +### Step A: Install NVIDIA Drivers & CUDA Toolkit +1. Download and install compatible **NVIDIA GPU Drivers** from [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx). +2. Download and install **CUDA Toolkit 11.8 or 12.x** from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive). +3. Ensure the CUDA `bin` directory is on your `PATH` environment variable. Verify by running: + ```bash + nvcc --version + ``` + +### Step B: Install cuDNN +1. Download **cuDNN** (matching your CUDA version) from the [cuDNN Download Portal](https://developer.nvidia.com/cudnn). +2. Copy cuDNN headers and libraries into your local CUDA Toolkit directory. + +### Step C: Install TensorRT +1. 
Download **NVIDIA TensorRT** matching your CUDA version from the [TensorRT Portal](https://developer.nvidia.com/tensorrt). +2. Follow the installation guide to unzip and add TensorRT binaries to your system library path. +3. Install the TensorRT Python wheel matching your Python version (found in the `python/` directory of the TensorRT package): + ```bash + pip install tensorrt + ``` + +### Step D: Install PyCUDA +PyCUDA is required for low-level memory copies (DMA transfer coordination) on NVIDIA GPUs. +```bash +pip install pycuda +``` + +--- + +## 📦 2. Model Conversion Using `export_tensorrt.py` + +We have provided a streamlined conversion script in `scripts/export_tensorrt.py` to automate the compilation of `.pt` or `.onnx` models into `.engine` format. + +### Basic Compilation (Recommended FP16) +To compile a PyTorch YOLOv8 baseline model using optimized **FP16 half-precision**, run: +```bash +python scripts/export_tensorrt.py --model yolov8n.pt --fp16 +``` +This automatically compiles the model and saves a newly optimized `yolov8n.engine` file in the same directory! + +### Command Parameters +| Flag | Type | Default | Description | +| :--- | :--- | :--- | :--- | +| `--model` | `str` | `yolov8n.pt` | Path to the source `.pt` or `.onnx` model file to compile. | +| `--fp16` | `bool` | `True` | Enables FP16 half-precision optimization (highly recommended). | +| `--int8` | `bool` | `False` | Enables INT8 quantization (requires a calibration dataset). | +| `--imgsz` | `int` | `640` | Resolution (width/height) of input frames (default: 640). | +| `--device` | `str` | `cuda:0` | CUDA device used for compilation (default: `cuda:0`). | + +--- + +## 🧠 3. Smart Automatic Fallback Execution + +You **do not need** to modify your application logic or worry about crashing on non-GPU/non-TensorRT machines. The system implements a **smart fallback routing layer**: + +1. 
**Auto-Search**: The `Detector` class checks if a matching `.engine` file exists in the directory of your configured model (e.g. if `yolov8n.pt` is requested, it looks for `yolov8n.engine`). +2. **Auto-Promote**: If the `.engine` model is present and the detector is configured for a CUDA device, it automatically loads the optimized TensorRT engine for accelerated performance. +3. **Resilient Fallback**: If the `.engine` file is missing, corrupted, compiled on a different GPU, or if TensorRT is not supported on the host system, the code: + - Logs a non-blocking warning: `Failed to load TensorRT engine. Triggering automatic fallback...` + - Automatically loads the baseline `.pt` or `.onnx` file and continues normal execution without interruption. + +--- + +## 📊 4. Running the Performance Benchmarks + +To measure the latency and FPS throughput improvements, we have upgraded `benchmark.py` to test and compare multiple formats. + +### Run Multi-Format Comparative Benchmark: +```bash +python benchmark.py --compare +``` + +This runs a simulated video pipeline that processes frames with the `.pt`, `.onnx`, and `.engine` models, and generates a unified report under: +`docs/benchmarks/comparison_report.md` + +### Benchmark a Single Specific Model: +```bash +python benchmark.py --model yolov8n.engine +``` +The report is generated under: +`docs/benchmarks/pipeline_benchmark.md` diff --git a/scripts/export_tensorrt.py b/scripts/export_tensorrt.py new file mode 100644 index 0000000..ecc1141 --- /dev/null +++ b/scripts/export_tensorrt.py @@ -0,0 +1,142 @@ +""" +export_tensorrt.py — CLI tool to compile YOLO models to high-performance TensorRT (.engine) format. + +This script manages hardware validation (CUDA, GPU capability) and uses the Ultralytics +export engine wrapper to convert standard PyTorch (.pt) or ONNX (.onnx) files into +accelerated TensorRT engines tailored specifically to the host GPU. 
"""
export_tensorrt.py — CLI tool to compile YOLO models to high-performance TensorRT (.engine) format.

This script manages hardware validation (CUDA, GPU capability) and uses the Ultralytics
export engine wrapper to convert standard PyTorch (.pt) or ONNX (.onnx) files into
accelerated TensorRT engines tailored specifically to the host GPU.

Usage:
    python scripts/export_tensorrt.py --model yolov8n.pt --fp16
"""

import argparse
import logging
import os
import sys
from pathlib import Path

# Configure logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)

SUPPORTED_SOURCE_SUFFIXES = (".pt", ".onnx")


def _load_backend():
    """Import torch and Ultralytics on demand; exit with status 1 if unavailable.

    Importing lazily keeps ``import export_tensorrt`` free of side effects, so
    the parser and helpers remain usable (and testable) without the heavy
    dependencies installed.
    """
    try:
        import torch
        from ultralytics import YOLO
    except ImportError:
        logger.error("Required libraries (torch, ultralytics) are missing. Please run: pip install ultralytics torch")
        sys.exit(1)
    return torch, YOLO


def parse_device_index(device: str) -> int:
    """Return the CUDA device index encoded in *device*.

    Accepts ``"cuda"`` (index 0), ``"cuda:N"`` and a bare index ``"N"``.

    Raises:
        ValueError: If the index part is not a non-negative integer.
    """
    spec = device.strip().lower()
    if spec.startswith("cuda"):
        spec = spec[len("cuda"):].lstrip(":")
    if not spec:
        return 0
    if not spec.isdigit():
        raise ValueError(f"Cannot parse a CUDA device index from '{device}'")
    return int(spec)


def export_model(model_path: str, fp16: bool, int8: bool, imgsz: int, device: str) -> None:
    """
    Compiles a PyTorch (.pt) or ONNX (.onnx) model into a TensorRT (.engine) model.

    Args:
        model_path: Path to the source ``.pt`` or ``.onnx`` file.
        fp16: Request FP16 (half-precision) kernels.
        int8: Request INT8 quantization.
        imgsz: Square input resolution passed to the exporter.
        device: CUDA device spec (``"cuda"``, ``"cuda:N"`` or ``"N"``).

    Every validation or export failure is logged and terminates the process
    via ``sys.exit(1)``.
    """
    logger.info("Checking environment status for TensorRT export...")

    # 1. Hardware verification (the CPU check needs no third-party imports)
    if "cpu" in device.lower():
        logger.error("TensorRT compilation is NOT supported on CPU. Please specify a CUDA device (e.g., --device 0 or cuda).")
        sys.exit(1)

    torch, yolo_cls = _load_backend()

    if not torch.cuda.is_available():
        logger.error("CUDA is not available on this machine. TensorRT requires an NVIDIA GPU with CUDA drivers.")
        sys.exit(1)

    # Check if GPU device is valid
    try:
        device_id = parse_device_index(device)
        # torch raises different exception types for bad indices, hence the broad catch.
        device_name = torch.cuda.get_device_name(device_id)
    except Exception as e:
        logger.error(f"Invalid CUDA device specified: {device}. Error: {e}")
        sys.exit(1)
    logger.info(f"Using NVIDIA GPU: {device_name} (Device ID: {device_id})")

    # 2. File verification
    model_file = Path(model_path)
    if not model_file.exists():
        logger.error(f"Source model file '{model_path}' not found!")
        sys.exit(1)

    if model_file.suffix.lower() not in SUPPORTED_SOURCE_SUFFIXES:
        logger.error("Unsupported source format. Model must end with '.pt' or '.onnx'")
        sys.exit(1)

    # NOTE(review): Ultralytics' export() appears to accept only PyTorch
    # checkpoints; confirm that a '.onnx' source really can be exported.
    logger.info(f"Loading source model: {model_path}...")
    model = yolo_cls(model_path)

    logger.info("Starting compilation to TensorRT (.engine) format...")
    logger.info(f"Configuration: FP16={fp16}, INT8={int8}, Image Size={imgsz}, Target Device={device}")

    try:
        # Ultralytics natively wraps the ONNX -> TensorRT conversion process
        exported_path = model.export(
            format="engine",
            half=fp16,
            int8=int8,
            imgsz=imgsz,
            device=device,
            dynamic=True,  # Enables dynamic batching support
        )
    except Exception as e:
        logger.error(f"An error occurred during TensorRT compilation: {e}")
        logger.error(
            "Please ensure you have the TensorRT Python API and CUDA toolkit properly installed. "
            "Refer to docs/tensorrt_conversion.md for assistance."
        )
        sys.exit(1)

    logger.info("========================================= SUCCESS =========================================")
    logger.info("TensorRT Engine compiled and optimized successfully!")
    logger.info(f"Saved optimized model to: {os.path.abspath(exported_path)}")
    logger.info("===========================================================================================")


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the export tool."""
    parser = argparse.ArgumentParser(
        description="Compile YOLOv8/v9 models (.pt/.onnx) to highly-optimized TensorRT (.engine) format."
    )
    parser.add_argument(
        "--model",
        type=str,
        default="yolov8n.pt",
        help="Path to the source model (.pt or .onnx file) to compile.",
    )
    parser.add_argument(
        "--fp16",
        dest="fp16",
        action="store_true",
        default=True,
        help="Enable FP16 (half-precision) float operations for faster inference (recommended, default).",
    )
    # Without an off switch the default-True store_true flag could never be disabled.
    parser.add_argument(
        "--no-fp16",
        dest="fp16",
        action="store_false",
        help="Disable FP16 and build the engine in full precision.",
    )
    parser.add_argument(
        "--int8",
        action="store_true",
        default=False,
        help="Enable INT8 quantization (requires calibrating dataset).",
    )
    parser.add_argument(
        "--imgsz",
        type=int,
        default=640,
        help="Standard resolution (width/height) of input frames (default: 640).",
    )
    parser.add_argument(
        "--device",
        type=str,
        default="cuda:0",
        help="CUDA device to use for compilation (default: cuda:0).",
    )
    return parser


def main(argv=None):
    """Parse *argv* (defaults to ``sys.argv[1:]``) and run the export."""
    args = build_parser().parse_args(argv)
    export_model(
        model_path=args.model,
        fp16=args.fp16,
        int8=args.int8,
        imgsz=args.imgsz,
        device=args.device,
    )


if __name__ == "__main__":
    main()
+ ) + parser.add_argument( + "--device", + type=str, + default="cuda:0", + help="CUDA device to use for compilation (default: cuda:0)." + ) + + args = parser.parse_args() + export_model( + model_path=args.model, + fp16=args.fp16, + int8=args.int8, + imgsz=args.imgsz, + device=args.device + ) + + +if __name__ == "__main__": + main() diff --git a/services/detection/detection.py b/services/detection/detection.py index a26960d..417e147 100644 --- a/services/detection/detection.py +++ b/services/detection/detection.py @@ -62,10 +62,84 @@ def __init__( confidence_threshold: float = 0.45, device: str = "cpu", ) -> None: - logger.info(f"Loading YOLO model: {model_name} on {device}") - self.model = YOLO(model_name) + self.model_path = model_name self.conf = confidence_threshold self.device = device + + logger.info(f"Initializing Detector with config: model={model_name}, device={device}") + self._load_model_with_fallback() + + def _load_model_with_fallback(self) -> None: + """ + Implements smart, automatic model format resolution and fallback routing. + First attempts to locate and load a TensorRT engine if CUDA hardware is available, + otherwise falls back gracefully to ONNX or PyTorch models. 
+ """ + path = Path(self.model_path) + base_name = path.stem + parent_dir = path.parent + + # Check for a matching .engine file in the same directory + engine_path = parent_dir / f"{base_name}.engine" + + # Determine if we should attempt to load a TensorRT engine + should_try_engine = self.model_path.endswith(".engine") or engine_path.exists() + + if should_try_engine: + resolved_engine_path = self.model_path if self.model_path.endswith(".engine") else str(engine_path) + + # TensorRT requires an NVIDIA GPU with CUDA + if "cuda" in self.device.lower(): + try: + logger.info(f"Attempting optimized TensorRT engine load: {resolved_engine_path}") + self.load_tensorrt_model(resolved_engine_path) + return + except Exception as e: + logger.warning( + f"Failed to load TensorRT engine '{resolved_engine_path}': {e}. " + f"Triggering automatic fallback to standard model format..." + ) + else: + logger.warning( + f"TensorRT engine '{resolved_engine_path}' cannot run on non-CUDA device '{self.device}'. " + f"Triggering automatic fallback to standard model format..." + ) + + # Main loader routing based on model extension + if self.model_path.endswith(".onnx"): + self.load_onnx_model(self.model_path) + elif self.model_path.endswith(".pt"): + self.load_pytorch_model(self.model_path) + else: + # If explicitly requested .engine failed or file is generic, seek compatible counterpart + pt_path = parent_dir / f"{base_name}.pt" + onnx_path = parent_dir / f"{base_name}.onnx" + + if pt_path.exists(): + logger.info(f"Auto-fallback: Loading counterpart PyTorch model: {pt_path}") + self.load_pytorch_model(str(pt_path)) + elif onnx_path.exists(): + logger.info(f"Auto-fallback: Loading counterpart ONNX model: {onnx_path}") + self.load_onnx_model(str(onnx_path)) + else: + logger.info(f"No counterpart found. 
Loading default fallback model path: {self.model_path}") + self.load_pytorch_model(self.model_path) + + def load_tensorrt_model(self, model_path: str) -> None: + """Loads a TensorRT engine model using the Ultralytics YOLO framework.""" + logger.info(f"Successfully routed to load_tensorrt_model: {model_path}") + self.model = YOLO(model_path, task="detect") + + def load_onnx_model(self, model_path: str) -> None: + """Loads an ONNX model using the Ultralytics YOLO framework.""" + logger.info(f"Successfully routed to load_onnx_model: {model_path}") + self.model = YOLO(model_path, task="detect") + + def load_pytorch_model(self, model_path: str) -> None: + """Loads a PyTorch (.pt) model using the Ultralytics YOLO framework.""" + logger.info(f"Successfully routed to load_pytorch_model: {model_path}") + self.model = YOLO(model_path, task="detect") + def detect(self, frame: np.ndarray, frame_id: int = 0) -> DetectionFrame: """ diff --git a/services/detection/trt_utils.py b/services/detection/trt_utils.py new file mode 100644 index 0000000..a2f8b57 --- /dev/null +++ b/services/detection/trt_utils.py @@ -0,0 +1,171 @@ +""" +trt_utils.py β€” Low-level TensorRT inference utilities for Eagle. + +Provides the `TensorRTInference` class to load and execute serialized .engine files +directly on NVIDIA GPUs with optimized CUDA bindings, including memory management +and asynchronous stream coordination. +""" + +from __future__ import annotations +import logging +import numpy as np + +logger = logging.getLogger(__name__) + +# Safe imports to prevent crashes on systems without NVIDIA drivers/TensorRT installed. 
try:
    import tensorrt as trt
    TRT_AVAILABLE = True
except ImportError:
    trt = None
    TRT_AVAILABLE = False

try:
    import pycuda.driver as cuda
    import pycuda.autoinit  # noqa: F401  # Automatically handles CUDA context creation/destruction
    CUDA_AVAILABLE = True
except Exception:
    # ImportError when PyCUDA is missing; pycuda.autoinit can also raise driver
    # errors when PyCUDA is installed but no usable GPU/driver is present.
    cuda = None
    CUDA_AVAILABLE = False


class TensorRTInference:
    """
    Handles low-level TensorRT model deserialization, binding memory allocation
    (host-to-device and device-to-host pagelocked buffers), and optimized
    inference for compiled .engine files on NVIDIA GPUs.

    Both the name-based tensor API (``num_io_tensors`` / ``execute_async_v3``)
    and the older binding API (``execute_async_v2``) are supported; the newer
    one is used whenever the loaded TensorRT build provides it.
    """

    def __init__(self, engine_path: str) -> None:
        """
        Initialize the TensorRT inference engine.

        Args:
            engine_path: Path to the serialized `.engine` model file.

        Raises:
            ImportError: If TensorRT or PyCUDA is unavailable.
            RuntimeError: If the engine or its execution context cannot be created.
            ValueError: If the engine has dynamic dimensions other than batch.
        """
        # Set every attribute up front so __del__ is safe if construction aborts.
        self.engine_path = engine_path
        self.inputs = []
        self.outputs = []
        self.bindings = []
        self.runtime = None
        self.engine = None
        self.context = None
        self.stream = None
        self._use_tensor_api = False

        if not TRT_AVAILABLE:
            raise ImportError(
                "TensorRT python package is not installed. "
                "Please install tensorrt using: pip install tensorrt"
            )
        if not CUDA_AVAILABLE:
            raise ImportError(
                "PyCUDA is not installed or CUDA is unavailable. "
                "Please install pycuda using: pip install pycuda"
            )

        self.logger = trt.Logger(trt.Logger.WARNING)

        logger.info(f"Deserializing TensorRT Engine: {self.engine_path}")
        with open(self.engine_path, "rb") as f:
            serialized_engine = f.read()
        # Keep the runtime referenced for as long as the engine is in use.
        self.runtime = trt.Runtime(self.logger)
        self.engine = self.runtime.deserialize_cuda_engine(serialized_engine)

        if self.engine is None:
            raise RuntimeError(f"Failed to deserialize TensorRT engine from {self.engine_path}")

        self.context = self.engine.create_execution_context()
        if self.context is None:
            raise RuntimeError(f"Failed to create TensorRT execution context for {self.engine_path}")

        self._use_tensor_api = hasattr(self.engine, "num_io_tensors") and hasattr(
            self.context, "execute_async_v3"
        )
        self.stream = cuda.Stream()

        self._allocate_buffers()
        logger.info(f"TensorRT Engine loaded successfully. Inputs: {len(self.inputs)}, Outputs: {len(self.outputs)}")

    def _iter_io_tensors(self):
        """Yield ``(name, shape, numpy_dtype, is_input)`` for each engine I/O tensor."""
        engine = self.engine
        if self._use_tensor_api:
            for index in range(engine.num_io_tensors):
                name = engine.get_tensor_name(index)
                yield (
                    name,
                    tuple(engine.get_tensor_shape(name)),
                    trt.nptype(engine.get_tensor_dtype(name)),
                    engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT,
                )
        else:
            for name in engine:
                yield (
                    name,
                    tuple(engine.get_binding_shape(name)),
                    trt.nptype(engine.get_binding_dtype(name)),
                    bool(engine.binding_is_input(name)),
                )

    def _set_input_shape(self, name: str, shape: tuple) -> None:
        """Tell the execution context the concrete shape of a dynamic input."""
        if self._use_tensor_api:
            self.context.set_input_shape(name, shape)
        else:
            self.context.set_binding_shape(self.engine.get_binding_index(name), shape)

    def _allocate_buffers(self) -> None:
        """
        Query tensor metadata from the engine and allocate pinned/pagelocked
        host memory and GPU device buffers for each input/output tensor.

        Raises:
            ValueError: If a tensor keeps a dynamic (-1) dimension after the
                batch dimension has been resolved.
        """
        # Determine maximum batch size (attribute is absent on newer TensorRT builds)
        max_batch_size = max(1, getattr(self.engine, "max_batch_size", 1))

        for name, engine_shape, dtype, is_input in self._iter_io_tensors():
            shape = engine_shape
            # Handle dynamic/undefined batch dimension
            if shape and shape[0] == -1:
                shape = (max_batch_size,) + shape[1:]
            if any(dim < 0 for dim in shape):
                # A negative volume cannot be allocated; fail with an actionable message.
                raise ValueError(
                    f"Tensor '{name}' has dynamic shape {engine_shape}; only a dynamic batch "
                    f"dimension is supported. Re-export the engine with static spatial dimensions."
                )
            if is_input and shape != engine_shape:
                self._set_input_shape(name, shape)

            size = trt.volume(shape)

            # Pinned/pagelocked host memory for faster DMA transfers
            host_mem = cuda.pagelocked_empty(size, dtype)
            # CUDA device memory allocation
            device_mem = cuda.mem_alloc(host_mem.nbytes)

            self.bindings.append(int(device_mem))
            if self._use_tensor_api:
                self.context.set_tensor_address(name, int(device_mem))

            binding_info = {
                "host": host_mem,
                "device": device_mem,
                "name": name,
                "dtype": dtype,
                "shape": shape,
            }

            if is_input:
                self.inputs.append(binding_info)
            else:
                self.outputs.append(binding_info)

    def infer(self, input_data: np.ndarray) -> list[np.ndarray]:
        """
        Performs synchronized inference on a preprocessed input frame.

        Args:
            input_data: Preprocessed input image numpy array. Its element count
                must match the first input tensor of the engine.

        Returns:
            A list of numpy arrays (one per output tensor) holding the raw model
            predictions. The arrays are copies, so they stay valid after the
            next call.

        Raises:
            ValueError: If no input buffer exists or the element count differs.
            RuntimeError: If TensorRT reports that enqueueing inference failed.
        """
        if not self.inputs:
            raise ValueError("No input bindings allocated in the TensorRT engine.")

        input_info = self.inputs[0]
        host_buffer = input_info["host"]
        flat_input = np.asarray(input_data).ravel()
        # np.copyto would silently broadcast a size-1 array; demand an exact match.
        if flat_input.size != host_buffer.size:
            raise ValueError(
                f"Input has {flat_input.size} elements but the engine expects "
                f"{host_buffer.size} (shape {input_info['shape']})."
            )
        # Fast copy to pagelocked host buffer
        np.copyto(host_buffer, flat_input)

        # Host to Device transfer (Asynchronous)
        cuda.memcpy_htod_async(input_info["device"], host_buffer, self.stream)

        # Enqueue inference execution context
        if self._use_tensor_api:
            enqueued = self.context.execute_async_v3(stream_handle=self.stream.handle)
        else:
            enqueued = self.context.execute_async_v2(
                bindings=self.bindings, stream_handle=self.stream.handle
            )
        if not enqueued:
            raise RuntimeError(f"TensorRT failed to enqueue inference for {self.engine_path}")

        # Device to Host transfer (Asynchronous)
        for out in self.outputs:
            cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream)

        # Synchronize CPU and GPU stream execution
        self.stream.synchronize()

        # Copy out of the reused host buffers so a later call cannot overwrite results.
        return [out["host"].reshape(out["shape"]).copy() for out in self.outputs]

    def __del__(self) -> None:
        """
        Drops the references to buffer metadata when the object is garbage
        collected. Attributes are looked up defensively because ``__del__``
        also runs for instances whose ``__init__`` never completed.
        """
        for attr in ("bindings", "inputs", "outputs"):
            container = getattr(self, attr, None)
            if container is not None:
                container.clear()


def is_tensorrt_supported() -> bool:
    """
    Utility check to see if the local machine fully supports native TensorRT execution.

    Returns:
        True if tensorrt and pycuda were imported successfully, False otherwise.
    """
    return TRT_AVAILABLE and CUDA_AVAILABLE
+ """ + if not self.inputs: + raise ValueError("No input bindings allocated in the TensorRT engine.") + + input_info = self.inputs[0] + # Fast copy to pagelocked host buffer + np.copyto(input_info["host"], input_data.ravel()) + + # Host to Device transfer (Asynchronous) + cuda.memcpy_htod_async(input_info["device"], input_info["host"], self.stream) + + # Enqueue inference execution context + self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle) + + # Device to Host transfer (Asynchronous) + for out in self.outputs: + cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream) + + # Synchronize CPU and GPU stream execution + self.stream.synchronize() + + # Reshape output vectors back to standard multi-dimensional tensors + results = [] + for out in self.outputs: + reshaped = out["host"].reshape(out["shape"]) + results.append(reshaped) + + return results + + def __del__(self) -> None: + """ + Cleans up GPU bindings and device pointers when the class object is garbage collected. + """ + self.bindings.clear() + self.inputs.clear() + self.outputs.clear() + + +def is_tensorrt_supported() -> bool: + """ + Utility check to see if the local machine fully supports native TensorRT execution. + + Returns: + True if tensorrt and pycuda are installed and available, False otherwise. + """ + return TRT_AVAILABLE and CUDA_AVAILABLE diff --git a/tests/test_tensorrt_routing.py b/tests/test_tensorrt_routing.py new file mode 100644 index 0000000..ea9f5f9 --- /dev/null +++ b/tests/test_tensorrt_routing.py @@ -0,0 +1,78 @@ +""" +test_tensorrt_routing.py β€” Unit tests verifying model loader routing and auto-fallback behavior. 
+""" + +import sys +import types +from pathlib import Path + +# Dynamically alias the 'Eagle' namespace to the project root at runtime +if "Eagle" not in sys.modules: + eagle_mod = types.ModuleType("Eagle") + eagle_mod.__path__ = [str(Path(__file__).resolve().parents[1])] + sys.modules["Eagle"] = eagle_mod + +import pytest +from unittest.mock import MagicMock, patch +from services.detection.detection import Detector + + +@pytest.fixture +def mock_yolo(): + """Mocks the ultralytics YOLO class to prevent loading real model weights during tests.""" + with patch("services.detection.detection.YOLO") as mock: + yield mock + + +def test_routing_pytorch(mock_yolo): + """Verifies that .pt model paths correctly route to load_pytorch_model.""" + detector = Detector(model_name="yolov8n.pt", device="cpu") + assert detector.model_path == "yolov8n.pt" + mock_yolo.assert_called_with("yolov8n.pt", task="detect") + + +def test_routing_onnx(mock_yolo): + """Verifies that .onnx model paths correctly route to load_onnx_model.""" + detector = Detector(model_name="yolov8n.onnx", device="cpu") + assert detector.model_path == "yolov8n.onnx" + mock_yolo.assert_called_with("yolov8n.onnx", task="detect") + + +def test_routing_engine_success(mock_yolo): + """Verifies that .engine model paths route to load_tensorrt_model when device is CUDA.""" + with patch("services.detection.detection.Path.exists") as mock_exists: + mock_exists.return_value = True + detector = Detector(model_name="yolov8n.engine", device="cuda:0") + assert detector.model_path == "yolov8n.engine" + mock_yolo.assert_called_with("yolov8n.engine", task="detect") + + +def test_routing_engine_cpu_fallback(mock_yolo): + """Verifies that .engine model path on CPU triggers auto-fallback to available formats.""" + # Define a plain method to bypass bound descriptor mock complexities + def mock_exists(self_obj): + return str(self_obj).endswith(".pt") + + with patch("services.detection.detection.Path.exists", mock_exists): + detector = 
Detector(model_name="yolov8n.engine", device="cpu") + # Should fallback to yolov8n.pt + mock_yolo.assert_called_with("yolov8n.pt", task="detect") + + +def test_routing_engine_load_failure_fallback(mock_yolo): + """Verifies that .engine loading failure on CUDA triggers automatic fallback to .pt.""" + def mock_exists(self_obj): + return str(self_obj).endswith(".pt") or str(self_obj).endswith(".engine") + + with patch("services.detection.detection.Path.exists", mock_exists): + # YOLO fails to load the engine file (simulating driver mismatch or corrupt engine) + def side_effect(path, task=None): + if path.endswith(".engine"): + raise RuntimeError("Cuda driver mismatch") + return MagicMock() + mock_yolo.side_effect = side_effect + + detector = Detector(model_name="yolov8n.engine", device="cuda:0") + # Should fallback to yolov8n.pt + mock_yolo.assert_called_with("yolov8n.pt", task="detect") +