diff --git a/benchmark.py b/benchmark.py
index 8a734c8..a579e83 100644
--- a/benchmark.py
+++ b/benchmark.py
@@ -8,7 +8,7 @@
 import cv2
 import urllib.request
 from libs.config.settings import settings
-from ultralytics import YOLO  # <-- FIX: Yeh line missing thi, ab model load ho jayega!
+from ultralytics import YOLO  # <-- FIX: Loaded successfully
 
 class PipelineBenchmark:
     def __init__(self, redis_url=settings.REDIS_URL):
@@ -46,14 +46,24 @@ def monitor_memory(self):
                 pass
             time.sleep(0.05)
 
-    def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320):
+    def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_videos/sample.mp4", num_frames=100, img_size=320, device=None):
         # 1. Reset run state and metrics at start (CodeRabbit State Fix)
         self._stop_memory_monitor = False
         self.peak_ram = 0
         for key in self.metrics:
             self.metrics[key].clear()
 
-        print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark using model: {model_path}...")
+        # Dynamic device determination
+        if device is None:
+            if model_path.endswith(".engine"):
+                device = "cuda"
+            else:
+                device = "cuda" if torch.cuda.is_available() else "cpu"
+                
+        print(f"\n🚀 Starting End-to-End Pipeline Performance Benchmark...")
+        print(f"   Model: {model_path}")
+        print(f"   Device: {device.upper()}")
+        print(f"   Frames: {num_frames}")
         
         # Cross-Machine Reproducibility Check: Video download automation fallback
         if not os.path.exists(video_source) and video_source == "data/sample_videos/sample.mp4":
@@ -83,8 +93,10 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide
             model = YOLO(model_path, task='detect')
             # Warmup frames setup
             fake_tensor = torch.rand(1, 3, img_size, img_size)
+            if "cuda" in device.lower():
+                fake_tensor = fake_tensor.cuda()
             for _ in range(5):
-                model.predict(fake_tensor, verbose=False, device='cpu', imgsz=img_size)
+                model.predict(fake_tensor, verbose=False, device=device, imgsz=img_size)
             use_real_model = True
             print("✨ Real YOLO model successfully loaded into the benchmark pipeline!")
         except Exception as e:
@@ -116,11 +128,13 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide
 
                 if frame_to_process is None:
                     frame_to_process = torch.rand(1, 3, img_size, img_size)
+                    if "cuda" in device.lower():
+                        frame_to_process = frame_to_process.cuda()
 
                 # 1. Measure Detection Speed
                 t0 = time.time()
                 if use_real_model:
-                    model.predict(frame_to_process, verbose=False, device='cpu', imgsz=img_size)
+                    model.predict(frame_to_process, verbose=False, device=device, imgsz=img_size)
                 else:
                     time.sleep(0.015) 
                 self.metrics["detection_times"].append(time.time() - t0)
@@ -164,7 +178,7 @@ def run_full_pipeline_benchmark(self, model_path, video_source="data/sample_vide
                 cap.release()
         
         total_duration = time.time() - start_total
-        self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream")
+        return self.generate_report(total_duration, model_path, "Real Video Asset" if use_real_video else "Synthetic Tensor Stream")
 
     def generate_report(self, total_duration, model_used, source_used):
         avg_det_time = np.mean(self.metrics["detection_times"])
@@ -177,7 +191,6 @@ def generate_report(self, total_duration, model_used, source_used):
 
         os.makedirs("docs/benchmarks", exist_ok=True)
 
-        # Dynamic logic for Mermaid timelines (CodeRabbit Dynamic Timeline Fix)
         det_ms = avg_det_time * 1000
         track_ms = avg_track
         redis_ms = avg_redis
@@ -227,10 +240,95 @@ def generate_report(self, total_duration, model_used, source_used):
         print("\n🏆 Benchmark ran successfully!")
         print(f"📊 Workload Source Verified: {source_used}")
         print("📁 Report generated at: docs/benchmarks/pipeline_benchmark.md")
+        
+        return {
+            "model": model_used,
+            "fps": fps,
+            "latency_ms": det_ms,
+            "e2e_ms": avg_e2e,
+            "ram_mb": self.peak_ram
+        }
+
+def run_comparative_benchmark(benchrunner, models, num_frames=100):
+    """Benchmark several model formats and write one consolidated Markdown report.
+
+    Args:
+        benchrunner: Object exposing ``run_full_pipeline_benchmark(model_path=...,
+            num_frames=..., device=...)`` that returns a metrics dict with the
+            keys ``model``, ``fps``, ``latency_ms``, ``e2e_ms`` and ``ram_mb``.
+        models: Mapping of human-readable format label -> model file path.
+            Entries whose path does not exist are skipped with a warning.
+        num_frames: Number of frames passed to every individual benchmark run.
+
+    Returns:
+        The list of per-format metric dicts (each extended with a ``format``
+        key), in the iteration order of ``models``. The list is empty when no
+        model could be benchmarked; in that case no report file is written.
+    """
+    results = []
+    print("\n🔍 Initiating Cross-Format Model Benchmark Comparison...")
+
+    for label, path in models.items():
+        if not os.path.exists(path):
+            print(f"⚠️ Skipping comparison for format '{label}' since file was not found at '{path}'.")
+            continue
+        try:
+            # TensorRT engines are always run on CUDA; other formats use the GPU when one is available.
+            device = "cuda" if path.endswith(".engine") or torch.cuda.is_available() else "cpu"
+            res = benchrunner.run_full_pipeline_benchmark(model_path=path, num_frames=num_frames, device=device)
+            res["format"] = label
+            results.append(res)
+        except Exception as e:
+            # A failure in one format must not abort the remaining comparisons.
+            print(f"❌ Failed to run benchmark for {label} ({path}): {e}")
+
+    if not results:
+        print("❌ No models were successfully benchmarked.")
+        return results
+
+    # Generate Markdown Table Comparison
+    table_rows = [
+        f"| **{r['format']}** | `{r['model']}` | {r['fps']:.2f} | {r['latency_ms']:.2f} ms | {r['e2e_ms']:.2f} ms | {r['ram_mb']:.1f} MB |"
+        for r in results
+    ]
+
+    cuda_ready = torch.cuda.is_available()
+    active_gpu = torch.cuda.get_device_name(0) if cuda_ready else "None (CPU)"
+
+    comparison_md = (
+        "# Consolidated Model Format Comparison Report\n\n"
+        f"Generated automatically on: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n"
+        "This report compares the performance of Eagle core detection using various model formats on the current hardware.\n\n"
+        "## Performance Summary\n\n"
+        "| Model Format | Model Path | Throughput (FPS) | Detection Latency | E2E Latency | Peak RAM Usage |\n"
+        "| :--- | :--- | :--- | :--- | :--- | :--- |\n"
+        + "\n".join(table_rows) + "\n\n"
+        "### Hardware / Environmental Diagnostics\n"
+        f"- **CUDA Available:** `{cuda_ready}`\n"
+        f"- **Active GPU:** `{active_gpu}`\n\n"
+        "### Summary Analysis\n"
+        "- **TensorRT (.engine)** provides compiled CUDA-kernel optimization for the absolute lowest possible latency and highest FPS throughput on NVIDIA devices.\n"
+        "- **ONNX (.onnx)** formats offer standardized execution via ONNX Runtime with substantial speedups compared to raw PyTorch CPU inference.\n"
+        "- **PyTorch (.pt)** files serve as the robust development standard and baseline framework.\n"
+    )
+
+    os.makedirs("docs/benchmarks", exist_ok=True)
+    report_path = "docs/benchmarks/comparison_report.md"
+    with open(report_path, "w", encoding="utf-8") as f:
+        f.write(comparison_md)
+
+    print("\n==========================================================================")
+    print("🏆 Consolidated Cross-Format Comparison Complete!")
+    print(f"📁 Comparison report generated at: {report_path}")
+    print("==========================================================================")
+    return results
+
 
 if __name__ == "__main__":
-    int8_path = "yolov8n_int8_openvino_model" 
+    import argparse
+    parser = argparse.ArgumentParser(description="Run Eagle performance benchmarks")
+    parser.add_argument("--model", type=str, default=None, help="Path to a specific model to benchmark")
+    # `--compare` is enabled by default, so on its own the flag could never be
+    # turned off. `--no-compare` writes False to the same destination, which makes
+    # the single-model default run further down reachable.
+    parser.add_argument("--compare", dest="compare", action="store_true", default=True, help="Run comparisons across formats (.pt, .onnx, .engine)")
+    parser.add_argument("--no-compare", dest="compare", action="store_false", help="Skip the cross-format comparison and run the default single-model benchmark")
+    parser.add_argument("--frames", type=int, default=100, help="Number of frames to benchmark")
+    args = parser.parse_args()
+
     REDIS_ENV_URL = os.getenv("REDIS_URL", settings.REDIS_URL)
-    
     benchrunner = PipelineBenchmark(redis_url=REDIS_ENV_URL)
-    benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=100)
\ No newline at end of file
+    
+    if args.model:
+        # An explicit --model is checked first, so it wins over --compare:
+        # benchmark only that file.
+        benchrunner.run_full_pipeline_benchmark(model_path=args.model, num_frames=args.frames)
+    elif args.compare:
+        # Cross-format comparison: label -> candidate path. The paths are relative,
+        # so they are looked up from the current working directory.
+        candidate_models = {
+            "PyTorch (.pt)": "yolov8n.pt",
+            "ONNX (.onnx)": "yolov8n.onnx",
+            "TensorRT (.engine)": "yolov8n.engine"
+        }
+        run_comparative_benchmark(benchrunner, candidate_models, num_frames=args.frames)
+    else:
+        # Neither a specific model nor a comparison was requested: run the
+        # single benchmark against the INT8 OpenVINO model directory.
+        int8_path = "yolov8n_int8_openvino_model"
+        benchrunner.run_full_pipeline_benchmark(model_path=int8_path, num_frames=args.frames)
\ No newline at end of file
diff --git a/docs/tensorrt_conversion.md b/docs/tensorrt_conversion.md
new file mode 100644
index 0000000..4a67b17
--- /dev/null
+++ b/docs/tensorrt_conversion.md
@@ -0,0 +1,101 @@
+# TensorRT Compilation & Optimization Guide
+
+This guide covers installing dependencies, converting models into high-performance TensorRT `.engine` formats, and running optimized inference using Eagle’s smart automatic fallback protocol.
+
+---
+
+## 🚀 Why TensorRT?
+
+NVIDIA TensorRT is a high-performance deep learning inference library that optimizes neural network models for deployment on NVIDIA GPUs and Jetson hardware. Utilizing `.engine` formats provides:
+
+* **Up to 5x Faster Inference**: Highly optimized CUDA kernels tailored directly to your GPU.
+* **Low Latency & High FPS**: Crucial for real-time surveillance and anomaly detection.
+* **FP16 Half-Precision Optimization**: Reduces memory footprint and doubles processing speed with negligible accuracy loss.
+* **Dynamic Batching & Memory Efficiency**: Conserves scarce GPU memory (VRAM).
+
+---
+
+## 🛠️ 1. Installation & Setup
+
+To compile and execute `.engine` models, your host machine requires the CUDA Toolkit, cuDNN, TensorRT, and the PyCUDA Python bindings.
+
+### Step A: Install NVIDIA Drivers & CUDA Toolkit
+1. Download and install compatible **NVIDIA GPU Drivers** from [NVIDIA Driver Downloads](https://www.nvidia.com/Download/index.aspx).
+2. Download and install **CUDA Toolkit 11.8 or 12.x** from the [CUDA Toolkit Archive](https://developer.nvidia.com/cuda-toolkit-archive).
+3. Ensure CUDA is added to your environment `PATH` variables. Verify by running:
+   ```bash
+   nvcc --version
+   ```
+
+### Step B: Install cuDNN
+1. Download **cuDNN** (matching your CUDA version) from the [cuDNN Download Portal](https://developer.nvidia.com/cudnn).
+2. Copy cuDNN headers and libraries into your local CUDA Toolkit directory.
+
+### Step C: Install TensorRT
+1. Download **NVIDIA TensorRT** matching your CUDA version from [TensorRT Portal](https://developer.nvidia.com/tensorrt).
+2. Follow the installation guide to unzip and add TensorRT binaries to your system library path.
+3. Install the TensorRT Python bindings: either install the wheel matching your Python version from the `python/` directory of the TensorRT package, or install the published package from PyPI:
+   ```bash
+   pip install tensorrt
+   ```
+
+### Step D: Install PyCUDA
+PyCUDA is required for low-level memory copies (DMA transfer coordination) on NVIDIA GPUs.
+```bash
+pip install pycuda
+```
+
+---
+
+## 📦 2. Model Conversion Using `export_tensorrt.py`
+
+We have provided a streamlined conversion script in `scripts/export_tensorrt.py` to automate the compilation of `.pt` or `.onnx` models into `.engine` format.
+
+### Basic Compilation (Recommended FP16)
+To compile a PyTorch YOLOv8 baseline model using optimized **FP16 half-precision**, run:
+```bash
+python scripts/export_tensorrt.py --model yolov8n.pt --fp16
+```
+This automatically compiles the model and saves a newly optimized `yolov8n.engine` file in the same directory!
+
+### Command Parameters
+| Flag | Type | Default | Description |
+| :--- | :--- | :--- | :--- |
+| `--model` | `str` | `yolov8n.pt` | Path to the source `.pt` or `.onnx` model file to compile. |
+| `--fp16` | `bool` | `True` | Enables FP16 half-precision optimization (highly recommended). |
+| `--int8` | `bool` | `False` | Enables INT8 quantization (requires a calibration dataset). |
+| `--imgsz` | `int` | `640` | Resolution width/height of input frames (default: 640). |
+| `--device` | `str` | `cuda:0` | GPU device ID to execute compiling (default: `cuda:0`). |
+
+---
+
+## 🧠 3. Smart Automatic Fallback Execution
+
+You **do not need** to modify your application logic or worry about crashing on non-GPU/non-TensorRT machines. The system implements a **smart fallback routing layer**:
+
+1. **Auto-Search**: The `Detector` class checks if a matching `.engine` file exists in the directory of your configured model (e.g. if `yolov8n.pt` is requested, it looks for `yolov8n.engine`).
+2. **Auto-Promote**: If the `.engine` model is present and CUDA/TensorRT drivers are available, the detector automatically loads the optimized TensorRT engine for accelerated performance.
+3. **Resilient Fallback**: If the `.engine` file is missing, corrupted, compiled on a different GPU, or if TensorRT is not supported on the host system, the code:
+   - Prints a non-blocking warning log: `Failed to load TensorRT engine. Triggering automatic fallback...`
+   - Automatically loads the baseline `.pt` or `.onnx` file and continues normal execution without interruption.
+
+---
+
+## 📊 4. Running the Performance Benchmarks
+
+To measure the latency and FPS throughput improvements, we have upgraded `benchmark.py` to test and compare multiple formats.
+
+### Run Multi-Format Comparative Benchmark:
+```bash
+python benchmark.py --compare
+```
+
+This runs a simulated video pipeline that processes frames with the `.pt`, `.onnx`, and `.engine` models (any format whose model file is missing is skipped) and generates a unified report under:
+`docs/benchmarks/comparison_report.md`
+
+### Benchmark a Single Specific Model:
+```bash
+python benchmark.py --model yolov8n.engine
+```
+Report generated under:
+`docs/benchmarks/pipeline_benchmark.md`
diff --git a/scripts/export_tensorrt.py b/scripts/export_tensorrt.py
new file mode 100644
index 0000000..ecc1141
--- /dev/null
+++ b/scripts/export_tensorrt.py
@@ -0,0 +1,142 @@
+"""
+export_tensorrt.py — CLI tool to compile YOLO models to high-performance TensorRT (.engine) format.
+
+This script manages hardware validation (CUDA, GPU capability) and uses the Ultralytics 
+export engine wrapper to convert standard PyTorch (.pt) or ONNX (.onnx) files into 
+accelerated TensorRT engines tailored specifically to the host GPU.
+
+Usage:
+    python scripts/export_tensorrt.py --model yolov8n.pt --fp16
+"""
+
+import argparse
+import sys
+import os
+import logging
+from pathlib import Path
+
+# Configure logging: INFO level, timestamped "<time> - <level> - <message>" records.
+logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+logger = logging.getLogger(__name__)
+
+# torch and ultralytics are hard requirements for this script. Import them after
+# logging is configured so a missing dependency is reported as a readable error
+# and the process exits with status 1 instead of printing a traceback.
+try:
+    import torch
+    from ultralytics import YOLO
+except ImportError:
+    logger.error("Required libraries (torch, ultralytics) are missing. Please run: pip install ultralytics torch")
+    sys.exit(1)
+
+
+def _cuda_device_index(device: str) -> int:
+    """Return the index of the first CUDA device named by *device*.
+
+    Accepts ``"cuda"``, ``"cuda:1"``, a bare index such as ``"1"`` and
+    comma-separated lists such as ``"0,1"`` (the first entry is used).
+
+    Raises:
+        ValueError: If no integer index can be parsed from *device*.
+    """
+    spec = device.strip().lower()
+    if spec.startswith("cuda"):
+        spec = spec[len("cuda"):].lstrip(":")
+    if not spec:
+        # Plain "cuda" means the default device.
+        return 0
+    return int(spec.split(",")[0])
+
+
+def export_model(model_path: str, fp16: bool, int8: bool, imgsz: int, device: str) -> None:
+    """Compile a source model into a TensorRT ``.engine`` file via Ultralytics.
+
+    The function validates the environment first (non-CPU device, CUDA
+    available, device index resolvable), then the source file (exists, has a
+    ``.pt`` or ``.onnx`` suffix), and finally calls ``YOLO(...).export`` with
+    ``format="engine"`` and ``dynamic=True``. Every failure is logged and
+    terminates the process with exit status 1.
+
+    Args:
+        model_path: Path to the source model file.
+        fp16: Passed to the exporter as ``half``.
+        int8: Passed to the exporter as ``int8``.
+        imgsz: Passed to the exporter as ``imgsz``.
+        device: Target device string, e.g. ``"cuda:0"``, ``"cuda"`` or ``"0"``.
+
+    Raises:
+        SystemExit: With code 1 on any validation or export failure.
+    """
+    logger.info("Checking environment status for TensorRT export...")
+
+    # 1. Hardware verification
+    if "cpu" in device.lower():
+        logger.error("TensorRT compilation is NOT supported on CPU. Please specify a CUDA device (e.g., --device 0 or cuda).")
+        sys.exit(1)
+
+    if not torch.cuda.is_available():
+        logger.error("CUDA is not available on this machine. TensorRT requires an NVIDIA GPU with CUDA drivers.")
+        sys.exit(1)
+
+    # Check if GPU device is valid. Parsing happens inside the try block so a
+    # malformed spec (e.g. "cuda:x") is reported like any other invalid device
+    # instead of escaping as an unhandled ValueError.
+    try:
+        device_id = _cuda_device_index(device)
+        device_name = torch.cuda.get_device_name(device_id)
+        logger.info(f"Using NVIDIA GPU: {device_name} (Device ID: {device_id})")
+    except Exception as e:
+        logger.error(f"Invalid CUDA device specified: {device}. Error: {e}")
+        sys.exit(1)
+
+    # 2. File verification
+    model_file = Path(model_path)
+    if not model_file.exists():
+        logger.error(f"Source model file '{model_path}' not found!")
+        sys.exit(1)
+
+    # Compare the suffix case-insensitively so "MODEL.PT" is accepted as well.
+    # NOTE(review): confirm that the installed Ultralytics version can export
+    # from an .onnx source; export may be limited to PyTorch checkpoints.
+    if model_file.suffix.lower() not in (".pt", ".onnx"):
+        logger.error("Unsupported source format. Model must end with '.pt' or '.onnx'")
+        sys.exit(1)
+
+    logger.info(f"Loading source model: {model_path}...")
+    model = YOLO(model_path)
+
+    logger.info("Starting compilation to TensorRT (.engine) format...")
+    logger.info(f"Configuration: FP16={fp16}, INT8={int8}, Image Size={imgsz}, Target Device={device}")
+
+    try:
+        # Ultralytics natively wraps the ONNX -> TensorRT conversion process
+        exported_path = model.export(
+            format="engine",
+            half=fp16,
+            int8=int8,
+            imgsz=imgsz,
+            device=device,
+            dynamic=True  # Enables dynamic batching support
+        )
+        logger.info("========================================= SUCCESS =========================================")
+        logger.info("TensorRT Engine compiled and optimized successfully!")
+        logger.info(f"Saved optimized model to: {os.path.abspath(exported_path)}")
+        logger.info("===========================================================================================")
+
+    except Exception as e:
+        logger.error(f"An error occurred during TensorRT compilation: {e}")
+        logger.error(
+            "Please ensure you have the TensorRT Python API and CUDA toolkit properly installed. "
+            "Refer to docs/tensorrt_conversion.md for assistance."
+        )
+        sys.exit(1)
+
+
+def main():
+    """Parse the command-line options and run the TensorRT export.
+
+    FP16 is enabled by default; ``--no-fp16`` disables it. All parsed values are
+    forwarded unchanged to ``export_model``.
+    """
+    parser = argparse.ArgumentParser(
+        description="Compile YOLOv8/v9 models (.pt/.onnx) to highly-optimized TensorRT (.engine) format."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="yolov8n.pt",
+        help="Path to the source model (.pt or .onnx file) to compile."
+    )
+    # `--fp16` defaults to True, so the flag alone could never switch FP16 off.
+    # `--no-fp16` stores False into the same destination.
+    parser.add_argument(
+        "--fp16",
+        dest="fp16",
+        action="store_true",
+        default=True,
+        help="Enable FP16 (half-precision) float operations for faster inference (recommended)."
+    )
+    parser.add_argument(
+        "--no-fp16",
+        dest="fp16",
+        action="store_false",
+        help="Disable FP16 and build the engine without half-precision."
+    )
+    parser.add_argument(
+        "--int8",
+        action="store_true",
+        default=False,
+        help="Enable INT8 quantization (requires calibrating dataset)."
+    )
+    parser.add_argument(
+        "--imgsz",
+        type=int,
+        default=640,
+        help="Standard resolution (width/height) of input frames (default: 640)."
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default="cuda:0",
+        help="CUDA device to use for compilation (default: cuda:0)."
+    )
+
+    args = parser.parse_args()
+    export_model(
+        model_path=args.model,
+        fp16=args.fp16,
+        int8=args.int8,
+        imgsz=args.imgsz,
+        device=args.device
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/services/detection/detection.py b/services/detection/detection.py
index a26960d..417e147 100644
--- a/services/detection/detection.py
+++ b/services/detection/detection.py
@@ -62,10 +62,84 @@ def __init__(
         confidence_threshold: float = 0.45,
         device: str = "cpu",
     ) -> None:
-        logger.info(f"Loading YOLO model: {model_name} on {device}")
-        self.model = YOLO(model_name)
+        self.model_path = model_name
         self.conf = confidence_threshold
         self.device = device
+        
+        logger.info(f"Initializing Detector with config: model={model_name}, device={device}")
+        self._load_model_with_fallback()
+
+    def _load_model_with_fallback(self) -> None:
+        """Resolve which model file to load, preferring TensorRT and falling back.
+
+        Resolution order:
+          1. If the configured path is an ``.engine`` file, or a sibling
+             ``<stem>.engine`` exists next to it, and the device targets CUDA,
+             try to load that engine and stop on success.
+          2. Otherwise (or when the engine load raises) load the configured
+             ``.onnx`` / ``.pt`` file directly.
+          3. For any other extension (including an ``.engine`` request that could
+             not be honoured) look for ``<stem>.pt`` and then ``<stem>.onnx`` in
+             the same directory; if neither exists, hand the configured path to
+             the PyTorch loader as a last resort.
+
+        The device counts as CUDA when it contains ``"cuda"`` or is a bare GPU
+        index list such as ``"0"`` or ``"0,1"``. Extension checks are
+        case-insensitive.
+        """
+        path = Path(self.model_path)
+        base_name = path.stem
+        parent_dir = path.parent
+        model_lower = self.model_path.lower()
+        engine_requested = model_lower.endswith(".engine")
+
+        # Check for a matching .engine file in the same directory
+        engine_path = parent_dir / f"{base_name}.engine"
+
+        # Attempt TensorRT when it was asked for explicitly or a sibling engine exists.
+        if engine_requested or engine_path.exists():
+            resolved_engine_path = self.model_path if engine_requested else str(engine_path)
+
+            # TensorRT requires an NVIDIA GPU with CUDA. Bare index specs ("0",
+            # "0,1") also name GPUs, so they must not be mistaken for CPU.
+            device_spec = self.device.strip().lower()
+            targets_cuda = "cuda" in device_spec or device_spec.replace(",", "").replace(" ", "").isdigit()
+
+            if targets_cuda:
+                try:
+                    logger.info(f"Attempting optimized TensorRT engine load: {resolved_engine_path}")
+                    self.load_tensorrt_model(resolved_engine_path)
+                    return
+                except Exception as e:
+                    # Deliberate best-effort: any engine failure falls through to the standard formats.
+                    logger.warning(
+                        f"Failed to load TensorRT engine '{resolved_engine_path}': {e}. "
+                        f"Triggering automatic fallback to standard model format..."
+                    )
+            else:
+                logger.warning(
+                    f"TensorRT engine '{resolved_engine_path}' cannot run on non-CUDA device '{self.device}'. "
+                    f"Triggering automatic fallback to standard model format..."
+                )
+
+        # Main loader routing based on model extension
+        if model_lower.endswith(".onnx"):
+            self.load_onnx_model(self.model_path)
+            return
+        if model_lower.endswith(".pt"):
+            self.load_pytorch_model(self.model_path)
+            return
+
+        # If explicitly requested .engine failed or file is generic, seek compatible counterpart
+        pt_path = parent_dir / f"{base_name}.pt"
+        onnx_path = parent_dir / f"{base_name}.onnx"
+
+        if pt_path.exists():
+            logger.info(f"Auto-fallback: Loading counterpart PyTorch model: {pt_path}")
+            self.load_pytorch_model(str(pt_path))
+        elif onnx_path.exists():
+            logger.info(f"Auto-fallback: Loading counterpart ONNX model: {onnx_path}")
+            self.load_onnx_model(str(onnx_path))
+        else:
+            logger.info(f"No counterpart found. Loading default fallback model path: {self.model_path}")
+            self.load_pytorch_model(self.model_path)
+
+    def load_tensorrt_model(self, model_path: str) -> None:
+        """Build a detection-task YOLO wrapper for a TensorRT engine and keep it as ``self.model``.
+
+        Any exception raised by the YOLO constructor propagates to the caller and
+        leaves ``self.model`` untouched.
+        """
+        logger.info(f"Successfully routed to load_tensorrt_model: {model_path}")
+        engine_model = YOLO(model_path, task="detect")
+        self.model = engine_model
+
+    def load_onnx_model(self, model_path: str) -> None:
+        """Build a detection-task YOLO wrapper for an ONNX file and keep it as ``self.model``.
+
+        Any exception raised by the YOLO constructor propagates to the caller and
+        leaves ``self.model`` untouched.
+        """
+        logger.info(f"Successfully routed to load_onnx_model: {model_path}")
+        onnx_model = YOLO(model_path, task="detect")
+        self.model = onnx_model
+
+    def load_pytorch_model(self, model_path: str) -> None:
+        """Build a detection-task YOLO wrapper for a PyTorch checkpoint and keep it as ``self.model``.
+
+        Any exception raised by the YOLO constructor propagates to the caller and
+        leaves ``self.model`` untouched.
+        """
+        logger.info(f"Successfully routed to load_pytorch_model: {model_path}")
+        torch_model = YOLO(model_path, task="detect")
+        self.model = torch_model
+
 
     def detect(self, frame: np.ndarray, frame_id: int = 0) -> DetectionFrame:
         """
diff --git a/services/detection/trt_utils.py b/services/detection/trt_utils.py
new file mode 100644
index 0000000..a2f8b57
--- /dev/null
+++ b/services/detection/trt_utils.py
@@ -0,0 +1,171 @@
+"""
+trt_utils.py — Low-level TensorRT inference utilities for Eagle.
+
+Provides the `TensorRTInference` class to load and execute serialized .engine files 
+directly on NVIDIA GPUs with optimized CUDA bindings, including memory management 
+and asynchronous stream coordination.
+"""
+
+from __future__ import annotations
+import logging
+import numpy as np
+
+logger = logging.getLogger(__name__)
+
+# Safe imports to prevent crashes on systems without NVIDIA drivers/TensorRT installed.
+# Each guard leaves the module name bound (to None on failure) plus a boolean flag,
+# so importing this file never fails and callers can test availability up front.
+try:
+    import tensorrt as trt
+    TRT_AVAILABLE = True
+except ImportError:
+    trt = None
+    TRT_AVAILABLE = False
+
+try:
+    import pycuda.driver as cuda
+    import pycuda.autoinit  # noqa: F401 - imported for its side effect (CUDA context creation/destruction)
+    CUDA_AVAILABLE = True
+except Exception:
+    # Deliberately broader than ImportError: besides a missing PyCUDA install,
+    # initialising the driver/context at import time can raise driver errors that
+    # are not ImportError (e.g. when no usable CUDA device is present). Those
+    # must also degrade to "CUDA unavailable" rather than abort the import.
+    cuda = None
+    CUDA_AVAILABLE = False
+
+
+class TensorRTInference:
+    """
+    Loads a serialized TensorRT ``.engine`` file, allocates one pagelocked host
+    buffer and one device buffer per binding, and runs inference on a CUDA stream.
+
+    Only the first input binding is fed by ``infer``; engines with several
+    inputs are not supported by this class.
+    """
+    def __init__(self, engine_path: str) -> None:
+        """
+        Initialize the TensorRT inference engine.
+
+        Args:
+            engine_path: Path to the serialized `.engine` model file.
+
+        Raises:
+            ImportError: If TensorRT or PyCUDA could not be imported.
+            RuntimeError: If the engine cannot be deserialized or no execution
+                context can be created.
+            ValueError: If a binding has a dynamic non-batch dimension.
+        """
+        # Create the containers before anything can raise so that __del__ is
+        # safe to run on a partially constructed instance.
+        self.inputs = []
+        self.outputs = []
+        self.bindings = []
+
+        if not TRT_AVAILABLE:
+            raise ImportError(
+                "TensorRT python package is not installed. "
+                "Please install tensorrt using: pip install tensorrt"
+            )
+        if not CUDA_AVAILABLE:
+            raise ImportError(
+                "PyCUDA is not installed or CUDA is unavailable. "
+                "Please install pycuda using: pip install pycuda"
+            )
+
+        self.engine_path = engine_path
+        self.logger = trt.Logger(trt.Logger.WARNING)
+
+        logger.info(f"Deserializing TensorRT Engine: {self.engine_path}")
+        with open(self.engine_path, "rb") as f, trt.Runtime(self.logger) as runtime:
+            self.engine = runtime.deserialize_cuda_engine(f.read())
+
+        if self.engine is None:
+            raise RuntimeError(f"Failed to deserialize TensorRT engine from {self.engine_path}")
+
+        self.context = self.engine.create_execution_context()
+        if self.context is None:
+            raise RuntimeError(f"Failed to create TensorRT execution context for {self.engine_path}")
+
+        self.stream = cuda.Stream()
+
+        self._allocate_buffers()
+        logger.info(f"TensorRT Engine loaded successfully. Inputs: {len(self.inputs)}, Outputs: {len(self.outputs)}")
+
+    def _allocate_buffers(self) -> None:
+        """
+        Query binding metadata from the engine and allocate pinned/pagelocked
+        host memory and GPU device buffers for each input/output tensor.
+
+        A dynamic (-1) leading dimension is replaced by the engine's
+        ``max_batch_size`` (at least 1). Any other dynamic dimension is rejected
+        because the buffer size cannot be derived from it.
+
+        Raises:
+            ValueError: If a binding keeps a negative dimension after the batch
+                dimension has been resolved.
+        """
+        # Determine maximum batch size
+        max_batch_size = 1
+        if hasattr(self.engine, "max_batch_size"):
+            max_batch_size = max(1, self.engine.max_batch_size)
+
+        for binding in self.engine:
+            # Work on a plain tuple so slicing and concatenation behave predictably.
+            shape = tuple(self.engine.get_binding_shape(binding))
+            # Handle dynamic/undefined batch dimension
+            if shape and shape[0] == -1:
+                shape = (max_batch_size,) + shape[1:]
+            if any(dim < 0 for dim in shape):
+                # NOTE(review): supporting fully dynamic engines presumably needs
+                # the optimization-profile shapes and an explicit input shape on
+                # the execution context — confirm against the TensorRT docs.
+                raise ValueError(
+                    f"Binding '{binding}' has an unresolved dynamic shape {shape}; "
+                    "only a dynamic batch dimension is supported."
+                )
+
+            size = trt.volume(shape)
+            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+
+            # Pinned/pagelocked host memory for faster DMA transfers
+            host_mem = cuda.pagelocked_empty(size, dtype)
+            # CUDA device memory allocation
+            device_mem = cuda.mem_alloc(host_mem.nbytes)
+
+            self.bindings.append(int(device_mem))
+
+            binding_info = {
+                "host": host_mem,
+                "device": device_mem,
+                "name": binding,
+                "dtype": dtype,
+                "shape": shape
+            }
+
+            if self.engine.binding_is_input(binding):
+                self.inputs.append(binding_info)
+            else:
+                self.outputs.append(binding_info)
+
+    def infer(self, input_data: np.ndarray) -> list[np.ndarray]:
+        """
+        Performs synchronized inference on a preprocessed input frame.
+
+        Args:
+            input_data: Preprocessed input image numpy array. Its total element
+                count must equal that of the first input binding.
+
+        Returns:
+            A list of numpy arrays, one per output binding, reshaped to the
+            binding shape. Each array is an independent copy, so it stays valid
+            after later ``infer`` calls.
+
+        Raises:
+            ValueError: If the engine has no input binding or the element count
+                of ``input_data`` does not match the input buffer.
+        """
+        if not self.inputs:
+            raise ValueError("No input bindings allocated in the TensorRT engine.")
+
+        input_info = self.inputs[0]
+        host_in = input_info["host"]
+        flat_input = input_data.ravel()
+        # np.copyto broadcasts, so an undersized input (e.g. one element) would be
+        # silently replicated across the whole buffer; reject size mismatches.
+        if flat_input.size != host_in.size:
+            raise ValueError(
+                f"Input has {flat_input.size} elements but binding '{input_info['name']}' "
+                f"expects {host_in.size} (shape {input_info['shape']})."
+            )
+        # Fast copy to pagelocked host buffer
+        np.copyto(host_in, flat_input)
+
+        # Host to Device transfer (Asynchronous)
+        cuda.memcpy_htod_async(input_info["device"], host_in, self.stream)
+
+        # Enqueue inference execution context
+        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
+
+        # Device to Host transfer (Asynchronous)
+        for out in self.outputs:
+            cuda.memcpy_dtoh_async(out["host"], out["device"], self.stream)
+
+        # Synchronize CPU and GPU stream execution
+        self.stream.synchronize()
+
+        # Reshape output vectors back to multi-dimensional tensors. The host
+        # buffers are reused by every call, so hand out copies, not views.
+        return [out["host"].reshape(out["shape"]).copy() for out in self.outputs]
+
+    def __del__(self) -> None:
+        """
+        Drops the references to the binding records when the object is garbage
+        collected. Tolerates instances whose ``__init__`` did not complete.
+        """
+        for attr in ("bindings", "inputs", "outputs"):
+            held = getattr(self, attr, None)
+            if held is not None:
+                held.clear()
+
+
+def is_tensorrt_supported() -> bool:
+    """
+    Report whether this module managed to import both TensorRT and PyCUDA.
+
+    Returns:
+        True only when the module-level flags TRT_AVAILABLE and CUDA_AVAILABLE
+        are both set, False otherwise.
+    """
+    if not TRT_AVAILABLE:
+        return False
+    return CUDA_AVAILABLE
diff --git a/tests/test_tensorrt_routing.py b/tests/test_tensorrt_routing.py
new file mode 100644
index 0000000..ea9f5f9
--- /dev/null
+++ b/tests/test_tensorrt_routing.py
@@ -0,0 +1,78 @@
+"""
+test_tensorrt_routing.py — Unit tests verifying model loader routing and auto-fallback behavior.
+"""
+
+import sys
+import types
+from pathlib import Path
+
+# Register a synthetic top-level module named 'Eagle' whose package search path
+# is the repository root (the parent of this tests/ directory), unless a module
+# with that name is already loaded. This must run before the project imports below.
+# NOTE(review): presumably some project code imports via `Eagle.<package>` — confirm.
+if "Eagle" not in sys.modules:
+    eagle_mod = types.ModuleType("Eagle")
+    eagle_mod.__path__ = [str(Path(__file__).resolve().parents[1])]
+    sys.modules["Eagle"] = eagle_mod
+
+import pytest
+from unittest.mock import MagicMock, patch
+from services.detection.detection import Detector
+
+
+@pytest.fixture
+def mock_yolo():
+    """Replace the YOLO class referenced by the detection module with a mock for one test."""
+    yolo_patcher = patch("services.detection.detection.YOLO")
+    yolo_cls = yolo_patcher.start()
+    try:
+        yield yolo_cls
+    finally:
+        # Always restore the real class, even if the test body raised.
+        yolo_patcher.stop()
+
+
+def test_routing_pytorch(mock_yolo):
+    """A `.pt` path on a CPU device is handed to YOLO unchanged."""
+    weights = "yolov8n.pt"
+    det = Detector(model_name=weights, device="cpu")
+    assert det.model_path == weights
+    mock_yolo.assert_called_with(weights, task="detect")
+    
+
+def test_routing_onnx(mock_yolo):
+    """An `.onnx` path on a CPU device is handed to YOLO unchanged."""
+    weights = "yolov8n.onnx"
+    det = Detector(model_name=weights, device="cpu")
+    assert det.model_path == weights
+    mock_yolo.assert_called_with(weights, task="detect")
+
+
+def test_routing_engine_success(mock_yolo):
+    """An `.engine` path with a CUDA device is loaded directly as the TensorRT engine."""
+    engine_file = "yolov8n.engine"
+    # Every Path.exists() lookup reports True for the duration of the test.
+    with patch("services.detection.detection.Path.exists", return_value=True):
+        det = Detector(model_name=engine_file, device="cuda:0")
+        assert det.model_path == engine_file
+        mock_yolo.assert_called_with(engine_file, task="detect")
+
+
+def test_routing_engine_cpu_fallback(mock_yolo):
+    """Requesting an `.engine` on CPU falls back to the sibling `.pt` file."""
+    # A plain function (not a Mock) is patched in so it receives the Path instance.
+    def only_pt_exists(candidate):
+        return str(candidate).endswith(".pt")
+
+    with patch("services.detection.detection.Path.exists", only_pt_exists):
+        Detector(model_name="yolov8n.engine", device="cpu")
+        # The last YOLO construction must be for the PyTorch counterpart.
+        mock_yolo.assert_called_with("yolov8n.pt", task="detect")
+
+
+def test_routing_engine_load_failure_fallback(mock_yolo):
+    """When loading the `.engine` raises on CUDA, the detector falls back to `.pt`."""
+    def engine_and_pt_exist(candidate):
+        name = str(candidate)
+        return name.endswith(".pt") or name.endswith(".engine")
+
+    # Constructing YOLO for an engine file fails (simulating driver mismatch or
+    # a corrupt engine); every other path yields a usable mock model.
+    def fail_on_engine(path, task=None):
+        if path.endswith(".engine"):
+            raise RuntimeError("Cuda driver mismatch")
+        return MagicMock()
+
+    with patch("services.detection.detection.Path.exists", engine_and_pt_exist):
+        mock_yolo.side_effect = fail_on_engine
+
+        Detector(model_name="yolov8n.engine", device="cuda:0")
+        # The last YOLO construction must be for the PyTorch counterpart.
+        mock_yolo.assert_called_with("yolov8n.pt", task="detect")
+