Robbyant · nvandamme · Jul 1, 2026
diff --git a/.gitignore b/.gitignore
@@ -17,3 +17,9 @@ weights/
 **/.DS_Store
 docs/
 
+uv.lock
+*.ply
+*.glb
+*.7z
+export_data.bin
+lingbot-map-long.pt
diff --git a/README.md b/README.md
@@ -129,6 +129,48 @@ pip install --index-url https://pypi.org/simple flashinfer-python
 pip install -e ".[vis]"
 ```
 
+### 💚 Alternative: `uv` for faster installs (conda alternative)
+
+The `pyproject.toml` is fully compatible with [`uv`](https://github.com/astral-sh/uv), a fast Python package installer and virtual environment manager written in Rust. It mirrors the conda setup above but resolves and installs dependencies orders of magnitude faster:
+
+**1. Create uv virtual environment**
+
+```bash
+curl -LsSf https://astral.sh/uv/install.sh | sh
+uv venv --python 3.12
+source .venv/bin/activate
+```
+
+**2. Install core package (installs all dependencies from `pyproject.toml`)**
+
+```bash
+uv sync
+```
+
+This installs the package in editable mode (`-e .`) plus all listed `[project]` dependencies — including `torch`, `torchvision`, `flashinfer-python`, `Pillow`, `huggingface_hub`, `einops`, `safetensors`, `opencv-python`, `tqdm`, `scipy`, and `flashinfer-cubin`.
+
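+> **Note:** PyTorch version pinning (`2.8.0` with CUDA 12.8) is currently specified in the conda guide for Kaolin compatibility. If you need that exact pin, install it after `uv sync` (a later `uv sync` re-syncs the environment to the lockfile and may replace it, so re-run this command afterwards):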
+> ```bash
+> uv pip install torch==2.8.0 torchvision==0.23.0 \
+>     --index-url https://download.pytorch.org/whl/cu128
+> ```
+
+**3. Optional — visualization dependencies**
+
+```bash
+uv sync --extra vis
+```
+
+Installs `[project.optional-dependencies.vis]` (`viser`, `trimesh`, `matplotlib`, `onnxruntime-gpu`, `requests`).
+
+**4. Optional — demo extra (pulls in `vis`)**
+
+```bash
+uv sync --extra demo
+```
+
+> `uv` manages the virtual environment entirely through PEP 621 metadata from `pyproject.toml` — no `environment.yml`, no conda channels needed. It is fully compatible with the existing conda instructions above; pick whichever tooling you prefer.
+
 ## 📦 Model Download
 
 | Model Name | Huggingface Repository | ModelScope Repository | Description |

diff --git a/demo.py b/demo.py
@@ -24,6 +24,7 @@
 import sys
 import tempfile
 import time
+from pathlib import Path
 
 # Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated
 # memory gap by letting the caching allocator grow segments on demand instead of
@@ -88,10 +89,11 @@ def load_images(image_folder=None, video_path=None, fps=10, image_ext=".jpg,.png
         resolved_folder = out_dir
         print(f"Extracted {len(paths)} frames from video ({total_frames} total, interval={interval})")
     else:
+        assert image_folder is not None
         exts = image_ext.split(",")
         paths = []
         for ext in exts:
-            paths.extend(glob.glob(os.path.join(image_folder, f"*{ext}")))
+            paths.extend(glob.glob(str(Path(image_folder) / f"*{ext}")))
         paths = sorted(paths)
         resolved_folder = image_folder
 
@@ -106,7 +108,7 @@ def load_images(image_folder=None, video_path=None, fps=10, image_ext=".jpg,.png
         # Image.ROTATE_270 = lossless 90° clockwise (270° counter-clockwise) reordering.
         for p in tqdm(paths, desc="Rotating images 90° CW"):
             out_path = os.path.join(rotated_dir, os.path.basename(p))
-            Image.open(p).transpose(Image.ROTATE_270).save(out_path)
+            Image.open(p).transpose(Image.Transpose.ROTATE_270).save(out_path)
             rotated_paths.append(out_path)
         paths = rotated_paths
         resolved_folder = rotated_dir
@@ -211,10 +213,15 @@ def _warm_streaming(model, images, scale_frames, warm_stream_n, dtype,
     warm_stream_n = max(1, min(int(warm_stream_n), num_avail - scale_frames))
     kf_int = max(int(keyframe_interval), 1)
 
-    # images: [S, 3, H, W] on device already; slice + add batch dim, no copy of
-    # spatial dims so warmup shape == real inference shape (H, W).
-    warm_scale = images[:scale_frames].unsqueeze(0).to(dtype)
-    warm_stream = images[scale_frames:scale_frames + warm_stream_n].unsqueeze(0).to(dtype)
+    # images may live on CPU; move only the warmup slices to the model device so
+    # long videos do not become persistent GPU residents before inference starts.
+    _model_device = next(model.parameters()).device
+    warm_scale = images[:scale_frames].unsqueeze(0).to(
+        device=_model_device, dtype=dtype, non_blocking=True
+    )
+    warm_stream = images[scale_frames:scale_frames + warm_stream_n].unsqueeze(0).to(
+        device=_model_device, dtype=dtype, non_blocking=True
+    )
 
     for _ in range(passes):
         model.clean_kv_cache()
@@ -387,7 +394,7 @@ def main():
     parser.add_argument(
         "--offload_to_cpu",
         action=argparse.BooleanOptionalAction,
-        default=False,
+        default=True,
         help="Offload per-frame predictions to CPU during inference to cut GPU peak memory "
             "(on by default).  Use --no-offload_to_cpu to keep outputs on GPU.",
     )
@@ -459,14 +466,13 @@ def main():
         print(f"Casting aggregator to {dtype} (heads kept in fp32)")
         model.aggregator = model.aggregator.to(dtype=dtype)
 
-    images = images.to(device)
     num_frames = images.shape[0]
     print(f"Input: {num_frames} frames, shape {tuple(images.shape)}")
     print(f"Mode: {args.mode}")
     if torch.cuda.is_available():
         torch.cuda.empty_cache()
         print(
-            f"GPU mem after load: "
+            f"GPU mem after model load: "
             f"alloc={torch.cuda.memory_allocated()/1e9:.2f} GB, "
             f"reserved={torch.cuda.memory_reserved()/1e9:.2f} GB"
         )
@@ -498,6 +504,12 @@ def main():
                 f"(window_size={args.window_size} keyframes, scale={args.num_scale_frames})."
             )
 
+    if not args.offload_to_cpu and (args.mode == "windowed" or num_frames > 512):
+        print(
+            "Warning: --no-offload_to_cpu keeps the full prediction history on the GPU; "
+            "long sequences can OOM even when KV cache growth is bounded."
+        )
+
     # ── Optional: torch.compile + CUDA-graph warmup (streaming only) ────────
     if args.compile:
         if args.mode != "streaming":

diff --git a/gct_profile.py b/gct_profile.py
@@ -169,11 +169,11 @@ def fps(ms):
     ms_lo, ms_mid, ms_hi = avg_ms(p_lo), avg_ms(p_mid), avg_ms(p_hi)
 
     print(f"\n  [{label}]  ({total_frames} total frames: {scale_frames} scale + {n} streaming)")
-    print(f"    ── Global FPS ─────────────────────────────────────")
+    print("    ── Global FPS ─────────────────────────────────────")
     print(f"      total time: {total_ms / 1000:.2f} s  "
           f"({phase1_ms:.1f} ms phase1 + {total_ms - phase1_ms:.1f} ms phase2)")
     print(f"      per frame : {global_ms_per_frame:6.2f} ms  →  {global_fps:6.2f} FPS")
-    print(f"    ── Windowed FPS (±30 streaming frames) ────────────")
+    print("    ── Windowed FPS (±30 streaming frames) ────────────")
     print(f"      frame {scale_frames + p_lo:>5d} (10%): {ms_lo:6.2f} ms  →  {fps(ms_lo):6.2f} FPS")
     print(f"      frame {scale_frames + p_mid:>5d} (50%): {ms_mid:6.2f} ms  →  {fps(ms_mid):6.2f} FPS")
     print(f"      frame {scale_frames + p_hi:>5d} (90%): {ms_hi:6.2f} ms  →  {fps(ms_hi):6.2f} FPS")
@@ -182,7 +182,7 @@ def fps(ms):
     # original script. This naturally skips the cold first streaming frame
     # (global index = scale_frames), whose ms is dominated by one-time CUDA
     # graph (re)capture after `clean_kv_cache()` in profile_streaming.
-    print(f"    ── FPS trace (every 100 global frames) ────────────")
+    print("    ── FPS trace (every 100 global frames) ────────────")
     first_trace = (100 - scale_frames) % 100 or 100
     for i in range(first_trace, n, 100):
         ms_i = avg_ms(i, window=3)
@@ -344,12 +344,12 @@ def _warm(m, passes=1):
             _warm(model)
 
             if args.compile:
-                print(f"  Compiling hot modules...")
+                print("  Compiling hot modules...")
                 compile_model(model)
                 # Three passes under compile: 1st captures CUDA graphs, 2nd/3rd
                 # replay so the caching allocator and graph-address map converge
                 # on the exact state the subsequent profile will see.
-                print(f"  Warmup compiled (3× dress rehearsal)...")
+                print("  Warmup compiled (3× dress rehearsal)...")
                 _warm(model, passes=3)
             else:
                 # No compile → a single dress-rehearsal pass is enough to