Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ weights/
**/.DS_Store
docs/

uv.lock
*.ply
*.glb
*.7z
export_data.bin
lingbot-map-long.pt
42 changes: 42 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,48 @@ pip install --index-url https://pypi.org/simple flashinfer-python
pip install -e ".[vis]"
```

### 💚 Alternative: `uv` for faster installs (instead of conda)

The `pyproject.toml` is fully compatible with [`uv`](https://github.com/astral-sh/uv), a fast Python package installer and virtual environment manager written in Rust. It mirrors the conda setup above but resolves and installs dependencies orders of magnitude faster:

**1. Create uv virtual environment**

```bash
curl -LsSf https://astral.sh/uv/install.sh | sh
uv venv --python 3.12
source .venv/bin/activate
```

**2. Install core package (installs all dependencies from `pyproject.toml`)**

```bash
uv sync
```

This installs the package in editable mode (`-e .`) plus all listed `[project]` dependencies — including `torch`, `torchvision`, `flashinfer-python`, `Pillow`, `huggingface_hub`, `einops`, `safetensors`, `opencv-python`, `tqdm`, `scipy`, and `flashinfer-cubin`.

> **Note:** PyTorch version pinning (`2.8.0` with CUDA 12.8) is currently specified in the conda guide for Kaolin compatibility. If you need that exact pin, install it first:
> ```bash
> uv pip install torch==2.8.0 torchvision==0.23.0 \
> --index-url https://download.pytorch.org/whl/cu128
> ```

**3. Optional — visualization dependencies**

```bash
uv sync --extra vis
```

Installs the `vis` extra defined under `[project.optional-dependencies]` (`viser`, `trimesh`, `matplotlib`, `onnxruntime-gpu`, `requests`).

**4. Optional — demo extra (pulls in `vis`)**

```bash
uv sync --extra demo
```

> `uv` manages the virtual environment entirely through PEP 621 metadata from `pyproject.toml` — no `environment.yml`, no conda channels needed. It is fully compatible with the existing conda instructions above; pick whichever tooling you prefer.

## 📦 Model Download

| Model Name | Huggingface Repository | ModelScope Repository | Description |
Expand Down
30 changes: 21 additions & 9 deletions demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import sys
import tempfile
import time
from pathlib import Path

# Must be set before `import torch` / any CUDA init. Reduces the reserved-vs-allocated
# memory gap by letting the caching allocator grow segments on demand instead of
Expand Down Expand Up @@ -88,10 +89,11 @@ def load_images(image_folder=None, video_path=None, fps=10, image_ext=".jpg,.png
resolved_folder = out_dir
print(f"Extracted {len(paths)} frames from video ({total_frames} total, interval={interval})")
else:
assert image_folder is not None
exts = image_ext.split(",")
paths = []
for ext in exts:
paths.extend(glob.glob(os.path.join(image_folder, f"*{ext}")))
paths.extend(glob.glob(str(Path(image_folder) / f"*{ext}")))
paths = sorted(paths)
resolved_folder = image_folder

Expand All @@ -106,7 +108,7 @@ def load_images(image_folder=None, video_path=None, fps=10, image_ext=".jpg,.png
# Image.ROTATE_270 = lossless 90° clockwise (270° counter-clockwise) reordering.
for p in tqdm(paths, desc="Rotating images 90° CW"):
out_path = os.path.join(rotated_dir, os.path.basename(p))
Image.open(p).transpose(Image.ROTATE_270).save(out_path)
Image.open(p).transpose(Image.Transpose.ROTATE_270).save(out_path)
rotated_paths.append(out_path)
paths = rotated_paths
resolved_folder = rotated_dir
Expand Down Expand Up @@ -211,10 +213,15 @@ def _warm_streaming(model, images, scale_frames, warm_stream_n, dtype,
warm_stream_n = max(1, min(int(warm_stream_n), num_avail - scale_frames))
kf_int = max(int(keyframe_interval), 1)

# images: [S, 3, H, W] on device already; slice + add batch dim, no copy of
# spatial dims so warmup shape == real inference shape (H, W).
warm_scale = images[:scale_frames].unsqueeze(0).to(dtype)
warm_stream = images[scale_frames:scale_frames + warm_stream_n].unsqueeze(0).to(dtype)
# images may live on CPU; move only the warmup slices to the model device so
# long videos do not become persistent GPU residents before inference starts.
_model_device = next(model.parameters()).device
warm_scale = images[:scale_frames].unsqueeze(0).to(
device=_model_device, dtype=dtype, non_blocking=True
)
warm_stream = images[scale_frames:scale_frames + warm_stream_n].unsqueeze(0).to(
device=_model_device, dtype=dtype, non_blocking=True
)

for _ in range(passes):
model.clean_kv_cache()
Expand Down Expand Up @@ -387,7 +394,7 @@ def main():
parser.add_argument(
"--offload_to_cpu",
action=argparse.BooleanOptionalAction,
default=False,
default=True,
help="Offload per-frame predictions to CPU during inference to cut GPU peak memory "
"(on by default). Use --no-offload_to_cpu to keep outputs on GPU.",
)
Expand Down Expand Up @@ -459,14 +466,13 @@ def main():
print(f"Casting aggregator to {dtype} (heads kept in fp32)")
model.aggregator = model.aggregator.to(dtype=dtype)

images = images.to(device)
num_frames = images.shape[0]
print(f"Input: {num_frames} frames, shape {tuple(images.shape)}")
print(f"Mode: {args.mode}")
if torch.cuda.is_available():
torch.cuda.empty_cache()
print(
f"GPU mem after load: "
f"GPU mem after model load: "
f"alloc={torch.cuda.memory_allocated()/1e9:.2f} GB, "
f"reserved={torch.cuda.memory_reserved()/1e9:.2f} GB"
)
Expand Down Expand Up @@ -498,6 +504,12 @@ def main():
f"(window_size={args.window_size} keyframes, scale={args.num_scale_frames})."
)

if not args.offload_to_cpu and (args.mode == "windowed" or num_frames > 512):
print(
"Warning: --no-offload_to_cpu keeps the full prediction history on the GPU; "
"long sequences can OOM even when KV cache growth is bounded."
)

# ── Optional: torch.compile + CUDA-graph warmup (streaming only) ────────
if args.compile:
if args.mode != "streaming":
Expand Down
10 changes: 5 additions & 5 deletions gct_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ def fps(ms):
ms_lo, ms_mid, ms_hi = avg_ms(p_lo), avg_ms(p_mid), avg_ms(p_hi)

print(f"\n [{label}] ({total_frames} total frames: {scale_frames} scale + {n} streaming)")
print(f" ── Global FPS ─────────────────────────────────────")
print(" ── Global FPS ─────────────────────────────────────")
print(f" total time: {total_ms / 1000:.2f} s "
f"({phase1_ms:.1f} ms phase1 + {total_ms - phase1_ms:.1f} ms phase2)")
print(f" per frame : {global_ms_per_frame:6.2f} ms → {global_fps:6.2f} FPS")
print(f" ── Windowed FPS (±30 streaming frames) ────────────")
print(" ── Windowed FPS (±30 streaming frames) ────────────")
print(f" frame {scale_frames + p_lo:>5d} (10%): {ms_lo:6.2f} ms → {fps(ms_lo):6.2f} FPS")
print(f" frame {scale_frames + p_mid:>5d} (50%): {ms_mid:6.2f} ms → {fps(ms_mid):6.2f} FPS")
print(f" frame {scale_frames + p_hi:>5d} (90%): {ms_hi:6.2f} ms → {fps(ms_hi):6.2f} FPS")
Expand All @@ -182,7 +182,7 @@ def fps(ms):
# original script. This naturally skips the cold first streaming frame
# (global index = scale_frames), whose ms is dominated by one-time CUDA
# graph (re)capture after `clean_kv_cache()` in profile_streaming.
print(f" ── FPS trace (every 100 global frames) ────────────")
print(" ── FPS trace (every 100 global frames) ────────────")
first_trace = (100 - scale_frames) % 100 or 100
for i in range(first_trace, n, 100):
ms_i = avg_ms(i, window=3)
Expand Down Expand Up @@ -344,12 +344,12 @@ def _warm(m, passes=1):
_warm(model)

if args.compile:
print(f" Compiling hot modules...")
print(" Compiling hot modules...")
compile_model(model)
# Three passes under compile: 1st captures CUDA graphs, 2nd/3rd
# replay so the caching allocator and graph-address map converge
# on the exact state the subsequent profile will see.
print(f" Warmup compiled (3× dress rehearsal)...")
print(" Warmup compiled (3× dress rehearsal)...")
_warm(model, passes=3)
else:
# No compile → a single dress-rehearsal pass is enough to
Expand Down
Loading