From 2586304510d5620727db58bc931494a5f90a1ceb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 08:43:38 +0000 Subject: [PATCH 1/8] feat: add GitHub Copilot orchestrator as alternative LLM backend Agent-Logs-Url: https://github.com/Delgerskhn/video-use/sessions/d0dd1259-9f0d-4c67-a4d1-f368300227c1 Co-authored-by: Delgerskhn <57222574+Delgerskhn@users.noreply.github.com> --- .env.example | 5 + README.md | 44 +++ orchestrator.py | 742 ++++++++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 + 4 files changed, 792 insertions(+) create mode 100644 orchestrator.py diff --git a/.env.example b/.env.example index 4c49a94..f300000 100644 --- a/.env.example +++ b/.env.example @@ -1 +1,6 @@ ELEVENLABS_API_KEY= + +# Required for the GitHub Copilot orchestrator (orchestrator.py). +# Create a Personal Access Token at https://github.com/settings/tokens +# with the 'copilot' scope, then paste it here or export it in your shell. +GITHUB_TOKEN= diff --git a/README.md b/README.md index 59cc112..adf9c37 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,12 @@ Drop raw footage in a folder, chat with Claude Code, get `final.mp4` back. Works ## Get started +Two ways to run video-use — pick the one that matches your subscription: + +### Option A — Claude Code (original) + +Requires an Anthropic API subscription or Claude Pro. + ```bash # 1. Clone and symlink into Claude Code's skills directory git clone https://github.com/browser-use/video-use @@ -43,6 +49,44 @@ cd /path/to/your/videos claude ``` +### Option B — GitHub Copilot (no Anthropic key required) + +Uses your existing GitHub Copilot subscription as the LLM backend via the +OpenAI-compatible Copilot API. Same pipeline, same production rules, same helpers. + +```bash +# 1. Clone the repo +git clone https://github.com/browser-use/video-use +cd video-use + +# 2. Install deps (includes the openai SDK) +pip install -e ".[copilot]" +brew install ffmpeg # required +brew install yt-dlp # optional + +# 3. Configure API keys +cp .env.example .env +$EDITOR .env +# ELEVENLABS_API_KEY=... ← for transcription (same as before) +# GITHUB_TOKEN=... ← PAT with 'copilot' scope +# https://github.com/settings/tokens +``` + +Then run the orchestrator against your video folder: + +```bash +python /path/to/video-use/orchestrator.py /path/to/your/videos +``` + +Available options: + +``` +--model gpt-4o # default; also: gpt-4o-mini, claude-3.5-sonnet, o3-mini +--endpoint # default: https://api.githubcopilot.com + # GitHub Models alternative: https://models.inference.ai.azure.com +--max-turns 100 # safety cap on LLM turns (default: 100) +``` + And in the session: > edit these into a launch video diff --git a/orchestrator.py b/orchestrator.py new file mode 100644 index 0000000..964fd2d --- /dev/null +++ b/orchestrator.py @@ -0,0 +1,742 @@ +#!/usr/bin/env python3 +"""GitHub Copilot-backed video editing orchestrator for video-use. + +Replaces the `claude` CLI runtime with a standalone Python script that drives +the same video editing pipeline using the GitHub Copilot API (OpenAI-compatible). +All 12 hard production rules from SKILL.md are enforced via the same system +prompt — no logic changes to the skill or helpers are needed. + +Requirements: + pip install -e ".[copilot]" # openai>=1.0 + export GITHUB_TOKEN= # PAT with `copilot` scope + ELEVENLABS_API_KEY=... in .env # for transcription (same as before) + ffmpeg and ffprobe on PATH + +Usage: + python orchestrator.py /path/to/videos + python orchestrator.py /path/to/videos --model gpt-4o + python orchestrator.py /path/to/videos --endpoint https://models.inference.ai.azure.com +""" + +from __future__ import annotations + +import argparse +import base64 +import json +import os +import subprocess +import sys +import tempfile +from pathlib import Path + +# --------------------------------------------------------------------------- +# Repo-relative paths +# --------------------------------------------------------------------------- + +REPO_ROOT = Path(__file__).resolve().parent +HELPERS_DIR = REPO_ROOT / "helpers" +SKILL_MD = REPO_ROOT / "SKILL.md" + +# --------------------------------------------------------------------------- +# System prompt +# --------------------------------------------------------------------------- + + +def load_skill_prompt() -> str: + """Read SKILL.md and strip the YAML front matter used by Claude Code.""" + text = SKILL_MD.read_text() + if text.startswith("---"): + end = text.find("---", 3) + if end != -1: + text = text[end + 3 :].lstrip("\n") + return text + + +# Maximum characters returned from a single tool call before truncation. +# Keeps large files (packed transcripts, long ffmpeg logs) from consuming +# the entire context window. +MAX_TOOL_RESULT_LENGTH = 20_000 + +# --------------------------------------------------------------------------- +# Tool schemas (OpenAI function-calling format) +# --------------------------------------------------------------------------- + +TOOLS: list[dict] = [ + { + "type": "function", + "function": { + "name": "transcribe", + "description": ( + "Transcribe a single video with ElevenLabs Scribe. " + "Writes word-level transcript JSON to edit/transcripts/.json. " + "Cached — skips upload if the JSON already exists." + ), + "parameters": { + "type": "object", + "properties": { + "video_path": { + "type": "string", + "description": "Absolute path to the video file.", + }, + "edit_dir": { + "type": "string", + "description": "Edit output directory. Defaults to /edit.", + }, + "language": { + "type": "string", + "description": "ISO language code (e.g. 'en'). Omit to auto-detect.", + }, + "num_speakers": { + "type": "integer", + "description": "Number of speakers. Improves diarization when known.", + }, + }, + "required": ["video_path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "transcribe_batch", + "description": ( + "Batch-transcribe every video in a directory using parallel workers. " + "Cached per source — already-transcribed files are skipped." + ), + "parameters": { + "type": "object", + "properties": { + "videos_dir": { + "type": "string", + "description": "Directory containing source videos.", + }, + "workers": { + "type": "integer", + "description": "Parallel workers (default 4).", + }, + "edit_dir": { + "type": "string", + "description": "Override edit output directory.", + }, + "num_speakers": { + "type": "integer", + "description": "Number of speakers (optional).", + }, + }, + "required": ["videos_dir"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "pack_transcripts", + "description": ( + "Pack all per-source transcript JSONs in edit/transcripts/ into " + "takes_packed.md — the primary phrase-level reading surface for cut decisions." + ), + "parameters": { + "type": "object", + "properties": { + "edit_dir": { + "type": "string", + "description": "Edit output directory containing transcripts/ subdirectory.", + }, + "silence_threshold": { + "type": "number", + "description": "Silence gap in seconds that triggers a phrase break (default 0.5).", + }, + }, + "required": ["edit_dir"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "timeline_view", + "description": ( + "Generate a filmstrip + waveform PNG for a time range of a video. " + "Use at decision points (ambiguous pauses, retake comparison, cut-point " + "sanity checks). NOT a scan tool — call only when you need a visual check." + ), + "parameters": { + "type": "object", + "properties": { + "video_path": { + "type": "string", + "description": "Absolute path to the video file.", + }, + "start": { + "type": "number", + "description": "Start time in seconds.", + }, + "end": { + "type": "number", + "description": "End time in seconds.", + }, + "n_frames": { + "type": "integer", + "description": "Number of filmstrip frames to extract (default 8).", + }, + "transcript_path": { + "type": "string", + "description": "Optional path to a transcript JSON for word label overlay.", + }, + }, + "required": ["video_path", "start", "end"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "render", + "description": ( + "Render a video from an EDL (edit decision list JSON). " + "Runs the full pipeline: per-segment extract with grade + 30ms audio fades → " + "lossless concat → overlays (PTS-shifted) → subtitles LAST → loudnorm." + ), + "parameters": { + "type": "object", + "properties": { + "edl_path": { + "type": "string", + "description": "Absolute path to edl.json.", + }, + "output_path": { + "type": "string", + "description": "Output video path (e.g. edit/final.mp4).", + }, + "preview": { + "type": "boolean", + "description": "Preview mode: 1080p, CRF 22, faster encode.", + }, + "build_subtitles": { + "type": "boolean", + "description": "Build master.srt from transcripts + EDL timeline offsets.", + }, + "no_subtitles": { + "type": "boolean", + "description": "Skip subtitles even if the EDL references one.", + }, + "no_loudnorm": { + "type": "boolean", + "description": "Skip audio loudness normalization (default: on, -14 LUFS).", + }, + }, + "required": ["edl_path", "output_path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "grade", + "description": ( + "Apply a color grade to a video via ffmpeg filter chain. " + "Presets: subtle, neutral_punch, warm_cinematic, none. " + "Omit both preset and filter for auto mode (data-driven per-clip correction)." + ), + "parameters": { + "type": "object", + "properties": { + "input_path": { + "type": "string", + "description": "Input video path.", + }, + "output_path": { + "type": "string", + "description": "Output video path.", + }, + "preset": { + "type": "string", + "description": "Grade preset name.", + "enum": ["subtle", "neutral_punch", "warm_cinematic", "none"], + }, + "filter": { + "type": "string", + "description": "Raw ffmpeg filter string (overrides preset).", + }, + }, + "required": ["input_path", "output_path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "bash", + "description": ( + "Run a shell command. Use for ffprobe, yt-dlp, file listing, " + "ffmpeg one-offs, and other system tasks the other tools don't cover." + ), + "parameters": { + "type": "object", + "properties": { + "command": { + "type": "string", + "description": "Shell command to execute.", + }, + }, + "required": ["command"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "read_file", + "description": "Read the contents of a text file (takes_packed.md, project.md, edl.json, transcripts, etc.).", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Absolute path to the file.", + }, + }, + "required": ["path"], + }, + }, + }, + { + "type": "function", + "function": { + "name": "write_file", + "description": "Write or append content to a file (edl.json, project.md, etc.).", + "parameters": { + "type": "object", + "properties": { + "path": { + "type": "string", + "description": "Absolute path to the file.", + }, + "content": { + "type": "string", + "description": "Content to write.", + }, + "append": { + "type": "boolean", + "description": "If true, append to existing file instead of overwriting.", + }, + }, + "required": ["path", "content"], + }, + }, + }, +] + + +# --------------------------------------------------------------------------- +# Tool dispatch +# --------------------------------------------------------------------------- + + +def _run_helper(args: list[str]) -> tuple[int, str, str]: + """Run a Python helper from the helpers/ directory.""" + cmd = [sys.executable] + args + proc = subprocess.run(cmd, capture_output=True, text=True) + return proc.returncode, proc.stdout, proc.stderr + + +def _format_result(returncode: int, stdout: str, stderr: str) -> str: + parts: list[str] = [] + if stdout.strip(): + parts.append(stdout.strip()) + if returncode != 0 and stderr.strip(): + parts.append(f"[stderr]\n{stderr.strip()}") + if not parts: + parts.append("(no output)" if returncode == 0 else f"[exit {returncode}] (no output)") + if returncode != 0: + parts.insert(0, f"[exit code {returncode}]") + return "\n".join(parts) + + +def dispatch_tool( + name: str, + args: dict, + videos_dir: Path, + edit_dir: Path, +) -> tuple[str, Path | None]: + """Execute a tool call. Returns (result_text, optional_image_path).""" + + if name == "transcribe": + video_path = args["video_path"] + cmd = [str(HELPERS_DIR / "transcribe.py"), video_path] + if args.get("edit_dir"): + cmd += ["--edit-dir", args["edit_dir"]] + else: + cmd += ["--edit-dir", str(edit_dir)] + if args.get("language"): + cmd += ["--language", args["language"]] + if args.get("num_speakers"): + cmd += ["--num-speakers", str(args["num_speakers"])] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err), None + + if name == "transcribe_batch": + cmd = [str(HELPERS_DIR / "transcribe_batch.py"), args["videos_dir"]] + if args.get("edit_dir"): + cmd += ["--edit-dir", args["edit_dir"]] + if args.get("workers"): + cmd += ["--workers", str(args["workers"])] + if args.get("num_speakers"): + cmd += ["--num-speakers", str(args["num_speakers"])] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err), None + + if name == "pack_transcripts": + cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", args["edit_dir"]] + if args.get("silence_threshold") is not None: + cmd += ["--silence-threshold", str(args["silence_threshold"])] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err), None + + if name == "timeline_view": + video_path = Path(args["video_path"]) + start = args["start"] + end = args["end"] + verify_dir = edit_dir / "verify" + verify_dir.mkdir(parents=True, exist_ok=True) + out_img = verify_dir / f"timeline_{video_path.stem}_{start:.2f}_{end:.2f}.png" + cmd = [ + str(HELPERS_DIR / "timeline_view.py"), + str(video_path), + str(start), + str(end), + "-o", str(out_img), + ] + if args.get("n_frames"): + cmd += ["--n-frames", str(args["n_frames"])] + if args.get("transcript_path"): + cmd += ["--transcript", args["transcript_path"]] + rc, out, err = _run_helper(cmd) + result = _format_result(rc, out, err) + if rc == 0 and out_img.exists(): + result += f"\nImage saved to: {out_img}" + return result, out_img + return result, None + + if name == "render": + cmd = [ + str(HELPERS_DIR / "render.py"), + args["edl_path"], + "-o", args["output_path"], + ] + if args.get("preview"): + cmd.append("--preview") + if args.get("build_subtitles"): + cmd.append("--build-subtitles") + if args.get("no_subtitles"): + cmd.append("--no-subtitles") + if args.get("no_loudnorm"): + cmd.append("--no-loudnorm") + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err), None + + if name == "grade": + cmd = [ + str(HELPERS_DIR / "grade.py"), + args["input_path"], + "-o", args["output_path"], + ] + if args.get("filter"): + cmd += ["--filter", args["filter"]] + elif args.get("preset"): + cmd += ["--preset", args["preset"]] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err), None + + if name == "bash": + command = args["command"] + proc = subprocess.run( + command, + shell=True, + capture_output=True, + text=True, + timeout=300, + ) + return _format_result(proc.returncode, proc.stdout, proc.stderr), None + + if name == "read_file": + path = Path(args["path"]) + if not path.exists(): + return f"File not found: {path}", None + try: + return path.read_text(), None + except Exception as e: + return f"Error reading file: {e}", None + + if name == "write_file": + path = Path(args["path"]) + path.parent.mkdir(parents=True, exist_ok=True) + mode = "a" if args.get("append") else "w" + try: + with open(path, mode) as f: + f.write(args["content"]) + return f"Written to {path}", None + except Exception as e: + return f"Error writing file: {e}", None + + return f"Unknown tool: {name}", None + + +# --------------------------------------------------------------------------- +# Session loop +# --------------------------------------------------------------------------- + + +def _build_image_message(img_path: Path) -> dict: + b64 = base64.b64encode(img_path.read_bytes()).decode() + return { + "role": "user", + "content": [ + {"type": "text", "text": f"[Timeline view image: {img_path.name}]"}, + {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}, + ], + } + + +def run_session( + videos_dir: Path, + model: str, + endpoint: str, + max_turns: int, +) -> None: + try: + from openai import OpenAI + except ImportError: + sys.exit( + "openai package not found.\n" + "Install with: pip install -e \".[copilot]\"\n" + "or: pip install openai" + ) + + github_token = os.environ.get("GITHUB_TOKEN", "").strip() + if not github_token: + sys.exit( + "GITHUB_TOKEN is not set.\n" + "Export a Personal Access Token with the 'copilot' scope:\n" + " export GITHUB_TOKEN=github_pat_..." + ) + + client = OpenAI(base_url=endpoint, api_key=github_token) + + edit_dir = videos_dir / "edit" + edit_dir.mkdir(parents=True, exist_ok=True) + + # Build system prompt with working-directory context injected at the end + system_prompt = load_skill_prompt() + system_prompt += ( + f"\n\n## Session context\n\n" + f"- Videos directory: `{videos_dir}`\n" + f"- Edit directory: `{edit_dir}`\n" + f"- Helpers directory: `{HELPERS_DIR}`\n" + f"- All session outputs must go to `{edit_dir}/` (Hard Rule 12).\n" + ) + + messages: list[dict] = [{"role": "system", "content": system_prompt}] + + # Seed with prior session memory if available + project_md = edit_dir / "project.md" + if project_md.exists(): + prior = project_md.read_text().strip() + if prior: + messages.append({ + "role": "user", + "content": ( + f"[Prior session memory — project.md]\n\n{prior}\n\n---\n" + "I'm back. What should we pick up from or start fresh on?" + ), + }) + messages.append({ + "role": "assistant", + "content": ( + "I've reviewed the session notes above. Ready when you are — " + "just tell me what you'd like to work on." + ), + }) + + print(f"\nvideo-use — GitHub Copilot orchestrator") + print(f" model: {model}") + print(f" endpoint: {endpoint}") + print(f" videos: {videos_dir}") + print("Type your message. Enter 'exit' or press Ctrl+C to quit.\n") + + # Prompt for the first user message + try: + first_input = input("You: ").strip() + except (EOFError, KeyboardInterrupt): + print("\nBye.") + return + + if not first_input or first_input.lower() in ("exit", "quit", "q"): + print("Bye.") + return + + messages.append({"role": "user", "content": first_input}) + + turn = 0 + while turn < max_turns: + try: + response = client.chat.completions.create( + model=model, + messages=messages, + tools=TOOLS, + max_tokens=4096, + ) + except KeyboardInterrupt: + print("\n[Interrupted]") + break + except Exception as e: + print(f"\n[API error: {e}]") + break + + choice = response.choices[0] + message = choice.message + + # Serialize the assistant message back into the history + msg_dict: dict = {"role": "assistant", "content": message.content} + if message.tool_calls: + msg_dict["tool_calls"] = [ + { + "id": tc.id, + "type": "function", + "function": { + "name": tc.function.name, + "arguments": tc.function.arguments, + }, + } + for tc in message.tool_calls + ] + messages.append(msg_dict) + + if message.tool_calls: + # Execute every requested tool call + image_paths: list[Path] = [] + + for tc in message.tool_calls: + tool_name = tc.function.name + try: + tool_args = json.loads(tc.function.arguments) + except json.JSONDecodeError: + tool_args = {} + + # Pretty-print what we're doing + args_preview = ", ".join( + f"{k}={v!r}" for k, v in list(tool_args.items())[:3] + ) + print(f" [tool] {tool_name}({args_preview})", flush=True) + + result_text, image_path = dispatch_tool( + tool_name, tool_args, videos_dir, edit_dir + ) + + # Truncate very long results so we don't blow the context window + if len(result_text) > MAX_TOOL_RESULT_LENGTH: + result_text = result_text[:MAX_TOOL_RESULT_LENGTH] + "\n... [truncated]" + + messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": result_text, + }) + + if image_path and image_path.exists(): + image_paths.append(image_path) + + # Inject timeline view images as user messages so vision-capable + # models (gpt-4o, etc.) can reason about them + for img_path in image_paths: + messages.append(_build_image_message(img_path)) + + turn += 1 + continue # Let the model respond to the tool results + + # No tool calls — conversational turn + if message.content: + print(f"\nAssistant: {message.content}\n") + + if choice.finish_reason == "stop": + try: + user_input = input("You: ").strip() + except (EOFError, KeyboardInterrupt): + print("\nBye.") + break + + if not user_input or user_input.lower() in ("exit", "quit", "q"): + print("Bye.") + break + + messages.append({"role": "user", "content": user_input}) + + turn += 1 + + if turn >= max_turns: + print(f"\n[Reached max_turns={max_turns}. Session ended.]") + + +# --------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + + +def main() -> None: + ap = argparse.ArgumentParser( + description="GitHub Copilot-backed video editing orchestrator for video-use.", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=( + "Environment variables:\n" + " GITHUB_TOKEN Personal Access Token with 'copilot' scope (required)\n" + " ELEVENLABS_API_KEY ElevenLabs API key for transcription (required for transcribe tools)\n" + "\nModel options (via GitHub Copilot):\n" + " gpt-4o Default — strong reasoning, vision support\n" + " gpt-4o-mini Faster and lighter\n" + " claude-3.5-sonnet Anthropic model via Copilot\n" + " o3-mini Reasoning model\n" + "\nAlternative endpoint (GitHub Models free tier):\n" + " --endpoint https://models.inference.ai.azure.com\n" + ), + ) + ap.add_argument( + "videos_dir", + type=Path, + help="Directory containing the source video files.", + ) + ap.add_argument( + "--model", + default="gpt-4o", + help="Model identifier for the Copilot API (default: gpt-4o).", + ) + ap.add_argument( + "--endpoint", + default="https://api.githubcopilot.com", + help=( + "GitHub Copilot API base URL " + "(default: https://api.githubcopilot.com). " + "Use https://models.inference.ai.azure.com for GitHub Models." + ), + ) + ap.add_argument( + "--max-turns", + type=int, + default=100, + help="Maximum LLM turns before the session ends (default: 100).", + ) + args = ap.parse_args() + + videos_dir = args.videos_dir.resolve() + if not videos_dir.is_dir(): + sys.exit(f"Not a directory: {videos_dir}") + + run_session( + videos_dir=videos_dir, + model=args.model, + endpoint=args.endpoint, + max_turns=args.max_turns, + ) + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 1221f19..296cad6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,6 +13,7 @@ dependencies = [ [project.optional-dependencies] animations = ["manim"] +copilot = ["openai>=1.0"] [build-system] requires = ["setuptools>=61.0"] From 4f0d45a5e8e11d153f78e667dedb03292dc3a5ce Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 22 Apr 2026 08:58:58 +0000 Subject: [PATCH 2/8] feat: set claude-opus-4-7 as default model for GitHub Copilot orchestrator Agent-Logs-Url: https://github.com/Delgerskhn/video-use/sessions/e28e4d80-3925-45dc-927c-dd0e31247735 Co-authored-by: Delgerskhn <57222574+Delgerskhn@users.noreply.github.com> --- README.md | 11 +++++++---- orchestrator.py | 13 +++++++------ 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index adf9c37..1179e80 100644 --- a/README.md +++ b/README.md @@ -81,10 +81,13 @@ python /path/to/video-use/orchestrator.py /path/to/your/videos Available options: ``` ---model gpt-4o # default; also: gpt-4o-mini, claude-3.5-sonnet, o3-mini ---endpoint # default: https://api.githubcopilot.com - # GitHub Models alternative: https://models.inference.ai.azure.com ---max-turns 100 # safety cap on LLM turns (default: 100) +--model claude-opus-4-7 # default — Anthropic Claude Opus 4.7 (strong reasoning + vision) +--model gpt-4o # OpenAI GPT-4o alternative +--model claude-sonnet-4-5 # faster Anthropic option +--model gpt-4o-mini # fastest/lightest option +--endpoint # default: https://api.githubcopilot.com + # GitHub Models alternative: https://models.inference.ai.azure.com +--max-turns 100 # safety cap on LLM turns (default: 100) ``` And in the session: diff --git a/orchestrator.py b/orchestrator.py index 964fd2d..57fecc7 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -691,10 +691,11 @@ def main() -> None: " GITHUB_TOKEN Personal Access Token with 'copilot' scope (required)\n" " ELEVENLABS_API_KEY ElevenLabs API key for transcription (required for transcribe tools)\n" "\nModel options (via GitHub Copilot):\n" - " gpt-4o Default — strong reasoning, vision support\n" - " gpt-4o-mini Faster and lighter\n" - " claude-3.5-sonnet Anthropic model via Copilot\n" - " o3-mini Reasoning model\n" + " claude-opus-4-7 Default — Anthropic Claude Opus 4.7, strong reasoning + vision\n" + " claude-sonnet-4-5 Anthropic Claude Sonnet 4.5 — faster, lighter\n" + " gpt-4o OpenAI GPT-4o — strong reasoning, vision support\n" + " gpt-4o-mini OpenAI GPT-4o mini — fastest OpenAI option\n" + " o3-mini OpenAI o3-mini — reasoning model\n" "\nAlternative endpoint (GitHub Models free tier):\n" " --endpoint https://models.inference.ai.azure.com\n" ), @@ -706,8 +707,8 @@ def main() -> None: ) ap.add_argument( "--model", - default="gpt-4o", - help="Model identifier for the Copilot API (default: gpt-4o).", + default="claude-opus-4-7", + help="Model identifier for the Copilot API (default: claude-opus-4-7).", ) ap.add_argument( "--endpoint", From 6ddc103b61302f7c72ffe9ea167efaf2abf07751 Mon Sep 17 00:00:00 2001 From: delgerskhn Date: Wed, 22 Apr 2026 17:52:26 +0800 Subject: [PATCH 3/8] fix: address PR review comments\n\n- Load .env automatically for GITHUB_TOKEN (matches transcribe.py pattern)\n- Disable bash tool by default; add --enable-bash flag for explicit opt-in\n- Return tool error on JSON parse failure instead of silently using {}\n- Default pack_transcripts edit_dir to session edit_dir when not provided\n- Sandbox read_file/write_file to videos_dir/edit_dir (path traversal fix)\n- Print message.content before executing tool calls so user sees it\n- Lock transcribe_batch to session videos_dir, ignore model-provided path\n- Downscale timeline images via ffmpeg before base64 embedding (max 1.5 MB)\n- Update .env.example to note that orchestrator loads .env automatically" --- .env.example | 3 +- orchestrator.py | 112 ++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 106 insertions(+), 9 deletions(-) diff --git a/.env.example b/.env.example index f300000..f7de35e 100644 --- a/.env.example +++ b/.env.example @@ -2,5 +2,6 @@ ELEVENLABS_API_KEY= # Required for the GitHub Copilot orchestrator (orchestrator.py). # Create a Personal Access Token at https://github.com/settings/tokens -# with the 'copilot' scope, then paste it here or export it in your shell. +# with the 'copilot' scope, then paste it here. The orchestrator loads +# this file automatically (like the transcription helpers do). GITHUB_TOKEN= diff --git a/orchestrator.py b/orchestrator.py index 57fecc7..627929b 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -37,6 +37,23 @@ HELPERS_DIR = REPO_ROOT / "helpers" SKILL_MD = REPO_ROOT / "SKILL.md" + +def _load_env_file() -> None: + """Load key=value pairs from .env into os.environ (does not overwrite existing vars).""" + for candidate in [REPO_ROOT / ".env", Path(".env")]: + if candidate.exists(): + for line in candidate.read_text().splitlines(): + line = line.strip() + if not line or line.startswith("#") or "=" not in line: + continue + k, v = line.split("=", 1) + k = k.strip() + v = v.strip().strip('"').strip("'") + if k and k not in os.environ: + os.environ[k] = v + break + + # --------------------------------------------------------------------------- # System prompt # --------------------------------------------------------------------------- @@ -353,11 +370,21 @@ def _format_result(returncode: int, stdout: str, stderr: str) -> str: return "\n".join(parts) +def _is_under(path: Path, parent: Path) -> bool: + """Return True if *path* is the same as or nested under *parent*.""" + try: + path.relative_to(parent.resolve()) + return True + except ValueError: + return False + + def dispatch_tool( name: str, args: dict, videos_dir: Path, edit_dir: Path, + enable_bash: bool = False, ) -> tuple[str, Path | None]: """Execute a tool call. Returns (result_text, optional_image_path).""" @@ -376,7 +403,8 @@ def dispatch_tool( return _format_result(rc, out, err), None if name == "transcribe_batch": - cmd = [str(HELPERS_DIR / "transcribe_batch.py"), args["videos_dir"]] + # Always use the session videos_dir to prevent operating outside it + cmd = [str(HELPERS_DIR / "transcribe_batch.py"), str(videos_dir)] if args.get("edit_dir"): cmd += ["--edit-dir", args["edit_dir"]] if args.get("workers"): @@ -387,7 +415,7 @@ def dispatch_tool( return _format_result(rc, out, err), None if name == "pack_transcripts": - cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", args["edit_dir"]] + cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", args.get("edit_dir") or str(edit_dir)] if args.get("silence_threshold") is not None: cmd += ["--silence-threshold", str(args["silence_threshold"])] rc, out, err = _run_helper(cmd) @@ -449,6 +477,12 @@ def dispatch_tool( return _format_result(rc, out, err), None if name == "bash": + if not enable_bash: + return ( + "[bash tool is disabled by default. Restart the orchestrator with " + "--enable-bash to allow shell commands.]", + None, + ) command = args["command"] proc = subprocess.run( command, @@ -460,7 +494,9 @@ def dispatch_tool( return _format_result(proc.returncode, proc.stdout, proc.stderr), None if name == "read_file": - path = Path(args["path"]) + path = Path(args["path"]).resolve() + if not (_is_under(path, videos_dir) or _is_under(path, edit_dir)): + return f"Access denied: path must be under {videos_dir} or {edit_dir}", None if not path.exists(): return f"File not found: {path}", None try: @@ -469,7 +505,9 @@ def dispatch_tool( return f"Error reading file: {e}", None if name == "write_file": - path = Path(args["path"]) + path = Path(args["path"]).resolve() + if not _is_under(path, edit_dir): + return f"Access denied: write path must be under {edit_dir}", None path.parent.mkdir(parents=True, exist_ok=True) mode = "a" if args.get("append") else "w" try: @@ -487,13 +525,47 @@ def dispatch_tool( # --------------------------------------------------------------------------- +# Maximum image size (bytes) to embed in chat; larger images are downscaled first. +MAX_IMAGE_BYTES = 1_500_000 # 1.5 MB + + def _build_image_message(img_path: Path) -> dict: - b64 = base64.b64encode(img_path.read_bytes()).decode() + """Embed a timeline image as a base64 data URL, downscaling via ffmpeg if needed.""" + raw = img_path.read_bytes() + mime = "image/png" + if len(raw) > MAX_IMAGE_BYTES: + with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: + tmp_path = Path(tmp.name) + try: + subprocess.run( + [ + "ffmpeg", "-y", "-i", str(img_path), + "-vf", "scale='min(960,iw)':-2", + str(tmp_path), + ], + capture_output=True, + check=False, + ) + raw = tmp_path.read_bytes() + mime = "image/jpeg" + except Exception: + pass + finally: + tmp_path.unlink(missing_ok=True) + if len(raw) > MAX_IMAGE_BYTES: + return { + "role": "user", + "content": ( + f"[Timeline image too large to embed ({len(raw):,} bytes); " + f"saved to: {img_path}]" + ), + } + b64 = base64.b64encode(raw).decode() return { "role": "user", "content": [ {"type": "text", "text": f"[Timeline view image: {img_path.name}]"}, - {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}, + {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}, ], } @@ -503,6 +575,7 @@ def run_session( model: str, endpoint: str, max_turns: int, + enable_bash: bool = False, ) -> None: try: from openai import OpenAI @@ -513,6 +586,7 @@ def run_session( "or: pip install openai" ) + _load_env_file() github_token = os.environ.get("GITHUB_TOKEN", "").strip() if not github_token: sys.exit( @@ -613,6 +687,10 @@ def run_session( messages.append(msg_dict) if message.tool_calls: + # Show any explanation the model included alongside the tool calls + if message.content: + print(f"\nAssistant: {message.content}\n") + # Execute every requested tool call image_paths: list[Path] = [] @@ -621,7 +699,15 @@ def run_session( try: tool_args = json.loads(tc.function.arguments) except json.JSONDecodeError: - tool_args = {} + messages.append({ + "role": "tool", + "tool_call_id": tc.id, + "content": ( + f"[Invalid JSON in tool arguments — could not parse. " + f"Raw arguments: {tc.function.arguments!r}. Please retry with valid JSON.]" + ), + }) + continue # Pretty-print what we're doing args_preview = ", ".join( @@ -630,7 +716,7 @@ def run_session( print(f" [tool] {tool_name}({args_preview})", flush=True) result_text, image_path = dispatch_tool( - tool_name, tool_args, videos_dir, edit_dir + tool_name, tool_args, videos_dir, edit_dir, enable_bash=enable_bash ) # Truncate very long results so we don't blow the context window @@ -725,6 +811,15 @@ def main() -> None: default=100, help="Maximum LLM turns before the session ends (default: 100).", ) + ap.add_argument( + "--enable-bash", + action="store_true", + default=False, + help=( + "Enable the bash tool (disabled by default). " + "Only enable when you trust the model and understand the security implications." + ), + ) args = ap.parse_args() videos_dir = args.videos_dir.resolve() @@ -736,6 +831,7 @@ def main() -> None: model=args.model, endpoint=args.endpoint, max_turns=args.max_turns, + enable_bash=args.enable_bash, ) From adf06574f74a8e26e4c5811547ac8d70c80853f2 Mon Sep 17 00:00:00 2001 From: delgerskhn Date: Wed, 22 Apr 2026 19:25:40 +0800 Subject: [PATCH 4/8] feat: migrate orchestrator to GitHub Copilot SDK MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the manual OpenAI REST API orchestration loop with the official GitHub Copilot SDK (github-copilot-sdk). Key changes: orchestrator.py: - Drop openai dependency; use CopilotClient + define_tool from copilot SDK - SDK spawns the Copilot CLI subprocess automatically — no separate install - Tools defined with @define_tool + Pydantic models instead of raw JSON schemas - SDK manages session history, context compaction (infinite sessions) natively - Authentication via copilot auth login OR GITHUB_TOKEN — SDK handles both - Shell tool disabled by default via on_permission_request handler (--enable-shell flag to opt in), replacing the previous --enable-bash flag - File writes sandboxed to edit_dir via the same permission handler - pack_transcripts and transcribe_batch always use session edit_dir/videos_dir - Timeline images attached via SDK blob attachment API instead of manual base64 embedding in message history - Async rewrite: asyncio.run() entry point, async tool handlers - Remove --endpoint flag (no longer needed — SDK manages the CLI subprocess) - Remove manual JSON parse error handling (SDK handles protocol errors) - Increase default max_turns from 100 to 200 pyproject.toml: - copilot extras: openai>=1.0 → github-copilot-sdk + pydantic>=2.0 .env.example: - Document both auth options: copilot auth login vs GITHUB_TOKEN README.md: - Update Option B setup steps for Copilot SDK - Add copilot auth login as recommended auth method - Update model list to current Copilot CLI models - Remove --endpoint option, add --enable-shell, /model mid-session note --- .env.example | 10 +- README.md | 30 +- orchestrator.py | 1054 +++++++++++++++++++---------------------------- pyproject.toml | 2 +- 4 files changed, 443 insertions(+), 653 deletions(-) diff --git a/.env.example b/.env.example index f7de35e..9fa52cf 100644 --- a/.env.example +++ b/.env.example @@ -1,7 +1,11 @@ ELEVENLABS_API_KEY= # Required for the GitHub Copilot orchestrator (orchestrator.py). -# Create a Personal Access Token at https://github.com/settings/tokens -# with the 'copilot' scope, then paste it here. The orchestrator loads -# this file automatically (like the transcription helpers do). +# +# Option A — browser login (recommended, no token needed): +# copilot auth login +# +# Option B — Personal Access Token: +# Create one at https://github.com/settings/tokens with the 'copilot' scope, +# then paste it below. The orchestrator loads this file automatically. GITHUB_TOKEN= diff --git a/README.md b/README.md index 1179e80..e89fbe5 100644 --- a/README.md +++ b/README.md @@ -52,23 +52,27 @@ claude ### Option B — GitHub Copilot (no Anthropic key required) Uses your existing GitHub Copilot subscription as the LLM backend via the -OpenAI-compatible Copilot API. Same pipeline, same production rules, same helpers. +[GitHub Copilot SDK](https://github.com/github/copilot-sdk). The SDK bundles the +Copilot CLI automatically — no separate CLI install needed. Same pipeline, same +production rules, same helpers. ```bash # 1. Clone the repo git clone https://github.com/browser-use/video-use cd video-use -# 2. Install deps (includes the openai SDK) +# 2. Install deps (includes the Copilot SDK) pip install -e ".[copilot]" brew install ffmpeg # required brew install yt-dlp # optional -# 3. Configure API keys +# 3. Authenticate — pick one: +copilot auth login # Option A: browser login (recommended, no token needed) +# — OR — cp .env.example .env $EDITOR .env # ELEVENLABS_API_KEY=... ← for transcription (same as before) -# GITHUB_TOKEN=... ← PAT with 'copilot' scope +# GITHUB_TOKEN=... ← PAT with 'copilot' scope (option B) # https://github.com/settings/tokens ``` @@ -81,15 +85,19 @@ python /path/to/video-use/orchestrator.py /path/to/your/videos Available options: ``` ---model claude-opus-4-7 # default — Anthropic Claude Opus 4.7 (strong reasoning + vision) ---model gpt-4o # OpenAI GPT-4o alternative ---model claude-sonnet-4-5 # faster Anthropic option ---model gpt-4o-mini # fastest/lightest option ---endpoint # default: https://api.githubcopilot.com - # GitHub Models alternative: https://models.inference.ai.azure.com ---max-turns 100 # safety cap on LLM turns (default: 100) +# Model (omit to let Copilot auto-select — recommended) +--model claude-opus-4.5 # Anthropic Claude Opus 4.5 — complex tasks, deep reasoning +--model claude-sonnet-4.5 # Anthropic Claude Sonnet 4.5 — faster, most routine tasks +--model gpt-5 # OpenAI GPT-5 +--model gpt-4.1 # OpenAI GPT-4.1 + +# Other flags +--enable-shell # enable built-in shell tool (off by default for safety) +--max-turns 200 # safety cap on interactive turns (default: 200) ``` +You can also switch models mid-session with `/model` at the prompt. + And in the session: > edit these into a launch video diff --git a/orchestrator.py b/orchestrator.py index 627929b..64b1a89 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -2,32 +2,36 @@ """GitHub Copilot-backed video editing orchestrator for video-use. Replaces the `claude` CLI runtime with a standalone Python script that drives -the same video editing pipeline using the GitHub Copilot API (OpenAI-compatible). -All 12 hard production rules from SKILL.md are enforced via the same system -prompt — no logic changes to the skill or helpers are needed. +the same video editing pipeline using the GitHub Copilot SDK. The SDK spawns +the Copilot CLI as a subprocess automatically — no separate CLI install needed. + +All 12 hard production rules from SKILL.md are enforced via the system prompt. +No logic changes to the skill or helpers are required. Requirements: - pip install -e ".[copilot]" # openai>=1.0 - export GITHUB_TOKEN= # PAT with `copilot` scope - ELEVENLABS_API_KEY=... in .env # for transcription (same as before) + pip install -e ".[copilot]" # github-copilot-sdk + pydantic + GITHUB_TOKEN=... in .env # PAT with 'copilot' scope + OR run `copilot auth login` once # sign in via browser (no token needed) + ELEVENLABS_API_KEY=... in .env # for transcription ffmpeg and ffprobe on PATH Usage: python orchestrator.py /path/to/videos - python orchestrator.py /path/to/videos --model gpt-4o - python orchestrator.py /path/to/videos --endpoint https://models.inference.ai.azure.com + python orchestrator.py /path/to/videos --model claude-sonnet-4.5 + python orchestrator.py /path/to/videos --enable-shell """ from __future__ import annotations import argparse +import asyncio import base64 -import json import os import subprocess import sys import tempfile from pathlib import Path +from typing import Optional # --------------------------------------------------------------------------- # Repo-relative paths @@ -70,283 +74,28 @@ def load_skill_prompt() -> str: # Maximum characters returned from a single tool call before truncation. -# Keeps large files (packed transcripts, long ffmpeg logs) from consuming -# the entire context window. MAX_TOOL_RESULT_LENGTH = 20_000 +# Maximum image size (bytes) to embed; larger images are downscaled first. +MAX_IMAGE_BYTES = 1_500_000 # 1.5 MB + + # --------------------------------------------------------------------------- -# Tool schemas (OpenAI function-calling format) +# Path sandbox helper # --------------------------------------------------------------------------- -TOOLS: list[dict] = [ - { - "type": "function", - "function": { - "name": "transcribe", - "description": ( - "Transcribe a single video with ElevenLabs Scribe. " - "Writes word-level transcript JSON to edit/transcripts/.json. " - "Cached — skips upload if the JSON already exists." - ), - "parameters": { - "type": "object", - "properties": { - "video_path": { - "type": "string", - "description": "Absolute path to the video file.", - }, - "edit_dir": { - "type": "string", - "description": "Edit output directory. Defaults to /edit.", - }, - "language": { - "type": "string", - "description": "ISO language code (e.g. 'en'). Omit to auto-detect.", - }, - "num_speakers": { - "type": "integer", - "description": "Number of speakers. Improves diarization when known.", - }, - }, - "required": ["video_path"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "transcribe_batch", - "description": ( - "Batch-transcribe every video in a directory using parallel workers. " - "Cached per source — already-transcribed files are skipped." - ), - "parameters": { - "type": "object", - "properties": { - "videos_dir": { - "type": "string", - "description": "Directory containing source videos.", - }, - "workers": { - "type": "integer", - "description": "Parallel workers (default 4).", - }, - "edit_dir": { - "type": "string", - "description": "Override edit output directory.", - }, - "num_speakers": { - "type": "integer", - "description": "Number of speakers (optional).", - }, - }, - "required": ["videos_dir"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "pack_transcripts", - "description": ( - "Pack all per-source transcript JSONs in edit/transcripts/ into " - "takes_packed.md — the primary phrase-level reading surface for cut decisions." - ), - "parameters": { - "type": "object", - "properties": { - "edit_dir": { - "type": "string", - "description": "Edit output directory containing transcripts/ subdirectory.", - }, - "silence_threshold": { - "type": "number", - "description": "Silence gap in seconds that triggers a phrase break (default 0.5).", - }, - }, - "required": ["edit_dir"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "timeline_view", - "description": ( - "Generate a filmstrip + waveform PNG for a time range of a video. " - "Use at decision points (ambiguous pauses, retake comparison, cut-point " - "sanity checks). NOT a scan tool — call only when you need a visual check." - ), - "parameters": { - "type": "object", - "properties": { - "video_path": { - "type": "string", - "description": "Absolute path to the video file.", - }, - "start": { - "type": "number", - "description": "Start time in seconds.", - }, - "end": { - "type": "number", - "description": "End time in seconds.", - }, - "n_frames": { - "type": "integer", - "description": "Number of filmstrip frames to extract (default 8).", - }, - "transcript_path": { - "type": "string", - "description": "Optional path to a transcript JSON for word label overlay.", - }, - }, - "required": ["video_path", "start", "end"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "render", - "description": ( - "Render a video from an EDL (edit decision list JSON). " - "Runs the full pipeline: per-segment extract with grade + 30ms audio fades → " - "lossless concat → overlays (PTS-shifted) → subtitles LAST → loudnorm." - ), - "parameters": { - "type": "object", - "properties": { - "edl_path": { - "type": "string", - "description": "Absolute path to edl.json.", - }, - "output_path": { - "type": "string", - "description": "Output video path (e.g. edit/final.mp4).", - }, - "preview": { - "type": "boolean", - "description": "Preview mode: 1080p, CRF 22, faster encode.", - }, - "build_subtitles": { - "type": "boolean", - "description": "Build master.srt from transcripts + EDL timeline offsets.", - }, - "no_subtitles": { - "type": "boolean", - "description": "Skip subtitles even if the EDL references one.", - }, - "no_loudnorm": { - "type": "boolean", - "description": "Skip audio loudness normalization (default: on, -14 LUFS).", - }, - }, - "required": ["edl_path", "output_path"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "grade", - "description": ( - "Apply a color grade to a video via ffmpeg filter chain. " - "Presets: subtle, neutral_punch, warm_cinematic, none. " - "Omit both preset and filter for auto mode (data-driven per-clip correction)." - ), - "parameters": { - "type": "object", - "properties": { - "input_path": { - "type": "string", - "description": "Input video path.", - }, - "output_path": { - "type": "string", - "description": "Output video path.", - }, - "preset": { - "type": "string", - "description": "Grade preset name.", - "enum": ["subtle", "neutral_punch", "warm_cinematic", "none"], - }, - "filter": { - "type": "string", - "description": "Raw ffmpeg filter string (overrides preset).", - }, - }, - "required": ["input_path", "output_path"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "bash", - "description": ( - "Run a shell command. Use for ffprobe, yt-dlp, file listing, " - "ffmpeg one-offs, and other system tasks the other tools don't cover." - ), - "parameters": { - "type": "object", - "properties": { - "command": { - "type": "string", - "description": "Shell command to execute.", - }, - }, - "required": ["command"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "read_file", - "description": "Read the contents of a text file (takes_packed.md, project.md, edl.json, transcripts, etc.).", - "parameters": { - "type": "object", - "properties": { - "path": { - "type": "string", - "description": "Absolute path to the file.", - }, - }, - "required": ["path"], - }, - }, - }, - { - "type": "function", - "function": { - "name": "write_file", - "description": "Write or append content to a file (edl.json, project.md, etc.).", - "parameters": { - "type": "object", - "properties": { - "path": { - "type": "string", - "description": "Absolute path to the file.", - }, - "content": { - "type": "string", - "description": "Content to write.", - }, - "append": { - "type": "boolean", - "description": "If true, append to existing file instead of overwriting.", - }, - }, - "required": ["path", "content"], - }, - }, - }, -] + +def _is_under(path: Path, parent: Path) -> bool: + """Return True if *path* is the same as or nested under *parent*.""" + try: + path.resolve().relative_to(parent.resolve()) + return True + except ValueError: + return False # --------------------------------------------------------------------------- -# Tool dispatch +# Helpers runner # --------------------------------------------------------------------------- @@ -367,172 +116,22 @@ def _format_result(returncode: int, stdout: str, stderr: str) -> str: parts.append("(no output)" if returncode == 0 else f"[exit {returncode}] (no output)") if returncode != 0: parts.insert(0, f"[exit code {returncode}]") - return "\n".join(parts) - - -def _is_under(path: Path, parent: Path) -> bool: - """Return True if *path* is the same as or nested under *parent*.""" - try: - path.relative_to(parent.resolve()) - return True - except ValueError: - return False - - -def dispatch_tool( - name: str, - args: dict, - videos_dir: Path, - edit_dir: Path, - enable_bash: bool = False, -) -> tuple[str, Path | None]: - """Execute a tool call. Returns (result_text, optional_image_path).""" - - if name == "transcribe": - video_path = args["video_path"] - cmd = [str(HELPERS_DIR / "transcribe.py"), video_path] - if args.get("edit_dir"): - cmd += ["--edit-dir", args["edit_dir"]] - else: - cmd += ["--edit-dir", str(edit_dir)] - if args.get("language"): - cmd += ["--language", args["language"]] - if args.get("num_speakers"): - cmd += ["--num-speakers", str(args["num_speakers"])] - rc, out, err = _run_helper(cmd) - return _format_result(rc, out, err), None - - if name == "transcribe_batch": - # Always use the session videos_dir to prevent operating outside it - cmd = [str(HELPERS_DIR / "transcribe_batch.py"), str(videos_dir)] - if args.get("edit_dir"): - cmd += ["--edit-dir", args["edit_dir"]] - if args.get("workers"): - cmd += ["--workers", str(args["workers"])] - if args.get("num_speakers"): - cmd += ["--num-speakers", str(args["num_speakers"])] - rc, out, err = _run_helper(cmd) - return _format_result(rc, out, err), None - - if name == "pack_transcripts": - cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", args.get("edit_dir") or str(edit_dir)] - if args.get("silence_threshold") is not None: - cmd += ["--silence-threshold", str(args["silence_threshold"])] - rc, out, err = _run_helper(cmd) - return _format_result(rc, out, err), None - - if name == "timeline_view": - video_path = Path(args["video_path"]) - start = args["start"] - end = args["end"] - verify_dir = edit_dir / "verify" - verify_dir.mkdir(parents=True, exist_ok=True) - out_img = verify_dir / f"timeline_{video_path.stem}_{start:.2f}_{end:.2f}.png" - cmd = [ - str(HELPERS_DIR / "timeline_view.py"), - str(video_path), - str(start), - str(end), - "-o", str(out_img), - ] - if args.get("n_frames"): - cmd += ["--n-frames", str(args["n_frames"])] - if args.get("transcript_path"): - cmd += ["--transcript", args["transcript_path"]] - rc, out, err = _run_helper(cmd) - result = _format_result(rc, out, err) - if rc == 0 and out_img.exists(): - result += f"\nImage saved to: {out_img}" - return result, out_img - return result, None - - if name == "render": - cmd = [ - str(HELPERS_DIR / "render.py"), - args["edl_path"], - "-o", args["output_path"], - ] - if args.get("preview"): - cmd.append("--preview") - if args.get("build_subtitles"): - cmd.append("--build-subtitles") - if args.get("no_subtitles"): - cmd.append("--no-subtitles") - if args.get("no_loudnorm"): - cmd.append("--no-loudnorm") - rc, out, err = _run_helper(cmd) - return _format_result(rc, out, err), None - - if name == "grade": - cmd = [ - str(HELPERS_DIR / "grade.py"), - args["input_path"], - "-o", args["output_path"], - ] - if args.get("filter"): - cmd += ["--filter", args["filter"]] - elif args.get("preset"): - cmd += ["--preset", args["preset"]] - rc, out, err = _run_helper(cmd) - return _format_result(rc, out, err), None - - if name == "bash": - if not enable_bash: - return ( - "[bash tool is disabled by default. Restart the orchestrator with " - "--enable-bash to allow shell commands.]", - None, - ) - command = args["command"] - proc = subprocess.run( - command, - shell=True, - capture_output=True, - text=True, - timeout=300, - ) - return _format_result(proc.returncode, proc.stdout, proc.stderr), None - - if name == "read_file": - path = Path(args["path"]).resolve() - if not (_is_under(path, videos_dir) or _is_under(path, edit_dir)): - return f"Access denied: path must be under {videos_dir} or {edit_dir}", None - if not path.exists(): - return f"File not found: {path}", None - try: - return path.read_text(), None - except Exception as e: - return f"Error reading file: {e}", None - - if name == "write_file": - path = Path(args["path"]).resolve() - if not _is_under(path, edit_dir): - return f"Access denied: write path must be under {edit_dir}", None - path.parent.mkdir(parents=True, exist_ok=True) - mode = "a" if args.get("append") else "w" - try: - with open(path, mode) as f: - f.write(args["content"]) - return f"Written to {path}", None - except Exception as e: - return f"Error writing file: {e}", None - - return f"Unknown tool: {name}", None + result = "\n".join(parts) + if len(result) > MAX_TOOL_RESULT_LENGTH: + result = result[:MAX_TOOL_RESULT_LENGTH] + "\n... [truncated]" + return result # --------------------------------------------------------------------------- -# Session loop +# Image attachment helper # --------------------------------------------------------------------------- -# Maximum image size (bytes) to embed in chat; larger images are downscaled first. -MAX_IMAGE_BYTES = 1_500_000 # 1.5 MB - - -def _build_image_message(img_path: Path) -> dict: - """Embed a timeline image as a base64 data URL, downscaling via ffmpeg if needed.""" +def _prepare_image_attachment(img_path: Path) -> dict: + """Return a blob attachment dict, downscaling via ffmpeg if > MAX_IMAGE_BYTES.""" raw = img_path.read_bytes() mime = "image/png" + if len(raw) > MAX_IMAGE_BYTES: with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: tmp_path = Path(tmp.name) @@ -552,212 +151,397 @@ def _build_image_message(img_path: Path) -> dict: pass finally: tmp_path.unlink(missing_ok=True) + if len(raw) > MAX_IMAGE_BYTES: - return { - "role": "user", - "content": ( - f"[Timeline image too large to embed ({len(raw):,} bytes); " - f"saved to: {img_path}]" - ), - } - b64 = base64.b64encode(raw).decode() + return {} # too large — caller will skip attachment + return { - "role": "user", - "content": [ - {"type": "text", "text": f"[Timeline view image: {img_path.name}]"}, - {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}, - ], + "type": "blob", + "data": base64.b64encode(raw).decode(), + "mimeType": mime, } -def run_session( +# --------------------------------------------------------------------------- +# Session loop +# --------------------------------------------------------------------------- + + +async def run_session( videos_dir: Path, model: str, - endpoint: str, + enable_shell: bool, max_turns: int, - enable_bash: bool = False, ) -> None: try: - from openai import OpenAI + from copilot import CopilotClient, SubprocessConfig, define_tool + from copilot.session import PermissionRequestResult + from pydantic import BaseModel, Field except ImportError: sys.exit( - "openai package not found.\n" + "Required packages not found.\n" "Install with: pip install -e \".[copilot]\"\n" - "or: pip install openai" + "or: pip install github-copilot-sdk pydantic" ) _load_env_file() - github_token = os.environ.get("GITHUB_TOKEN", "").strip() - if not github_token: - sys.exit( - "GITHUB_TOKEN is not set.\n" - "Export a Personal Access Token with the 'copilot' scope:\n" - " export GITHUB_TOKEN=github_pat_..." - ) - - client = OpenAI(base_url=endpoint, api_key=github_token) edit_dir = videos_dir / "edit" edit_dir.mkdir(parents=True, exist_ok=True) - # Build system prompt with working-directory context injected at the end - system_prompt = load_skill_prompt() - system_prompt += ( - f"\n\n## Session context\n\n" - f"- Videos directory: `{videos_dir}`\n" - f"- Edit directory: `{edit_dir}`\n" - f"- Helpers directory: `{HELPERS_DIR}`\n" - f"- All session outputs must go to `{edit_dir}/` (Hard Rule 12).\n" + # ------------------------------------------------------------------ + # Tool parameter models + # ------------------------------------------------------------------ + + class TranscribeParams(BaseModel): + video_path: str = Field(description="Absolute path to the video file.") + language: Optional[str] = Field(default=None, description="ISO language code (e.g. 'en'). Omit to auto-detect.") + num_speakers: Optional[int] = Field(default=None, description="Number of speakers for diarization.") + + class TranscribeBatchParams(BaseModel): + workers: Optional[int] = Field(default=None, description="Parallel workers (default 4).") + num_speakers: Optional[int] = Field(default=None, description="Number of speakers (optional).") + + class PackTranscriptsParams(BaseModel): + silence_threshold: Optional[float] = Field(default=None, description="Silence gap in seconds that triggers a phrase break (default 0.5).") + + class TimelineViewParams(BaseModel): + video_path: str = Field(description="Absolute path to the video file.") + start: float = Field(description="Start time in seconds.") + end: float = Field(description="End time in seconds.") + n_frames: Optional[int] = Field(default=None, description="Number of filmstrip frames to extract (default 8).") + transcript_path: Optional[str] = Field(default=None, description="Optional path to transcript JSON for word label overlay.") + + class RenderParams(BaseModel): + edl_path: str = Field(description="Absolute path to edl.json.") + output_path: str = Field(description="Output video path (e.g. edit/final.mp4).") + preview: Optional[bool] = Field(default=None, description="Preview mode: 1080p, CRF 22, faster encode.") + build_subtitles: Optional[bool] = Field(default=None, description="Build master.srt from transcripts + EDL timeline offsets.") + no_subtitles: Optional[bool] = Field(default=None, description="Skip subtitles even if the EDL references one.") + no_loudnorm: Optional[bool] = Field(default=None, description="Skip audio loudness normalization.") + + class GradeParams(BaseModel): + input_path: str = Field(description="Input video path.") + output_path: str = Field(description="Output video path.") + preset: Optional[str] = Field(default=None, description="Grade preset: subtle, neutral_punch, warm_cinematic, none.") + filter: Optional[str] = Field(default=None, description="Raw ffmpeg filter string (overrides preset).") + + # ------------------------------------------------------------------ + # Tool implementations + # ------------------------------------------------------------------ + + @define_tool( + description=( + "Transcribe a single video with ElevenLabs Scribe. " + "Writes word-level transcript JSON to edit/transcripts/.json. " + "Cached — skips upload if the JSON already exists." + ), + skip_permission=True, + ) + async def transcribe(params: TranscribeParams) -> str: + cmd = [str(HELPERS_DIR / "transcribe.py"), params.video_path] + cmd += ["--edit-dir", str(edit_dir)] + if params.language: + cmd += ["--language", params.language] + if params.num_speakers: + cmd += ["--num-speakers", str(params.num_speakers)] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err) + + @define_tool( + description=( + "Batch-transcribe every video in the session videos directory using parallel workers. " + "Cached per source — already-transcribed files are skipped." + ), + skip_permission=True, ) + async def transcribe_batch(params: TranscribeBatchParams) -> str: + # Always use the session videos_dir — model cannot redirect this elsewhere + cmd = [str(HELPERS_DIR / "transcribe_batch.py"), str(videos_dir)] + cmd += ["--edit-dir", str(edit_dir)] + if params.workers: + cmd += ["--workers", str(params.workers)] + if params.num_speakers: + cmd += ["--num-speakers", str(params.num_speakers)] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err) - messages: list[dict] = [{"role": "system", "content": system_prompt}] + @define_tool( + description=( + "Pack all per-source transcript JSONs in edit/transcripts/ into " + "takes_packed.md — the primary phrase-level reading surface for cut decisions." + ), + skip_permission=True, + ) + async def pack_transcripts(params: PackTranscriptsParams) -> str: + cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", str(edit_dir)] + if params.silence_threshold is not None: + cmd += ["--silence-threshold", str(params.silence_threshold)] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err) - # Seed with prior session memory if available - project_md = edit_dir / "project.md" - if project_md.exists(): - prior = project_md.read_text().strip() - if prior: - messages.append({ - "role": "user", - "content": ( - f"[Prior session memory — project.md]\n\n{prior}\n\n---\n" - "I'm back. What should we pick up from or start fresh on?" - ), - }) - messages.append({ - "role": "assistant", - "content": ( - "I've reviewed the session notes above. Ready when you are — " - "just tell me what you'd like to work on." - ), - }) - - print(f"\nvideo-use — GitHub Copilot orchestrator") - print(f" model: {model}") - print(f" endpoint: {endpoint}") - print(f" videos: {videos_dir}") - print("Type your message. Enter 'exit' or press Ctrl+C to quit.\n") + # Side-channel for the last timeline image path so it can be attached in the + # next user message (the SDK attachment API goes on session.send, not tool results). + _pending_images: list[Path] = [] - # Prompt for the first user message - try: - first_input = input("You: ").strip() - except (EOFError, KeyboardInterrupt): - print("\nBye.") - return + @define_tool( + description=( + "Generate a filmstrip + waveform PNG for a time range of a video. " + "Use at decision points (ambiguous pauses, retake comparison, cut-point " + "sanity checks). NOT a scan tool — call only when you need a visual check." + ), + skip_permission=True, + ) + async def timeline_view(params: TimelineViewParams) -> str: + video_path = Path(params.video_path) + verify_dir = edit_dir / "verify" + verify_dir.mkdir(parents=True, exist_ok=True) + out_img = verify_dir / f"timeline_{video_path.stem}_{params.start:.2f}_{params.end:.2f}.png" + cmd = [ + str(HELPERS_DIR / "timeline_view.py"), + str(video_path), + str(params.start), + str(params.end), + "-o", str(out_img), + ] + if params.n_frames: + cmd += ["--n-frames", str(params.n_frames)] + if params.transcript_path: + cmd += ["--transcript", params.transcript_path] + rc, out, err = _run_helper(cmd) + result = _format_result(rc, out, err) + if rc == 0 and out_img.exists(): + _pending_images.append(out_img) + result += f"\nImage saved to: {out_img} (will be attached to your next reply)" + return result + + @define_tool( + description=( + "Render a video from an EDL (edit decision list JSON). " + "Runs the full pipeline: per-segment extract with grade + 30ms audio fades → " + "lossless concat → overlays → subtitles LAST → loudnorm." + ), + skip_permission=True, + ) + async def render(params: RenderParams) -> str: + cmd = [ + str(HELPERS_DIR / "render.py"), + params.edl_path, + "-o", params.output_path, + ] + if params.preview: + cmd.append("--preview") + if params.build_subtitles: + cmd.append("--build-subtitles") + if params.no_subtitles: + cmd.append("--no-subtitles") + if params.no_loudnorm: + cmd.append("--no-loudnorm") + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err) + + @define_tool( + description=( + "Apply a color grade to a video via ffmpeg filter chain. " + "Presets: subtle, neutral_punch, warm_cinematic, none. " + "Omit both preset and filter for auto mode (data-driven per-clip correction)." + ), + skip_permission=True, + ) + async def grade(params: GradeParams) -> str: + cmd = [ + str(HELPERS_DIR / "grade.py"), + params.input_path, + "-o", params.output_path, + ] + if params.filter: + cmd += ["--filter", params.filter] + elif params.preset: + cmd += ["--preset", params.preset] + rc, out, err = _run_helper(cmd) + return _format_result(rc, out, err) - if not first_input or first_input.lower() in ("exit", "quit", "q"): - print("Bye.") - return + # ------------------------------------------------------------------ + # Permission handler — sandboxes file writes to edit_dir; shell off by default + # ------------------------------------------------------------------ - messages.append({"role": "user", "content": first_input}) + def on_permission_request(request, invocation) -> "PermissionRequestResult": + kind = request.kind.value if hasattr(request.kind, "value") else str(request.kind) - turn = 0 - while turn < max_turns: + if kind == "shell" and not enable_shell: + print( + "\n[shell tool blocked — restart with --enable-shell to allow shell commands]", + flush=True, + ) + return PermissionRequestResult(kind="denied-interactively-by-user") + + if kind == "write": + file_name = getattr(request, "file_name", None) or "" + if file_name and not _is_under(Path(file_name), edit_dir): + print(f"\n[write blocked — path outside edit_dir: {file_name}]", flush=True) + return PermissionRequestResult(kind="denied-by-rules") + + return PermissionRequestResult(kind="approved") + + # ------------------------------------------------------------------ + # User input handler (enables ask_user tool in the CLI) + # ------------------------------------------------------------------ + + async def on_user_input_request(request, invocation) -> dict: + question = request.get("question", "") + choices = request.get("choices") + print(f"\nAssistant asks: {question}") + if choices: + for i, c in enumerate(choices, 1): + print(f" {i}. {c}") try: - response = client.chat.completions.create( - model=model, - messages=messages, - tools=TOOLS, - max_tokens=4096, + answer = await asyncio.get_event_loop().run_in_executor( + None, lambda: input("Your answer: ").strip() ) - except KeyboardInterrupt: - print("\n[Interrupted]") - break - except Exception as e: - print(f"\n[API error: {e}]") - break - - choice = response.choices[0] - message = choice.message - - # Serialize the assistant message back into the history - msg_dict: dict = {"role": "assistant", "content": message.content} - if message.tool_calls: - msg_dict["tool_calls"] = [ - { - "id": tc.id, - "type": "function", - "function": { - "name": tc.function.name, - "arguments": tc.function.arguments, - }, - } - for tc in message.tool_calls - ] - messages.append(msg_dict) - - if message.tool_calls: - # Show any explanation the model included alongside the tool calls - if message.content: - print(f"\nAssistant: {message.content}\n") - - # Execute every requested tool call - image_paths: list[Path] = [] - - for tc in message.tool_calls: - tool_name = tc.function.name - try: - tool_args = json.loads(tc.function.arguments) - except json.JSONDecodeError: - messages.append({ - "role": "tool", - "tool_call_id": tc.id, - "content": ( - f"[Invalid JSON in tool arguments — could not parse. " - f"Raw arguments: {tc.function.arguments!r}. Please retry with valid JSON.]" - ), - }) - continue - - # Pretty-print what we're doing - args_preview = ", ".join( - f"{k}={v!r}" for k, v in list(tool_args.items())[:3] - ) - print(f" [tool] {tool_name}({args_preview})", flush=True) + except (EOFError, KeyboardInterrupt): + answer = "" + return {"answer": answer, "wasFreeform": True} - result_text, image_path = dispatch_tool( - tool_name, tool_args, videos_dir, edit_dir, enable_bash=enable_bash - ) + # ------------------------------------------------------------------ + # Build system prompt + # ------------------------------------------------------------------ - # Truncate very long results so we don't blow the context window - if len(result_text) > MAX_TOOL_RESULT_LENGTH: - result_text = result_text[:MAX_TOOL_RESULT_LENGTH] + "\n... [truncated]" + system_content = ( + load_skill_prompt() + + f"\n\n## Session context\n\n" + f"- Videos directory: `{videos_dir}`\n" + f"- Edit directory: `{edit_dir}`\n" + f"- Helpers directory: `{HELPERS_DIR}`\n" + f"- All session outputs must go to `{edit_dir}/` (Hard Rule 12).\n" + ) - messages.append({ - "role": "tool", - "tool_call_id": tc.id, - "content": result_text, - }) + # ------------------------------------------------------------------ + # Print banner + # ------------------------------------------------------------------ - if image_path and image_path.exists(): - image_paths.append(image_path) + print(f"\nvideo-use — GitHub Copilot SDK orchestrator") + print(f" model: {model or 'auto (Copilot selects)'}") + print(f" videos: {videos_dir}") + print(f" shell: {'enabled' if enable_shell else 'disabled (--enable-shell to allow)'}") + print("Type your message. Enter 'exit' or press Ctrl+C to quit.\n") - # Inject timeline view images as user messages so vision-capable - # models (gpt-4o, etc.) can reason about them - for img_path in image_paths: - messages.append(_build_image_message(img_path)) + # ------------------------------------------------------------------ + # Prior session memory + # ------------------------------------------------------------------ - turn += 1 - continue # Let the model respond to the tool results + project_md = edit_dir / "project.md" + initial_context: str | None = None + if project_md.exists(): + prior = project_md.read_text().strip() + if prior: + initial_context = ( + f"[Prior session memory — project.md]\n\n{prior}\n\n---\n" + "I'm back. What should we pick up from or start fresh on?" + ) - # No tool calls — conversational turn - if message.content: - print(f"\nAssistant: {message.content}\n") + # ------------------------------------------------------------------ + # SDK client + session + # ------------------------------------------------------------------ - if choice.finish_reason == "stop": - try: - user_input = input("You: ").strip() - except (EOFError, KeyboardInterrupt): - print("\nBye.") - break + github_token = os.environ.get("GITHUB_TOKEN", "").strip() or None + config = SubprocessConfig( + cwd=str(videos_dir), + github_token=github_token, + ) - if not user_input or user_input.lower() in ("exit", "quit", "q"): - print("Bye.") - break + session_kwargs: dict = dict( + on_permission_request=on_permission_request, + on_user_input_request=on_user_input_request, + tools=[transcribe, transcribe_batch, pack_transcripts, timeline_view, render, grade], + system_message={"content": system_content}, + streaming=True, + ) + if model: + session_kwargs["model"] = model - messages.append({"role": "user", "content": user_input}) + try: + from copilot.generated.session_events import ( + AssistantMessageData, + AssistantMessageDeltaData, + SessionIdleData, + ) + except ImportError: + # Older SDK versions may use a different import path + from copilot.session_events import ( # type: ignore[no-redef] + AssistantMessageData, + AssistantMessageDeltaData, + SessionIdleData, + ) - turn += 1 + async with CopilotClient(config) as client: + async with await client.create_session(**session_kwargs) as session: + + # Seed prior session memory as first user turn + if initial_context: + seed_done = asyncio.Event() + + def _on_seed(event): + match event.data: + case AssistantMessageData(): + seed_done.set() + case SessionIdleData(): + seed_done.set() + + unsub_seed = session.on(_on_seed) + await session.send(initial_context) + await seed_done.wait() + unsub_seed() + print() + + turn = 0 + while turn < max_turns: + # Collect any pending timeline images + attachments: list[dict] = [] + while _pending_images: + img_path = _pending_images.pop(0) + if img_path.exists(): + att = _prepare_image_attachment(img_path) + if att: + attachments.append(att) + + # Prompt user + try: + user_input = await asyncio.get_event_loop().run_in_executor( + None, lambda: input("You: ").strip() + ) + except (EOFError, KeyboardInterrupt): + print("\nBye.") + break + + if not user_input or user_input.lower() in ("exit", "quit", "q"): + print("Bye.") + break + + # Wait for full response + response_done = asyncio.Event() + print("\nAssistant: ", end="", flush=True) + + def on_event(event): + match event.data: + case AssistantMessageDeltaData() as data: + delta = data.delta_content or "" + print(delta, end="", flush=True) + case AssistantMessageData(): + print() # ensure newline after full message + case SessionIdleData(): + response_done.set() + + unsub = session.on(on_event) + send_kwargs: dict = {"prompt": user_input} + if attachments: + send_kwargs["attachments"] = attachments + + await session.send(**send_kwargs) + await response_done.wait() + unsub() + print() + + turn += 1 if turn >= max_turns: print(f"\n[Reached max_turns={max_turns}. Session ended.]") @@ -770,20 +554,22 @@ def run_session( def main() -> None: ap = argparse.ArgumentParser( - description="GitHub Copilot-backed video editing orchestrator for video-use.", + description="GitHub Copilot SDK video editing orchestrator for video-use.", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=( - "Environment variables:\n" - " GITHUB_TOKEN Personal Access Token with 'copilot' scope (required)\n" - " ELEVENLABS_API_KEY ElevenLabs API key for transcription (required for transcribe tools)\n" - "\nModel options (via GitHub Copilot):\n" - " claude-opus-4-7 Default — Anthropic Claude Opus 4.7, strong reasoning + vision\n" - " claude-sonnet-4-5 Anthropic Claude Sonnet 4.5 — faster, lighter\n" - " gpt-4o OpenAI GPT-4o — strong reasoning, vision support\n" - " gpt-4o-mini OpenAI GPT-4o mini — fastest OpenAI option\n" - " o3-mini OpenAI o3-mini — reasoning model\n" - "\nAlternative endpoint (GitHub Models free tier):\n" - " --endpoint https://models.inference.ai.azure.com\n" + "Authentication (pick one):\n" + " copilot auth login Sign in via browser — no token needed\n" + " GITHUB_TOKEN=... in .env PAT with 'copilot' scope\n" + " https://github.com/settings/tokens\n" + "\nModel options (via GitHub Copilot CLI — use /model inside session to switch):\n" + " (omit --model) Copilot auto-selects the best model\n" + " claude-opus-4.5 Anthropic Claude Opus 4.5 — complex tasks\n" + " claude-sonnet-4.5 Anthropic Claude Sonnet 4.5 — faster\n" + " gpt-5 OpenAI GPT-5\n" + " gpt-4.1 OpenAI GPT-4.1\n" + "\nEnvironment variables:\n" + " GITHUB_TOKEN PAT with 'copilot' scope (alternative to browser login)\n" + " ELEVENLABS_API_KEY ElevenLabs API key for transcription\n" ), ) ap.add_argument( @@ -793,32 +579,23 @@ def main() -> None: ) ap.add_argument( "--model", - default="claude-opus-4-7", - help="Model identifier for the Copilot API (default: claude-opus-4-7).", + default="", + help="Model identifier (default: Copilot auto-selects). Use /model inside session to switch.", ) ap.add_argument( - "--endpoint", - default="https://api.githubcopilot.com", + "--enable-shell", + action="store_true", + default=False, help=( - "GitHub Copilot API base URL " - "(default: https://api.githubcopilot.com). " - "Use https://models.inference.ai.azure.com for GitHub Models." + "Enable the built-in shell tool (disabled by default). " + "Only enable when you trust the model and understand the security implications." ), ) ap.add_argument( "--max-turns", type=int, - default=100, - help="Maximum LLM turns before the session ends (default: 100).", - ) - ap.add_argument( - "--enable-bash", - action="store_true", - default=False, - help=( - "Enable the bash tool (disabled by default). " - "Only enable when you trust the model and understand the security implications." - ), + default=200, + help="Maximum interactive turns before the session ends (default: 200).", ) args = ap.parse_args() @@ -826,12 +603,13 @@ def main() -> None: if not videos_dir.is_dir(): sys.exit(f"Not a directory: {videos_dir}") - run_session( - videos_dir=videos_dir, - model=args.model, - endpoint=args.endpoint, - max_turns=args.max_turns, - enable_bash=args.enable_bash, + asyncio.run( + run_session( + videos_dir=videos_dir, + model=args.model, + enable_shell=args.enable_shell, + max_turns=args.max_turns, + ) ) diff --git a/pyproject.toml b/pyproject.toml index 296cad6..211866a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ [project.optional-dependencies] animations = ["manim"] -copilot = ["openai>=1.0"] +copilot = ["github-copilot-sdk", "pydantic>=2.0"] [build-system] requires = ["setuptools>=61.0"] From 9a7d73d32f0cc52c7545dba274f9eaae40488d64 Mon Sep 17 00:00:00 2001 From: delgerskhn Date: Wed, 22 Apr 2026 19:31:51 +0800 Subject: [PATCH 5/8] fix: move Pydantic models to module level to fix NameError on startup define_tool calls get_type_hints() which resolves annotations from the module's global namespace. Models defined inside run_session() are invisible to it, causing NameError: name 'TranscribeParams' is not defined. Also move copilot/pydantic imports to module level with a graceful fallback (deferred ImportError displayed at runtime via sys.exit). --- orchestrator.py | 96 +++++++++++++++++++++++++++---------------------- 1 file changed, 54 insertions(+), 42 deletions(-) diff --git a/orchestrator.py b/orchestrator.py index 64b1a89..c01fc05 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -33,6 +33,15 @@ from pathlib import Path from typing import Optional +try: + from copilot import CopilotClient, SubprocessConfig, define_tool + from copilot.session import PermissionRequestResult + from pydantic import BaseModel, Field +except ImportError: # deferred error — shown at runtime with a friendly message + CopilotClient = SubprocessConfig = define_tool = PermissionRequestResult = None # type: ignore + BaseModel = object # type: ignore + Field = lambda **_: None # type: ignore + # --------------------------------------------------------------------------- # Repo-relative paths # --------------------------------------------------------------------------- @@ -162,6 +171,50 @@ def _prepare_image_attachment(img_path: Path) -> dict: } +# --------------------------------------------------------------------------- +# Tool parameter models (module-level so get_type_hints() can resolve them) +# --------------------------------------------------------------------------- + + +class TranscribeParams(BaseModel): + video_path: str = Field(description="Absolute path to the video file.") + language: Optional[str] = Field(default=None, description="ISO language code (e.g. 'en'). Omit to auto-detect.") + num_speakers: Optional[int] = Field(default=None, description="Number of speakers for diarization.") + + +class TranscribeBatchParams(BaseModel): + workers: Optional[int] = Field(default=None, description="Parallel workers (default 4).") + num_speakers: Optional[int] = Field(default=None, description="Number of speakers (optional).") + + +class PackTranscriptsParams(BaseModel): + silence_threshold: Optional[float] = Field(default=None, description="Silence gap in seconds that triggers a phrase break (default 0.5).") + + +class TimelineViewParams(BaseModel): + video_path: str = Field(description="Absolute path to the video file.") + start: float = Field(description="Start time in seconds.") + end: float = Field(description="End time in seconds.") + n_frames: Optional[int] = Field(default=None, description="Number of filmstrip frames to extract (default 8).") + transcript_path: Optional[str] = Field(default=None, description="Optional path to transcript JSON for word label overlay.") + + +class RenderParams(BaseModel): + edl_path: str = Field(description="Absolute path to edl.json.") + output_path: str = Field(description="Output video path (e.g. edit/final.mp4).") + preview: Optional[bool] = Field(default=None, description="Preview mode: 1080p, CRF 22, faster encode.") + build_subtitles: Optional[bool] = Field(default=None, description="Build master.srt from transcripts + EDL timeline offsets.") + no_subtitles: Optional[bool] = Field(default=None, description="Skip subtitles even if the EDL references one.") + no_loudnorm: Optional[bool] = Field(default=None, description="Skip audio loudness normalization.") + + +class GradeParams(BaseModel): + input_path: str = Field(description="Input video path.") + output_path: str = Field(description="Output video path.") + preset: Optional[str] = Field(default=None, description="Grade preset: subtle, neutral_punch, warm_cinematic, none.") + filter: Optional[str] = Field(default=None, description="Raw ffmpeg filter string (overrides preset).") + + # --------------------------------------------------------------------------- # Session loop # --------------------------------------------------------------------------- @@ -173,11 +226,7 @@ async def run_session( enable_shell: bool, max_turns: int, ) -> None: - try: - from copilot import CopilotClient, SubprocessConfig, define_tool - from copilot.session import PermissionRequestResult - from pydantic import BaseModel, Field - except ImportError: + if CopilotClient is None: sys.exit( "Required packages not found.\n" "Install with: pip install -e \".[copilot]\"\n" @@ -189,43 +238,6 @@ async def run_session( edit_dir = videos_dir / "edit" edit_dir.mkdir(parents=True, exist_ok=True) - # ------------------------------------------------------------------ - # Tool parameter models - # ------------------------------------------------------------------ - - class TranscribeParams(BaseModel): - video_path: str = Field(description="Absolute path to the video file.") - language: Optional[str] = Field(default=None, description="ISO language code (e.g. 'en'). Omit to auto-detect.") - num_speakers: Optional[int] = Field(default=None, description="Number of speakers for diarization.") - - class TranscribeBatchParams(BaseModel): - workers: Optional[int] = Field(default=None, description="Parallel workers (default 4).") - num_speakers: Optional[int] = Field(default=None, description="Number of speakers (optional).") - - class PackTranscriptsParams(BaseModel): - silence_threshold: Optional[float] = Field(default=None, description="Silence gap in seconds that triggers a phrase break (default 0.5).") - - class TimelineViewParams(BaseModel): - video_path: str = Field(description="Absolute path to the video file.") - start: float = Field(description="Start time in seconds.") - end: float = Field(description="End time in seconds.") - n_frames: Optional[int] = Field(default=None, description="Number of filmstrip frames to extract (default 8).") - transcript_path: Optional[str] = Field(default=None, description="Optional path to transcript JSON for word label overlay.") - - class RenderParams(BaseModel): - edl_path: str = Field(description="Absolute path to edl.json.") - output_path: str = Field(description="Output video path (e.g. edit/final.mp4).") - preview: Optional[bool] = Field(default=None, description="Preview mode: 1080p, CRF 22, faster encode.") - build_subtitles: Optional[bool] = Field(default=None, description="Build master.srt from transcripts + EDL timeline offsets.") - no_subtitles: Optional[bool] = Field(default=None, description="Skip subtitles even if the EDL references one.") - no_loudnorm: Optional[bool] = Field(default=None, description="Skip audio loudness normalization.") - - class GradeParams(BaseModel): - input_path: str = Field(description="Input video path.") - output_path: str = Field(description="Output video path.") - preset: Optional[str] = Field(default=None, description="Grade preset: subtle, neutral_punch, warm_cinematic, none.") - filter: Optional[str] = Field(default=None, description="Raw ffmpeg filter string (overrides preset).") - # ------------------------------------------------------------------ # Tool implementations # ------------------------------------------------------------------ From 8b35fa7faf62632c8cd334dfd01dadb6919356f4 Mon Sep 17 00:00:00 2001 From: delgerskhn Date: Wed, 22 Apr 2026 20:16:33 +0800 Subject: [PATCH 6/8] refactor: streamline event handling in run_session for Copilot SDK integration --- orchestrator.py | 50 ++++++++++++++++++++++++------------------------- 1 file changed, 24 insertions(+), 26 deletions(-) diff --git a/orchestrator.py b/orchestrator.py index c01fc05..4aa6d99 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -471,19 +471,11 @@ async def on_user_input_request(request, invocation) -> dict: if model: session_kwargs["model"] = model - try: - from copilot.generated.session_events import ( - AssistantMessageData, - AssistantMessageDeltaData, - SessionIdleData, - ) - except ImportError: - # Older SDK versions may use a different import path - from copilot.session_events import ( # type: ignore[no-redef] - AssistantMessageData, - AssistantMessageDeltaData, - SessionIdleData, - ) + from copilot.generated.session_events import SessionEventType + + def _event_type(event) -> str: + et = getattr(event, "type", "") + return str(getattr(et, "value", et)) async with CopilotClient(config) as client: async with await client.create_session(**session_kwargs) as session: @@ -493,11 +485,12 @@ async def on_user_input_request(request, invocation) -> dict: seed_done = asyncio.Event() def _on_seed(event): - match event.data: - case AssistantMessageData(): - seed_done.set() - case SessionIdleData(): - seed_done.set() + et = _event_type(event) + if et in ( + SessionEventType.ASSISTANT_MESSAGE.value, + SessionEventType.SESSION_IDLE.value, + ): + seed_done.set() unsub_seed = session.on(_on_seed) await session.send(initial_context) @@ -534,14 +527,19 @@ def _on_seed(event): print("\nAssistant: ", end="", flush=True) def on_event(event): - match event.data: - case AssistantMessageDeltaData() as data: - delta = data.delta_content or "" - print(delta, end="", flush=True) - case AssistantMessageData(): - print() # ensure newline after full message - case SessionIdleData(): - response_done.set() + et = _event_type(event) + # Support both current "assistant.message_delta" and legacy + # docs/examples that use "assistant.message.delta". + if et in ( + SessionEventType.ASSISTANT_MESSAGE_DELTA.value, + "assistant.message.delta", + ): + delta = getattr(event.data, "delta_content", "") or "" + print(delta, end="", flush=True) + elif et == SessionEventType.ASSISTANT_MESSAGE.value: + print() # ensure newline after full message + elif et == SessionEventType.SESSION_IDLE.value: + response_done.set() unsub = session.on(on_event) send_kwargs: dict = {"prompt": user_input} From 6b395779867e6163ed5c7151bdbde2c6e90537be Mon Sep 17 00:00:00 2001 From: delgerskhn Date: Fri, 24 Apr 2026 11:53:18 +0800 Subject: [PATCH 7/8] Address PR review feedback Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .env.example | 5 +- README.md | 4 +- orchestrator.py | 121 ++++++++++++++++++++++++++++++++++++++---------- 3 files changed, 101 insertions(+), 29 deletions(-) diff --git a/.env.example b/.env.example index 9fa52cf..e34aee0 100644 --- a/.env.example +++ b/.env.example @@ -6,6 +6,7 @@ ELEVENLABS_API_KEY= # copilot auth login # # Option B — Personal Access Token: -# Create one at https://github.com/settings/tokens with the 'copilot' scope, -# then paste it below. The orchestrator loads this file automatically. +# Create a fine-grained token at https://github.com/settings/tokens +# with Copilot Requests permission, then paste it below. +# The orchestrator loads this file automatically. GITHUB_TOKEN= diff --git a/README.md b/README.md index e89fbe5..1e6add6 100644 --- a/README.md +++ b/README.md @@ -72,8 +72,8 @@ copilot auth login # Option A: browser login (recommended, no token cp .env.example .env $EDITOR .env # ELEVENLABS_API_KEY=... ← for transcription (same as before) -# GITHUB_TOKEN=... ← PAT with 'copilot' scope (option B) -# https://github.com/settings/tokens +# GITHUB_TOKEN=... ← fine-grained token with Copilot Requests permission +# https://github.com/settings/tokens (option B) ``` Then run the orchestrator against your video folder: diff --git a/orchestrator.py b/orchestrator.py index 4aa6d99..075fd10 100644 --- a/orchestrator.py +++ b/orchestrator.py @@ -10,7 +10,7 @@ Requirements: pip install -e ".[copilot]" # github-copilot-sdk + pydantic - GITHUB_TOKEN=... in .env # PAT with 'copilot' scope + GITHUB_TOKEN=... in .env # fine-grained token with Copilot Requests permission OR run `copilot auth login` once # sign in via browser (no token needed) ELEVENLABS_API_KEY=... in .env # for transcription ffmpeg and ffprobe on PATH @@ -103,16 +103,49 @@ def _is_under(path: Path, parent: Path) -> bool: return False +def _resolve_session_path(raw_path: str, base_dir: Path) -> Path: + """Resolve a model-provided path relative to the session videos directory.""" + path = Path(raw_path) + if not path.is_absolute(): + path = base_dir / path + return path.resolve() + + +def _validate_edit_dir_path( + raw_path: str, + *, + videos_dir: Path, + edit_dir: Path, + label: str, + must_exist: bool = False, +) -> Path: + """Resolve a session path and enforce the edit_dir sandbox.""" + path = _resolve_session_path(raw_path, videos_dir) + if not _is_under(path, edit_dir): + raise ValueError(f"{label} must stay inside {edit_dir}: {raw_path}") + if must_exist and not path.exists(): + raise ValueError(f"{label} does not exist: {path}") + return path + + # --------------------------------------------------------------------------- # Helpers runner # --------------------------------------------------------------------------- -def _run_helper(args: list[str]) -> tuple[int, str, str]: - """Run a Python helper from the helpers/ directory.""" +async def _run_helper(args: list[str]) -> tuple[int, str, str]: + """Run a Python helper from the helpers/ directory without blocking the event loop.""" cmd = [sys.executable] + args - proc = subprocess.run(cmd, capture_output=True, text=True) - return proc.returncode, proc.stdout, proc.stderr + proc = await asyncio.create_subprocess_exec( + *cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + stdout_bytes, stderr_bytes = await proc.communicate() + stdout = stdout_bytes.decode(errors="replace") if stdout_bytes is not None else "" + stderr = stderr_bytes.decode(errors="replace") if stderr_bytes is not None else "" + returncode = proc.returncode if proc.returncode is not None else 1 + return returncode, stdout, stderr def _format_result(returncode: int, stdout: str, stderr: str) -> str: @@ -145,18 +178,20 @@ def _prepare_image_attachment(img_path: Path) -> dict: with tempfile.NamedTemporaryFile(suffix=".jpg", delete=False) as tmp: tmp_path = Path(tmp.name) try: - subprocess.run( + result = subprocess.run( [ "ffmpeg", "-y", "-i", str(img_path), "-vf", "scale='min(960,iw)':-2", str(tmp_path), ], - capture_output=True, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, check=False, ) - raw = tmp_path.read_bytes() - mime = "image/jpeg" - except Exception: + if result.returncode == 0 and tmp_path.exists() and tmp_path.stat().st_size > 0: + raw = tmp_path.read_bytes() + mime = "image/jpeg" + except OSError: pass finally: tmp_path.unlink(missing_ok=True) @@ -257,7 +292,7 @@ async def transcribe(params: TranscribeParams) -> str: cmd += ["--language", params.language] if params.num_speakers: cmd += ["--num-speakers", str(params.num_speakers)] - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) return _format_result(rc, out, err) @define_tool( @@ -275,7 +310,7 @@ async def transcribe_batch(params: TranscribeBatchParams) -> str: cmd += ["--workers", str(params.workers)] if params.num_speakers: cmd += ["--num-speakers", str(params.num_speakers)] - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) return _format_result(rc, out, err) @define_tool( @@ -289,7 +324,7 @@ async def pack_transcripts(params: PackTranscriptsParams) -> str: cmd = [str(HELPERS_DIR / "pack_transcripts.py"), "--edit-dir", str(edit_dir)] if params.silence_threshold is not None: cmd += ["--silence-threshold", str(params.silence_threshold)] - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) return _format_result(rc, out, err) # Side-channel for the last timeline image path so it can be attached in the @@ -320,7 +355,7 @@ async def timeline_view(params: TimelineViewParams) -> str: cmd += ["--n-frames", str(params.n_frames)] if params.transcript_path: cmd += ["--transcript", params.transcript_path] - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) result = _format_result(rc, out, err) if rc == 0 and out_img.exists(): _pending_images.append(out_img) @@ -336,10 +371,27 @@ async def timeline_view(params: TimelineViewParams) -> str: skip_permission=True, ) async def render(params: RenderParams) -> str: + try: + edl_path = _validate_edit_dir_path( + params.edl_path, + videos_dir=videos_dir, + edit_dir=edit_dir, + label="edl_path", + must_exist=True, + ) + output_path = _validate_edit_dir_path( + params.output_path, + videos_dir=videos_dir, + edit_dir=edit_dir, + label="output_path", + ) + except ValueError as exc: + return f"[invalid input]\n{exc}" + cmd = [ str(HELPERS_DIR / "render.py"), - params.edl_path, - "-o", params.output_path, + str(edl_path), + "-o", str(output_path), ] if params.preview: cmd.append("--preview") @@ -349,7 +401,7 @@ async def render(params: RenderParams) -> str: cmd.append("--no-subtitles") if params.no_loudnorm: cmd.append("--no-loudnorm") - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) return _format_result(rc, out, err) @define_tool( @@ -361,16 +413,33 @@ async def render(params: RenderParams) -> str: skip_permission=True, ) async def grade(params: GradeParams) -> str: + try: + input_path = _validate_edit_dir_path( + params.input_path, + videos_dir=videos_dir, + edit_dir=edit_dir, + label="input_path", + must_exist=True, + ) + output_path = _validate_edit_dir_path( + params.output_path, + videos_dir=videos_dir, + edit_dir=edit_dir, + label="output_path", + ) + except ValueError as exc: + return f"[invalid input]\n{exc}" + cmd = [ str(HELPERS_DIR / "grade.py"), - params.input_path, - "-o", params.output_path, + str(input_path), + "-o", str(output_path), ] if params.filter: cmd += ["--filter", params.filter] elif params.preset: cmd += ["--preset", params.preset] - rc, out, err = _run_helper(cmd) + rc, out, err = await _run_helper(cmd) return _format_result(rc, out, err) # ------------------------------------------------------------------ @@ -389,9 +458,11 @@ def on_permission_request(request, invocation) -> "PermissionRequestResult": if kind == "write": file_name = getattr(request, "file_name", None) or "" - if file_name and not _is_under(Path(file_name), edit_dir): - print(f"\n[write blocked — path outside edit_dir: {file_name}]", flush=True) - return PermissionRequestResult(kind="denied-by-rules") + if file_name: + file_path = _resolve_session_path(file_name, videos_dir) + if not _is_under(file_path, edit_dir): + print(f"\n[write blocked — path outside edit_dir: {file_name}]", flush=True) + return PermissionRequestResult(kind="denied-by-rules") return PermissionRequestResult(kind="approved") @@ -569,7 +640,7 @@ def main() -> None: epilog=( "Authentication (pick one):\n" " copilot auth login Sign in via browser — no token needed\n" - " GITHUB_TOKEN=... in .env PAT with 'copilot' scope\n" + " GITHUB_TOKEN=... in .env Fine-grained token with Copilot Requests permission\n" " https://github.com/settings/tokens\n" "\nModel options (via GitHub Copilot CLI — use /model inside session to switch):\n" " (omit --model) Copilot auto-selects the best model\n" @@ -578,7 +649,7 @@ def main() -> None: " gpt-5 OpenAI GPT-5\n" " gpt-4.1 OpenAI GPT-4.1\n" "\nEnvironment variables:\n" - " GITHUB_TOKEN PAT with 'copilot' scope (alternative to browser login)\n" + " GITHUB_TOKEN Fine-grained token with Copilot Requests permission\n" " ELEVENLABS_API_KEY ElevenLabs API key for transcription\n" ), ) From 43fb7f409267a12a7fb40ee5d57f88d73652c559 Mon Sep 17 00:00:00 2001 From: Delgerskhn Date: Fri, 24 Apr 2026 16:22:33 +0800 Subject: [PATCH 8/8] Update pyproject.toml Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com> --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 211866a..29e6f87 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,7 @@ dependencies = [ [project.optional-dependencies] animations = ["manim"] -copilot = ["github-copilot-sdk", "pydantic>=2.0"] +copilot = ["github-copilot-sdk>=0.1.0; python_version >= '3.11'", "pydantic>=2.0"] [build-system] requires = ["setuptools>=61.0"]