From 0d96d58624b763e5a42b0812962c1da7cf15e221 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 11:59:11 +0200 Subject: [PATCH 1/8] Refactor repo profiles into config-driven system Move hardcoded pytorch profile into a config-driven RepoProfile registry loaded from [repos.*] sections in config.toml. Prompt templates are discovered by naming convention. Built-in defaults used as fallback when config has no [repos] section. --- ptq/config.py | 19 ++++++ ptq/repo_profiles.py | 115 ++++++++++++++++++++++++++++++++++++ tests/test_repo_profiles.py | 115 ++++++++++++++++++++++++++++++++++++ 3 files changed, 249 insertions(+) create mode 100644 ptq/repo_profiles.py create mode 100644 tests/test_repo_profiles.py diff --git a/ptq/config.py b/ptq/config.py index f80a7ba..4a51ad9 100644 --- a/ptq/config.py +++ b/ptq/config.py @@ -48,6 +48,16 @@ # title = "My Investigation" # body = "Investigate with my preferred checklist..." +[repos.pytorch] +github_repo = "pytorch/pytorch" +clone_url = "https://github.com/pytorch/pytorch.git" +dir_name = "pytorch" +smoke_test_import = "torch" +repro_import_hint = "import torch" +uses_custom_worktree_tool = true +needs_cpp_build = true +lint_cmd = "spin fixlint" + [build.env] USE_NINJA = "1" USE_NNPACK = "0" @@ -229,6 +239,7 @@ class Config: build_env: dict[str, str] = field( default_factory=lambda: {"USE_NINJA": "1", "USE_NNPACK": "0"} ) + repos: dict = field(default_factory=dict) def models_for(self, agent: str) -> AgentModels: return self.agent_models.get(agent, AgentModels(available=[], default="")) @@ -320,6 +331,13 @@ def _collect_presets(section: dict) -> dict[str, PromptPreset]: ).items() } + repos_section = data.get("repos", {}) + repos: dict = {} + if repos_section: + from ptq.repo_profiles import load_profiles_from_config + + repos = load_profiles_from_config(repos_section) + return Config( default_agent=defaults.get("agent", "claude"), default_model=defaults.get("model", "opus"), @@ -328,6 +346,7 @@ def 
_collect_presets(section: dict) -> dict[str, PromptPreset]: agent_models=agent_models, prompt_presets=prompt_presets, build_env=build_env, + repos=repos, ) diff --git a/ptq/repo_profiles.py b/ptq/repo_profiles.py new file mode 100644 index 0000000..54be6b1 --- /dev/null +++ b/ptq/repo_profiles.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +from dataclasses import dataclass +from pathlib import Path + +PROMPTS_DIR = Path(__file__).parent.parent / "prompts" + + +@dataclass(frozen=True) +class RepoProfile: + name: str + github_repo: str + clone_url: str + dir_name: str + smoke_test_import: str + repro_import_hint: str + uses_custom_worktree_tool: bool + needs_cpp_build: bool + lint_cmd: str | None + prompt_template: str + adhoc_prompt_template: str + + +def _resolve_prompt_templates(name: str) -> tuple[str, str]: + """Derive prompt filenames by convention, with pytorch backward compat.""" + if name == "pytorch": + return "investigate.md", "adhoc.md" + return f"investigate_{name}.md", f"adhoc_{name}.md" + + +def _validate_prompt_templates(profile: RepoProfile) -> None: + for attr in ("prompt_template", "adhoc_prompt_template"): + filename = getattr(profile, attr) + path = PROMPTS_DIR / filename + if not path.exists(): + raise ValueError( + f"Prompt template '{filename}' not found at {path}. " + f"Create it to use the '{profile.name}' repo profile." + ) + + +# Built-in defaults used when config has no [repos] section. 
+_DEFAULT_PROFILES: dict[str, RepoProfile] = { + "pytorch": RepoProfile( + name="pytorch", + github_repo="pytorch/pytorch", + clone_url="https://github.com/pytorch/pytorch.git", + dir_name="pytorch", + smoke_test_import="torch", + repro_import_hint="import torch", + uses_custom_worktree_tool=True, + needs_cpp_build=True, + lint_cmd="spin fixlint", + prompt_template="investigate.md", + adhoc_prompt_template="adhoc.md", + ), +} + + +def load_profiles_from_config(repos_section: dict) -> dict[str, RepoProfile]: + """Parse [repos.*] TOML sections into RepoProfile instances.""" + profiles: dict[str, RepoProfile] = {} + for name, data in repos_section.items(): + if not isinstance(data, dict): + continue + investigate, adhoc = _resolve_prompt_templates(name) + profiles[name] = RepoProfile( + name=name, + github_repo=data["github_repo"], + clone_url=data["clone_url"], + dir_name=data.get("dir_name", name), + smoke_test_import=data["smoke_test_import"], + repro_import_hint=data["repro_import_hint"], + uses_custom_worktree_tool=data.get("uses_custom_worktree_tool", False), + needs_cpp_build=data.get("needs_cpp_build", False), + lint_cmd=data.get("lint_cmd"), + prompt_template=data.get("prompt_template", investigate), + adhoc_prompt_template=data.get("adhoc_prompt_template", adhoc), + ) + for profile in profiles.values(): + _validate_prompt_templates(profile) + return profiles + + +_profiles_cache: dict[str, RepoProfile] | None = None + + +def _loaded_profiles() -> dict[str, RepoProfile]: + global _profiles_cache + if _profiles_cache is None: + from ptq.config import load_config + + cfg = load_config() + _profiles_cache = cfg.repos if cfg.repos else dict(_DEFAULT_PROFILES) + return _profiles_cache + + +def get_profile(name: str) -> RepoProfile: + profiles = _loaded_profiles() + profile = profiles.get(name) + if profile is None: + raise ValueError( + f"Unknown repo '{name}'. 
Available: {', '.join(profiles)}" + ) + return profile + + +def available_repos() -> list[str]: + return list(_loaded_profiles()) + + +def reset_cache() -> None: + """Clear cached profiles (for testing).""" + global _profiles_cache + _profiles_cache = None diff --git a/tests/test_repo_profiles.py b/tests/test_repo_profiles.py new file mode 100644 index 0000000..ddf8818 --- /dev/null +++ b/tests/test_repo_profiles.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import pytest + +from ptq.repo_profiles import ( + _DEFAULT_PROFILES, + available_repos, + get_profile, + load_profiles_from_config, + reset_cache, +) + + +class TestGetProfile: + def test_pytorch(self): + p = get_profile("pytorch") + assert p.name == "pytorch" + assert p.github_repo == "pytorch/pytorch" + assert p.dir_name == "pytorch" + assert p.needs_cpp_build is True + assert p.uses_custom_worktree_tool is True + assert p.lint_cmd == "spin fixlint" + + def test_unknown_raises(self): + with pytest.raises(ValueError, match="Unknown repo 'nope'"): + get_profile("nope") + + def test_all_profiles_have_prompt_templates(self): + for name, profile in _DEFAULT_PROFILES.items(): + assert profile.prompt_template, f"{name} missing prompt_template" + assert profile.adhoc_prompt_template, f"{name} missing adhoc_prompt_template" + + def test_profiles_frozen(self): + p = get_profile("pytorch") + with pytest.raises(AttributeError): + p.name = "changed" + + def test_available_repos(self): + repos = available_repos() + assert "pytorch" in repos + + +class TestLoadFromConfig: + def test_minimal_config(self, tmp_path): + # Create a minimal prompt file so validation passes + from ptq.repo_profiles import PROMPTS_DIR + + prompt = PROMPTS_DIR / "investigate_myrepo.md" + adhoc = PROMPTS_DIR / "adhoc_myrepo.md" + try: + prompt.write_text("test") + adhoc.write_text("test") + + section = { + "myrepo": { + "github_repo": "org/myrepo", + "clone_url": "https://github.com/org/myrepo.git", + "dir_name": "myrepo", + 
"smoke_test_import": "myrepo", + "repro_import_hint": "import myrepo", + }, + } + profiles = load_profiles_from_config(section) + assert "myrepo" in profiles + p = profiles["myrepo"] + assert p.github_repo == "org/myrepo" + assert p.needs_cpp_build is False + assert p.uses_custom_worktree_tool is False + assert p.lint_cmd is None + assert p.prompt_template == "investigate_myrepo.md" + assert p.adhoc_prompt_template == "adhoc_myrepo.md" + finally: + prompt.unlink(missing_ok=True) + adhoc.unlink(missing_ok=True) + + def test_missing_prompt_raises(self): + section = { + "noprompts": { + "github_repo": "org/noprompts", + "clone_url": "https://github.com/org/noprompts.git", + "smoke_test_import": "noprompts", + "repro_import_hint": "import noprompts", + }, + } + with pytest.raises(ValueError, match="not found"): + load_profiles_from_config(section) + + def test_defaults_override(self, tmp_path): + """Boolean fields default to False when not specified.""" + from ptq.repo_profiles import PROMPTS_DIR + + prompt = PROMPTS_DIR / "investigate_testdefaults.md" + adhoc = PROMPTS_DIR / "adhoc_testdefaults.md" + try: + prompt.write_text("test") + adhoc.write_text("test") + section = { + "testdefaults": { + "github_repo": "org/testdefaults", + "clone_url": "https://github.com/org/testdefaults.git", + "smoke_test_import": "testdefaults", + "repro_import_hint": "import testdefaults", + "uses_custom_worktree_tool": True, + "needs_cpp_build": True, + "lint_cmd": "make lint", + }, + } + profiles = load_profiles_from_config(section) + p = profiles["testdefaults"] + assert p.uses_custom_worktree_tool is True + assert p.needs_cpp_build is True + assert p.lint_cmd == "make lint" + finally: + prompt.unlink(missing_ok=True) + adhoc.unlink(missing_ok=True) From 7fdddd5a25133983d5f2f942af7d2e8110f14534 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 11:59:23 +0200 Subject: [PATCH 2/8] Add torchtitan support: profile, prompts, and domain model - Add torchtitan profile to 
config.toml and _DEFAULT_PROFILES - Add investigate/adhoc prompt templates for torchtitan - Add repo field to JobRecord and RunRequest - Include repo name in job IDs to avoid cross-repo collisions - Filter find_by_issue by repo for correct re-run matching - Update agent.py and issue.py to use repo profiles --- prompts/adhoc_torchtitan.md | 62 ++++++++++++++++ prompts/investigate_torchtitan.md | 95 ++++++++++++++++++++++++ ptq/agent.py | 42 +++++++---- ptq/config.py | 7 ++ ptq/domain/models.py | 5 ++ ptq/domain/policies.py | 10 ++- ptq/infrastructure/job_repository.py | 3 + ptq/issue.py | 6 +- ptq/repo_profiles.py | 13 ++++ tests/test_job.py | 6 +- tests/test_repo_profiles.py | 10 +++ tests/test_torchtitan.py | 106 +++++++++++++++++++++++++++ 12 files changed, 344 insertions(+), 21 deletions(-) create mode 100644 prompts/adhoc_torchtitan.md create mode 100644 prompts/investigate_torchtitan.md create mode 100644 tests/test_torchtitan.py diff --git a/prompts/adhoc_torchtitan.md b/prompts/adhoc_torchtitan.md new file mode 100644 index 0000000..1393b33 --- /dev/null +++ b/prompts/adhoc_torchtitan.md @@ -0,0 +1,62 @@ +# TorchTitan Task Agent + +You are performing a task on a TorchTitan codebase. + +## Job Info +- **Job ID**: {job_id} +- **Mode**: adhoc + +## Environment +- **Python** (always use this): `{workspace}/jobs/{job_id}/.venv/bin/python` +- **TorchTitan source** (edit here): `{workspace}/jobs/{job_id}/torchtitan/` +- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/` + +## Task + +{task_description} + +## Worklog + +Maintain a running worklog at `{workspace}/jobs/{job_id}/worklog.md`. Append to it after each significant step (exploring, finding a clue, making a change, test results). Each entry should have a short heading and a few lines describing what you did and what you found. This lets the user check progress while you're still running. 
+ +## CRITICAL RULES + +### Stay in your worktree +You MUST only read and write files within these directories: +- `{workspace}/jobs/{job_id}/` (your job directory — edits, scripts, artifacts) +- `{workspace}/pytorch/` (upstream PyTorch source — read and edit if the root cause is in PyTorch) +- `{workspace}/scripts/` (read-only) + +NEVER `cd` outside these directories. All TorchTitan source is in YOUR worktree at `{workspace}/jobs/{job_id}/torchtitan/`. + +### Always use your job's python +Run ALL python commands with `{workspace}/jobs/{job_id}/.venv/bin/python`. NEVER use bare `python`, `python3`, or any other python binary. NEVER use `conda`, `pip install`, or modify the environment. + +### Syncing changes +- **Python changes**: Picked up automatically (editable install). No action needed. +- TorchTitan is pure Python — no C++ rebuild needed. + +## Debugging Tools + +**Distributed training debugging**: +- Single-GPU debugging: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=1 ` +- Multi-GPU: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=N ` +- Enable debug logging: `TORCH_DISTRIBUTED_DEBUG=DETAIL ` +- Trace compilation: `TORCH_LOGS="output_code" ` + +**CUDA errors**: +``` +CUDA_LAUNCH_BLOCKING=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool memcheck {workspace}/jobs/{job_id}/.venv/bin/python +``` + +## Output +Write these files to `{workspace}/jobs/{job_id}/`: + +**report.md** — A concise summary of what you did and what you found. + +**fix.diff** (if you made code changes) — Generate with: +``` +cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff +``` + +IMPORTANT: Always generate report.md before finishing. Generate fix.diff if you made any code changes. 
diff --git a/prompts/investigate_torchtitan.md b/prompts/investigate_torchtitan.md new file mode 100644 index 0000000..7d8338d --- /dev/null +++ b/prompts/investigate_torchtitan.md @@ -0,0 +1,95 @@ +# TorchTitan Issue Investigation Agent + +You are investigating a TorchTitan bug. Your goal is to reproduce, understand, and fix the issue. + +## Job Info +- **Job ID**: {job_id} +- **Issue**: pytorch/torchtitan#{issue_number} + +## Environment +- **Python** (always use this): `{workspace}/jobs/{job_id}/.venv/bin/python` +- **TorchTitan source** (edit here): `{workspace}/jobs/{job_id}/torchtitan/` +- **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/` + +## Issue Context + +{issue_context} + +## Worklog + +Maintain a running worklog at `{workspace}/jobs/{job_id}/worklog.md`. Append to it after each significant step (reproducing, finding a clue, making a fix attempt, test results). Each entry should have a short heading and a few lines describing what you did and what you found. This lets the user check progress while you're still running. + +## CRITICAL RULES + +### Stay in your worktree +You MUST only read and write files within these directories: +- `{workspace}/jobs/{job_id}/` (your job directory — edits, scripts, artifacts) +- `{workspace}/pytorch/` (upstream PyTorch source — read and edit if the root cause is in PyTorch) +- `{workspace}/scripts/` (read-only) + +NEVER `cd` outside these directories. All TorchTitan source is in YOUR worktree at `{workspace}/jobs/{job_id}/torchtitan/`. + +### Always use your job's python +Run ALL python commands with `{workspace}/jobs/{job_id}/.venv/bin/python`. NEVER use bare `python`, `python3`, or any other python binary. NEVER use `conda`, `pip install`, or modify the environment. + +### Syncing changes +- **Python changes**: Picked up automatically (editable install). No action needed. +- TorchTitan is pure Python — no C++ rebuild needed. 
+ +## Debugging Tools + +**Distributed training debugging**: +- Single-GPU debugging: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=1 ` +- Multi-GPU: `{workspace}/jobs/{job_id}/.venv/bin/torchrun --nproc_per_node=N ` +- Enable debug logging: `TORCH_DISTRIBUTED_DEBUG=DETAIL ` +- Trace compilation: `TORCH_LOGS="output_code" ` +- Disable async compile: `TORCHINDUCTOR_COMPILE_THREADS=1 ` + +**CUDA errors**: +``` +CUDA_LAUNCH_BLOCKING=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool memcheck {workspace}/jobs/{job_id}/.venv/bin/python +``` + +## Instructions + +### 1. Reproduce +- If a repro script exists at `{workspace}/jobs/{job_id}/repro.py`, run it: + ``` + {workspace}/jobs/{job_id}/.venv/bin/python {workspace}/jobs/{job_id}/repro.py + ``` +- If no repro script exists, write one based on the issue description and run it. +- For distributed issues, use `torchrun` with the appropriate number of processes. +- **You MUST confirm you can reproduce the reported failure before moving on.** If you cannot reproduce after reasonable attempts, stop and document in `report.md` that the issue could not be reproduced, including hardware, PyTorch version, TorchTitan version, and what you tried. + +### 2. Investigate +- Read relevant TorchTitan source code in `{workspace}/jobs/{job_id}/torchtitan/`. +- Key source locations: `torchtitan/models/`, `torchtitan/parallelisms/`, `torchtitan/train.py`, `torchtitan/config_manager.py` +- **Also check upstream PyTorch** at `{workspace}/pytorch/` — TorchTitan bugs are often caused by changes in PyTorch (FSDP, tensor parallel, compile, distributed). Cross-reference if the stack trace touches `torch.*` internals. +- Trace the code path from the repro to the root cause. +- Understand how TorchTitan's parallelism wrappers, model definitions, and training loop interact. + +### 3. Fix +- Edit source files in `{workspace}/jobs/{job_id}/torchtitan/` to fix the bug. 
+- If the root cause is in PyTorch, edit files in `{workspace}/pytorch/` instead (changes are picked up after C++ rebuild if needed). +- Make minimal, targeted changes. + +### 4. Test +- Re-run the repro script to confirm the fix works. +- Write additional edge-case tests if appropriate. + +### 5. Output +Write these files to `{workspace}/jobs/{job_id}/`: + +**report.md** — A concise report covering: +- Summary of the issue +- Root cause analysis +- What the fix does +- Repro script — wrap in a collapsible `<details>` block with `<summary>Repro Script</summary>`, containing the full script as a fenced python code block followed by its output +- Test results + +**fix.diff** — Generate with: +``` +cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff +``` + +IMPORTANT: Always generate both report.md and fix.diff before finishing. diff --git a/ptq/agent.py b/ptq/agent.py index 21d6b06..cf3b585 100644 --- a/ptq/agent.py +++ b/ptq/agent.py @@ -3,13 +3,13 @@ import re from pathlib import Path -PROMPT_TEMPLATE = ( - Path(__file__).parent.parent / "prompts" / "investigate.md" -).read_text() +from ptq.issue import format_issue_context +from ptq.repo_profiles import get_profile -ADHOC_PROMPT_TEMPLATE = ( - Path(__file__).parent.parent / "prompts" / "adhoc.md" -).read_text() +PROMPTS_DIR = Path(__file__).parent.parent / "prompts" + +PROMPT_TEMPLATE = (PROMPTS_DIR / "investigate.md").read_text() +ADHOC_PROMPT_TEMPLATE = (PROMPTS_DIR / "adhoc.md").read_text() RESERVED_HEADER_RE = re.compile(r"x-anthropic-\S+", re.IGNORECASE) @@ -18,21 +18,33 @@ MAX_OUTPUT_LINES = 30 DEFAULT_MESSAGE = ( - "Investigate and fix the PyTorch issue described in your system prompt." + "Investigate and fix the issue described in your system prompt." 
) +_template_cache: dict[str, str] = {} + + +def _load_template(filename: str) -> str: + if filename not in _template_cache: + _template_cache[filename] = (PROMPTS_DIR / filename).read_text() + return _template_cache[filename] + def _sanitize_for_api(text: str) -> str: return RESERVED_HEADER_RE.sub("[redacted-header]", text) def build_system_prompt( - issue_data: dict, issue_number: int, job_id: str, workspace: str + issue_data: dict, + issue_number: int, + job_id: str, + workspace: str, + repo: str = "pytorch", ) -> str: - from ptq.issue import format_issue_context - + profile = get_profile(repo) + template = _load_template(profile.prompt_template) return _sanitize_for_api( - PROMPT_TEMPLATE.format( + template.format( job_id=job_id, issue_number=issue_number, issue_context=format_issue_context(issue_data, issue_number), @@ -41,9 +53,13 @@ def build_system_prompt( ) -def build_adhoc_prompt(message: str, job_id: str, workspace: str) -> str: +def build_adhoc_prompt( + message: str, job_id: str, workspace: str, repo: str = "pytorch" +) -> str: + profile = get_profile(repo) + template = _load_template(profile.adhoc_prompt_template) return _sanitize_for_api( - ADHOC_PROMPT_TEMPLATE.format( + template.format( job_id=job_id, task_description=message, workspace=workspace, diff --git a/ptq/config.py b/ptq/config.py index 4a51ad9..cf42b28 100644 --- a/ptq/config.py +++ b/ptq/config.py @@ -58,6 +58,13 @@ needs_cpp_build = true lint_cmd = "spin fixlint" +[repos.torchtitan] +github_repo = "pytorch/torchtitan" +clone_url = "https://github.com/pytorch/torchtitan.git" +dir_name = "torchtitan" +smoke_test_import = "torchtitan" +repro_import_hint = "import torchtitan" + [build.env] USE_NINJA = "1" USE_NNPACK = "0" diff --git a/ptq/domain/models.py b/ptq/domain/models.py index 959d268..3878b0e 100644 --- a/ptq/domain/models.py +++ b/ptq/domain/models.py @@ -82,6 +82,7 @@ class JobRecord: pr_title: str | None = None rebase: RebaseInfo | None = None name: str | None = None + repo: str 
= "pytorch" @property def target(self) -> str: @@ -117,6 +118,8 @@ def to_dict(self) -> dict: d["pr_title"] = self.pr_title if self.name: d["name"] = self.name + if self.repo != "pytorch": + d["repo"] = self.repo if self.rebase is not None: rebase_data = self.rebase.to_dict() if rebase_data: @@ -145,6 +148,7 @@ def from_dict(cls, job_id: str, data: dict) -> JobRecord: pr_title=data.get("pr_title"), rebase=RebaseInfo.from_dict(rebase_data) if rebase_data else None, name=data.get("name"), + repo=data.get("repo", "pytorch"), ) @@ -162,6 +166,7 @@ class RunRequest: existing_job_id: str | None = None verbose: bool = False name: str | None = None + repo: str = "pytorch" @dataclass diff --git a/ptq/domain/policies.py b/ptq/domain/policies.py index 324dcad..0d4b4c3 100644 --- a/ptq/domain/policies.py +++ b/ptq/domain/policies.py @@ -4,8 +4,12 @@ from datetime import datetime -def make_job_id(issue_number: int | None = None, message: str | None = None) -> str: +def make_job_id( + issue_number: int | None = None, + message: str | None = None, + repo: str = "pytorch", +) -> str: date = datetime.now().strftime("%Y%m%d") if issue_number is not None: - return f"{date}-{issue_number}" - return f"{date}-adhoc-{hashlib.md5((message or 'adhoc').encode()).hexdigest()[:6]}" + return f"{date}-{repo}-{issue_number}" + return f"{date}-{repo}-adhoc-{hashlib.md5((message or 'adhoc').encode()).hexdigest()[:6]}" diff --git a/ptq/infrastructure/job_repository.py b/ptq/infrastructure/job_repository.py index 913cfe5..c7dda32 100644 --- a/ptq/infrastructure/job_repository.py +++ b/ptq/infrastructure/job_repository.py @@ -113,10 +113,13 @@ def find_by_issue( issue_number: int, machine: str | None = None, local: bool = False, + repo: str = "pytorch", ) -> str | None: for job_id, entry in sorted(self._load_raw().items(), reverse=True): if entry.get("issue") != issue_number: continue + if entry.get("repo", "pytorch") != repo: + continue if local and entry.get("local"): return job_id if machine and 
entry.get("machine") == machine: diff --git a/ptq/issue.py b/ptq/issue.py index 352e52f..68ce011 100644 --- a/ptq/issue.py +++ b/ptq/issue.py @@ -24,14 +24,16 @@ def fetch_issue(issue_number: int, repo: str = "pytorch/pytorch") -> dict: return json.loads(result.stdout) -def extract_repro_script(issue_data: dict) -> str | None: +def extract_repro_script( + issue_data: dict, import_hint: str = "import torch" +) -> str | None: body = issue_data.get("body", "") or "" for comment in issue_data.get("comments", []): body += "\n" + (comment.get("body", "") or "") code_blocks = re.findall(r"```(?:python)?\s*\n(.*?)```", body, re.DOTALL) for block in code_blocks: - if "import torch" in block: + if import_hint in block or "import torch" in block: return block.strip() return None diff --git a/ptq/repo_profiles.py b/ptq/repo_profiles.py index 54be6b1..2dd044a 100644 --- a/ptq/repo_profiles.py +++ b/ptq/repo_profiles.py @@ -54,6 +54,19 @@ def _validate_prompt_templates(profile: RepoProfile) -> None: prompt_template="investigate.md", adhoc_prompt_template="adhoc.md", ), + "torchtitan": RepoProfile( + name="torchtitan", + github_repo="pytorch/torchtitan", + clone_url="https://github.com/pytorch/torchtitan.git", + dir_name="torchtitan", + smoke_test_import="torchtitan", + repro_import_hint="import torchtitan", + uses_custom_worktree_tool=False, + needs_cpp_build=False, + lint_cmd=None, + prompt_template="investigate_torchtitan.md", + adhoc_prompt_template="adhoc_torchtitan.md", + ), } diff --git a/tests/test_job.py b/tests/test_job.py index 7b84828..2b5b061 100644 --- a/tests/test_job.py +++ b/tests/test_job.py @@ -28,12 +28,12 @@ def _clear_pid_after_barrier(path, barrier: threading.Barrier, job_id: str) -> N class TestMakeJobId: def test_issue_id(self, frozen_date): - assert make_job_id(issue_number=42) == "20260217-42" + assert make_job_id(issue_number=42) == "20260217-pytorch-42" def test_adhoc_id(self, frozen_date): result = make_job_id(message="hello") - assert 
result.startswith("20260217-adhoc-") - assert len(result.split("-")) == 3 + assert result.startswith("20260217-pytorch-adhoc-") + assert len(result.split("-")) == 4 def test_adhoc_ids_differ_by_message(self, frozen_date): assert make_job_id(message="a") != make_job_id(message="b") diff --git a/tests/test_repo_profiles.py b/tests/test_repo_profiles.py index ddf8818..a939677 100644 --- a/tests/test_repo_profiles.py +++ b/tests/test_repo_profiles.py @@ -21,6 +21,15 @@ def test_pytorch(self): assert p.uses_custom_worktree_tool is True assert p.lint_cmd == "spin fixlint" + def test_torchtitan(self): + p = get_profile("torchtitan") + assert p.name == "torchtitan" + assert p.github_repo == "pytorch/torchtitan" + assert p.dir_name == "torchtitan" + assert p.needs_cpp_build is False + assert p.uses_custom_worktree_tool is False + assert p.lint_cmd is None + def test_unknown_raises(self): with pytest.raises(ValueError, match="Unknown repo 'nope'"): get_profile("nope") @@ -38,6 +47,7 @@ def test_profiles_frozen(self): def test_available_repos(self): repos = available_repos() assert "pytorch" in repos + assert "torchtitan" in repos class TestLoadFromConfig: diff --git a/tests/test_torchtitan.py b/tests/test_torchtitan.py new file mode 100644 index 0000000..8f9732f --- /dev/null +++ b/tests/test_torchtitan.py @@ -0,0 +1,106 @@ +"""Tests for torchtitan repo support: domain models, issue extraction, prompts.""" + +from __future__ import annotations + +from ptq.agent import build_adhoc_prompt, build_system_prompt +from ptq.domain.models import JobRecord, RunRequest +from ptq.issue import extract_repro_script + + +# -- Domain models -- + + +class TestJobRecordRepo: + def test_default_repo_is_pytorch(self): + record = JobRecord(job_id="j1") + assert record.repo == "pytorch" + + def test_repo_roundtrip(self): + record = JobRecord(job_id="j1", repo="torchtitan") + d = record.to_dict() + assert d["repo"] == "torchtitan" + restored = JobRecord.from_dict("j1", d) + assert restored.repo 
== "torchtitan" + + def test_default_repo_omitted_from_dict(self): + record = JobRecord(job_id="j1", repo="pytorch") + d = record.to_dict() + assert "repo" not in d + + def test_from_dict_missing_repo_defaults_to_pytorch(self): + record = JobRecord.from_dict("j1", {}) + assert record.repo == "pytorch" + + +class TestRunRequestRepo: + def test_default_repo_is_pytorch(self): + req = RunRequest(message="hello") + assert req.repo == "pytorch" + + def test_torchtitan_repo(self): + req = RunRequest(message="hello", repo="torchtitan") + assert req.repo == "torchtitan" + + +# -- Issue extraction -- + + +class TestExtractReproTorchtitan: + def test_finds_torchtitan_import(self): + issue = { + "body": "```python\nimport torchtitan\ncrash()\n```", + } + result = extract_repro_script(issue, import_hint="import torchtitan") + assert result == "import torchtitan\ncrash()" + + def test_torch_import_fallback(self): + """Even with torchtitan hint, blocks with 'import torch' are still found.""" + issue = { + "body": "```python\nimport torch\ntorch.distributed.init_process_group()\n```", + } + result = extract_repro_script(issue, import_hint="import torchtitan") + assert "import torch" in result + + def test_skips_unrelated_blocks(self): + issue = { + "body": "```python\nimport numpy\nprint('hi')\n```", + } + result = extract_repro_script(issue, import_hint="import torchtitan") + assert result is None + + +# -- Agent prompts -- + + +class TestTorchtitanPrompts: + def test_system_prompt_uses_torchtitan_template(self): + issue_data = { + "title": "FSDP crash", + "body": "OOM during training", + "labels": [], + "comments": [], + } + result = build_system_prompt(issue_data, 2818, "j-2818", "/ws", repo="torchtitan") + assert "torchtitan" in result + assert "j-2818" in result + assert "spin fixlint" not in result + assert "create_worktree.py" not in result + + def test_adhoc_prompt_uses_torchtitan_template(self): + result = build_adhoc_prompt("fix FSDP", "j-123", "/ws", repo="torchtitan") 
+ assert "torchtitan" in result + assert "j-123" in result + assert "spin fixlint" not in result + + def test_pytorch_prompts_unchanged(self): + issue_data = { + "title": "Bug", + "body": "desc", + "labels": [], + "comments": [], + } + result = build_system_prompt(issue_data, 42, "j-42", "/ws", repo="pytorch") + assert "spin fixlint" in result + + result = build_adhoc_prompt("fix oom", "j-42", "/ws", repo="pytorch") + assert "spin fixlint" in result From e4e3f75c34144c07cd3d784380b9580d39abb1c0 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 11:59:32 +0200 Subject: [PATCH 3/8] Wire torchtitan through services, CLI, and workspace - run_service / worktree_service: repo-aware worktree and venv setup; move _setup_lightweight_venv to worktree_service - job_service / pr_service / rebase_service: top-level profile imports - cli.py: generic --repo flag, auto-reload via create_debug_app factory - workspace.py: generic _clone_repo driven by repo profiles - app.py: add create_debug_app() factory for uvicorn auto-reload --- ptq/application/job_service.py | 42 +++++++--- ptq/application/pr_service.py | 4 +- ptq/application/rebase_service.py | 4 +- ptq/application/run_service.py | 70 +++++++++++----- ptq/application/worktree_service.py | 120 +++++++++++++++++++++++++--- ptq/cli.py | 77 ++++++++++++------ ptq/web/app.py | 5 ++ ptq/workspace.py | 38 +++++---- tests/test_torchtitan_launch.py | 101 +++++++++++++++++++++++ tests/test_worktree.py | 40 +++++++++- 10 files changed, 413 insertions(+), 88 deletions(-) create mode 100644 tests/test_torchtitan_launch.py diff --git a/ptq/application/job_service.py b/ptq/application/job_service.py index 20bb8c9..fba9a88 100644 --- a/ptq/application/job_service.py +++ b/ptq/application/job_service.py @@ -3,6 +3,7 @@ from ptq.domain.models import JobRecord, JobStatus from ptq.infrastructure.backends import backend_for_job from ptq.infrastructure.job_repository import JobRepository +from ptq.repo_profiles import get_profile from 
ptq.ssh import Backend @@ -30,6 +31,7 @@ def kill_job(repo: JobRepository, job_id: str) -> bool: def clean_single_job(repo: JobRepository, job_id: str) -> JobRecord: """Remove a job: kill agent, delete files, drop from DB. Returns the removed record.""" job = repo.get(job_id) + profile = get_profile(job.repo) backend = backend_for_job(job) ws = backend.workspace job_dir = f"{ws}/jobs/{job_id}" @@ -37,13 +39,18 @@ def clean_single_job(repo: JobRepository, job_id: str) -> JobRecord: if job.pid is not None and backend.is_pid_alive(job.pid): backend.kill_pid(job.pid) - backend.run( - f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch " - f"--parent-dir {job_dir}", - check=False, - ) + if profile.uses_custom_worktree_tool: + backend.run( + f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch " + f"--parent-dir {job_dir}", + check=False, + ) + else: + worktree_path = f"{job_dir}/{profile.dir_name}" + backend.run(f"git -C {ws}/{profile.dir_name} worktree remove --force {worktree_path}", check=False) + backend.run(f"rm -rf {job_dir}", check=False) - backend.run(f"cd {ws}/pytorch && git worktree prune", check=False) + backend.run(f"cd {ws}/{profile.dir_name} && git worktree prune", check=False) repo.delete(job_id) return job @@ -80,21 +87,30 @@ def clean_machine( return [], skipped_running ws = backend.workspace - backend.run(f"cd {ws}/pytorch && git worktree prune", check=False) + # Prune worktrees for all repos that have jobs being cleaned + repos_seen: set[str] = set() removed: list[str] = [] for jid, job in to_remove: + profile = get_profile(job.repo) + repos_seen.add(job.repo) if job.pid is not None and backend.is_pid_alive(job.pid): backend.kill_pid(job.pid) job_dir = f"{ws}/jobs/{jid}" - backend.run( - f"cd {ws}/pytorch && {ws}/.venv/bin/python tools/create_worktree.py remove pytorch " - f"--parent-dir {job_dir}", - check=False, - ) + if profile.uses_custom_worktree_tool: + backend.run( + f"cd {ws}/pytorch 
&& {ws}/.venv/bin/python tools/create_worktree.py remove pytorch " + f"--parent-dir {job_dir}", + check=False, + ) + else: + worktree_path = f"{job_dir}/{profile.dir_name}" + backend.run(f"git -C {ws}/{profile.dir_name} worktree remove --force {worktree_path}", check=False) backend.run(f"rm -rf {job_dir}") repo.delete(jid) removed.append(jid) - backend.run(f"cd {ws}/pytorch && git worktree prune", check=False) + for repo_name in repos_seen: + p = get_profile(repo_name) + backend.run(f"cd {ws}/{p.dir_name} && git worktree prune", check=False) return removed, skipped_running diff --git a/ptq/application/pr_service.py b/ptq/application/pr_service.py index 3511f3b..40f7bf3 100644 --- a/ptq/application/pr_service.py +++ b/ptq/application/pr_service.py @@ -6,6 +6,7 @@ from ptq.domain.models import PRResult, PtqError from ptq.infrastructure.backends import backend_for_job from ptq.infrastructure.job_repository import JobRepository +from ptq.repo_profiles import get_profile from ptq.ssh import Backend _HTTPS_TO_SSH = { @@ -125,7 +126,8 @@ def create_pr( job = repo.get(job_id) backend = backend_for_job(job) job_dir = f"{backend.workspace}/jobs/{job_id}" - worktree = f"{job_dir}/pytorch" + profile = get_profile(job.repo) + worktree = f"{job_dir}/{profile.dir_name}" existing_open_pr_url = "" if job.pr_url: diff --git a/ptq/application/rebase_service.py b/ptq/application/rebase_service.py index e1f2c3e..1b560db 100644 --- a/ptq/application/rebase_service.py +++ b/ptq/application/rebase_service.py @@ -9,6 +9,7 @@ from ptq.domain.models import PtqError, RebaseInfo, RebaseState from ptq.infrastructure.backends import backend_for_job from ptq.infrastructure.job_repository import JobRepository +from ptq.repo_profiles import get_profile from ptq.ssh import Backend, RemoteBackend ProgressCallback = Callable[[str], None] @@ -130,7 +131,8 @@ def rebase( backend = backend_for_job(job) workspace = backend.workspace job_dir = f"{workspace}/jobs/{job_id}" - worktree = f"{job_dir}/pytorch" 
+ profile = get_profile(job.repo) + worktree = f"{job_dir}/{profile.dir_name}" agent_name = agent_name or job.agent model = model or job.model diff --git a/ptq/application/run_service.py b/ptq/application/run_service.py index f562915..9046ffb 100644 --- a/ptq/application/run_service.py +++ b/ptq/application/run_service.py @@ -18,6 +18,8 @@ from ptq.domain.policies import make_job_id from ptq.infrastructure.job_repository import JobRepository from ptq.issue import extract_repro_script +from ptq.application.worktree_service import _setup_lightweight_venv +from ptq.repo_profiles import get_profile from ptq.ssh import Backend, RemoteBackend from ptq.workspace import deploy_scripts @@ -46,11 +48,12 @@ def _chain_result( return next_step() -def _validate_workspace(backend: Backend, workspace: str) -> None: - result = backend.run(f"test -d {workspace}/pytorch/.git", check=False) +def _validate_workspace(backend: Backend, workspace: str, repo: str = "pytorch") -> None: + profile = get_profile(repo) + result = backend.run(f"test -d {workspace}/{profile.dir_name}/.git", check=False) if result.returncode != 0: raise PtqError( - f"Workspace broken: {workspace}/pytorch/.git missing. Re-run: ptq setup" + f"Workspace broken: {workspace}/{profile.dir_name}/.git missing. Re-run: ptq setup" ) @@ -61,6 +64,7 @@ def _try_clone_base_venv( *, verbose: bool = False, progress: ProgressCallback = _noop_progress, + repo: str = "pytorch", ) -> bool: """Clone base workspace venv + source artifacts instead of rebuilding. 
@@ -217,9 +221,20 @@ def _setup_job_venv( verbose: bool = False, progress: ProgressCallback = _noop_progress, build_env_prefix: str = "USE_NINJA=1 ", + repo: str = "pytorch", ) -> None: + profile = get_profile(repo) + + if not profile.needs_cpp_build: + _setup_lightweight_venv( + backend, job_dir, worktree_path, + verbose=verbose, progress=progress, repo=repo, + ) + return + if not _try_clone_base_venv( - backend, job_dir, worktree_path, verbose=verbose, progress=progress + backend, job_dir, worktree_path, verbose=verbose, progress=progress, + repo=repo, ): log.info("slow-path: full editable install for %s", job_dir) with _timed("venv creation", progress): @@ -328,6 +343,8 @@ def launch( is_adhoc = request.issue_number is None issue_number = request.issue_number issue_data = request.issue_data + repo_name = request.repo + profile = get_profile(repo_name) if not is_adhoc and (issue_number is None or issue_data is None): raise PtqError("Issue runs require both issue number and issue data.") @@ -342,13 +359,14 @@ def launch( existing = job_id elif is_adhoc: existing = None - job_id = make_job_id(message=request.message) + job_id = make_job_id(message=request.message, repo=repo_name) run_number = 1 progress(f"Job {job_id} — adhoc (run 1)") else: assert issue_number is not None existing = repo.find_by_issue( - issue_number, machine=request.machine, local=request.local + issue_number, machine=request.machine, local=request.local, + repo=repo_name, ) if existing: job_id = existing @@ -357,15 +375,15 @@ def launch( ) progress(f"Job {job_id} — issue #{issue_number} (run {run_number})") else: - job_id = make_job_id(issue_number) + job_id = make_job_id(issue_number, repo=repo_name) run_number = 1 progress(f"Job {job_id} — issue #{issue_number} (run 1)") job_dir = f"{workspace}/jobs/{job_id}" - worktree_path = f"{job_dir}/pytorch" + worktree_path = f"{job_dir}/{profile.dir_name}" if existing: - _validate_workspace(backend, workspace) + _validate_workspace(backend, workspace, 
repo_name) backend.run(f"mkdir -p {job_dir}") @@ -382,6 +400,7 @@ def launch( workspace=workspace, initializing=True, name=request.name, + repo=repo_name, ) ) elif request.name: @@ -397,13 +416,23 @@ def launch( progress("Reusing existing worktree.") else: if worktree_exists.returncode != 0: - progress("Creating worktree with submodules...") - with _timed("worktree creation", progress): - backend.run( - f"cd {workspace}/pytorch && {workspace}/.venv/bin/python tools/create_worktree.py create pytorch " - f"--parent-dir {job_dir} --commit HEAD", - stream=request.verbose, - ) + if profile.uses_custom_worktree_tool: + progress("Creating worktree with submodules...") + with _timed("worktree creation", progress): + backend.run( + f"cd {workspace}/pytorch && {workspace}/.venv/bin/python tools/create_worktree.py create pytorch " + f"--parent-dir {job_dir} --commit HEAD", + stream=request.verbose, + ) + else: + progress(f"Creating {profile.name} worktree...") + with _timed("worktree creation", progress): + branch = f"ptq-{job_id}" + backend.run( + f"cd {workspace}/{profile.dir_name} && " + f"git worktree add -b {branch} {worktree_path} HEAD", + stream=request.verbose, + ) if venv_exists.returncode != 0: progress("Creating per-job venv...") from ptq.config import load_config @@ -415,16 +444,19 @@ def launch( verbose=request.verbose, progress=progress, build_env_prefix=load_config().build_env_prefix(), + repo=repo_name, ) if is_adhoc: system_prompt = build_adhoc_prompt( - request.message or DEFAULT_MESSAGE, job_id, workspace + request.message or DEFAULT_MESSAGE, job_id, workspace, repo=repo_name ) else: assert issue_number is not None assert issue_data is not None - system_prompt = build_system_prompt(issue_data, issue_number, job_id, workspace) + system_prompt = build_system_prompt( + issue_data, issue_number, job_id, workspace, repo=repo_name + ) if existing: prior_context = _build_prior_context(backend, job_dir, run_number) @@ -447,7 +479,7 @@ def launch( if not is_adhoc: 
assert issue_data is not None - repro = extract_repro_script(issue_data) + repro = extract_repro_script(issue_data, import_hint=profile.repro_import_hint) if repro: with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=False) as f: f.write(repro) diff --git a/ptq/application/worktree_service.py b/ptq/application/worktree_service.py index dd2c250..d3fde9b 100644 --- a/ptq/application/worktree_service.py +++ b/ptq/application/worktree_service.py @@ -7,6 +7,7 @@ from subprocess import CompletedProcess from ptq.domain.models import PtqError +from ptq.repo_profiles import get_profile from ptq.ssh import Backend log = logging.getLogger("ptq.worktree") @@ -34,11 +35,81 @@ def _chain_result( return next_step() -def validate_workspace(backend: Backend, workspace: str) -> None: - result = backend.run(f"test -d {workspace}/pytorch/.git", check=False) +def _setup_lightweight_venv( + backend: Backend, + job_dir: str, + worktree_path: str, + *, + verbose: bool = False, + progress: ProgressCallback = _noop_progress, + repo: str = "torchtitan", +) -> None: + """Set up venv for lightweight (pure-Python) repos like torchtitan. + + Clones the base workspace venv (which has torch built), then does + an editable install of the target repo. 
+ """ + profile = get_profile(repo) + workspace = backend.workspace + base_venv = f"{workspace}/.venv" + + progress("Cloning base venv...") + with _timed("venv clone", progress): + for cp_flags in ("-al", "-a"): + if ( + backend.run( + f"cp {cp_flags} {base_venv} {job_dir}/.venv", check=False + ).returncode + == 0 + ): + break + backend.run(f"rm -rf {job_dir}/.venv", check=False) + else: + progress("Venv clone failed, creating fresh venv...") + backend.run(f"cd {job_dir} && uv venv --python 3.12") + + job_python = f"{job_dir}/.venv/bin/python" + + # Rewrite venv paths to point to job-local copy + job_venv = f"{job_dir}/.venv" + + def _last_line(cmd: str) -> str: + lines = backend.run(cmd, check=False).stdout.strip().splitlines() + return lines[-1] if lines else "" + + resolved_venv = _last_line(f"realpath {job_venv}") or job_venv + backend.run( + f'sed -i "s|{base_venv}|{resolved_venv}|g" {job_venv}/bin/activate {job_venv}/bin/activate.csh {job_venv}/bin/activate.fish {job_venv}/bin/activate.nu 2>/dev/null', + check=False, + ) + backend.run( + f"""sed -i "s|^VIRTUAL_ENV=.*|VIRTUAL_ENV='{resolved_venv}'|" {job_venv}/bin/activate 2>/dev/null""", + check=False, + ) + backend.run( + f'sed -i "1s|#!{base_venv}/bin/python[0-9.]*|#!{resolved_venv}/bin/python|" {job_venv}/bin/* 2>/dev/null', + check=False, + ) + + progress(f"Editable install ({profile.name})...") + with _timed("editable install", progress): + result = backend.run( + f"cd {worktree_path} && uv pip install --python {job_python} -e .", + check=False, + stream=verbose, + ) + if result.returncode != 0: + progress("Editable install failed — agent can install manually.") + else: + progress("Editable install complete.") + + +def validate_workspace(backend: Backend, workspace: str, repo: str = "pytorch") -> None: + profile = get_profile(repo) + result = backend.run(f"test -d {workspace}/{profile.dir_name}/.git", check=False) if result.returncode != 0: raise PtqError( - f"Workspace broken: {workspace}/pytorch/.git 
missing. Re-run: ptq setup" + f"Workspace broken: {workspace}/{profile.dir_name}/.git missing. Re-run: ptq setup" ) @@ -49,6 +120,7 @@ def _try_clone_base_venv( *, verbose: bool = False, progress: ProgressCallback = _noop_progress, + repo: str = "pytorch", ) -> bool: """Clone base workspace venv + source artifacts instead of rebuilding. @@ -205,9 +277,20 @@ def _setup_job_venv( verbose: bool = False, progress: ProgressCallback = _noop_progress, build_env_prefix: str = "USE_NINJA=1 ", + repo: str = "pytorch", ) -> None: + profile = get_profile(repo) + + if not profile.needs_cpp_build: + _setup_lightweight_venv( + backend, job_dir, worktree_path, + verbose=verbose, progress=progress, repo=repo, + ) + return + if not _try_clone_base_venv( - backend, job_dir, worktree_path, verbose=verbose, progress=progress + backend, job_dir, worktree_path, verbose=verbose, progress=progress, + repo=repo, ): log.info("slow-path: full editable install for %s", job_dir) with _timed("venv creation", progress): @@ -260,15 +343,17 @@ def provision_worktree( *, verbose: bool = False, progress: ProgressCallback | None = None, + repo: str = "pytorch", ) -> bool: """Create a git worktree and per-worktree venv if they don't already exist. Returns True if a new worktree was created, False if reusing existing. 
""" cb = progress or _noop_progress + profile = get_profile(repo) workspace = backend.workspace job_dir = f"{workspace}/jobs/{job_id}" - worktree_path = f"{job_dir}/pytorch" + worktree_path = f"{job_dir}/{profile.dir_name}" backend.run(f"mkdir -p {job_dir}") @@ -281,13 +366,23 @@ def provision_worktree( return False if worktree_exists.returncode != 0: - cb("Creating worktree with submodules...") - with _timed("worktree creation", cb): - backend.run( - f"cd {workspace}/pytorch && {workspace}/.venv/bin/python tools/create_worktree.py create pytorch " - f"--parent-dir {job_dir} --commit HEAD", - stream=verbose, - ) + if profile.uses_custom_worktree_tool: + cb("Creating worktree with submodules...") + with _timed("worktree creation", cb): + backend.run( + f"cd {workspace}/pytorch && {workspace}/.venv/bin/python tools/create_worktree.py create pytorch " + f"--parent-dir {job_dir} --commit HEAD", + stream=verbose, + ) + else: + cb(f"Creating {profile.name} worktree...") + with _timed("worktree creation", cb): + branch = f"ptq-{job_id}" + backend.run( + f"cd {workspace}/{profile.dir_name} && " + f"git worktree add -b {branch} {worktree_path} HEAD", + stream=verbose, + ) if venv_exists.returncode != 0: cb("Creating per-job venv...") @@ -300,6 +395,7 @@ def provision_worktree( verbose=verbose, progress=cb, build_env_prefix=load_config().build_env_prefix(), + repo=repo, ) return worktree_exists.returncode != 0 diff --git a/ptq/cli.py b/ptq/cli.py index b5bc041..b099015 100644 --- a/ptq/cli.py +++ b/ptq/cli.py @@ -214,8 +214,12 @@ def run( str | None, typer.Option("--name", "-n", help="Display name for this job."), ] = None, + repo: Annotated[ + str, + typer.Option("--repo", help="Target repository name (see config.toml [repos])."), + ] = "pytorch", ) -> None: - """Launch an AI agent to investigate a PyTorch issue or run an adhoc task. + """Launch an AI agent to investigate a GitHub issue or run an adhoc task. 
Provide --issue for GitHub issue investigation, or --message for a freeform task. Re-run an existing job by passing its JOB_ID (or issue number) as a positional arg. @@ -254,19 +258,20 @@ def run( else: message = selected_preset.body - repo = _repo() + job_repo = _repo() resolved_job_id: str | None = None if job_id is not None: try: - resolved_job_id = repo.resolve_id(job_id) + resolved_job_id = job_repo.resolve_id(job_id) except PtqError as e: _handle_error(e) - job = repo.get(resolved_job_id) + job = job_repo.get(resolved_job_id) issue = issue or job.issue machine = machine or job.machine local = local or job.local workspace = workspace or job.workspace + repo = job.repo if agent is None: agent = job.agent @@ -280,10 +285,14 @@ def run( model = cfg.effective_model(agent, model) max_turns = max_turns or cfg.default_max_turns + from ptq.repo_profiles import get_profile + + profile = get_profile(repo) + issue_data = None if issue is not None: - console.print(f"Fetching issue #{issue}...") - issue_data = fetch_issue(issue) + console.print(f"Fetching {profile.github_repo}#{issue}...") + issue_data = fetch_issue(issue, repo=profile.github_repo) console.print(f"[bold]{issue_data['title']}[/bold]") backend = create_backend(machine=machine, local=local, workspace=workspace) @@ -300,17 +309,18 @@ def run( existing_job_id=resolved_job_id, verbose=verbose, name=name, + repo=repo, ) try: launched_id = run_service.launch( - repo, backend, request, on_progress=lambda msg: console.print(msg) + job_repo, backend, request, on_progress=lambda msg: console.print(msg) ) except PtqError as e: _handle_error(e) if follow: - job = repo.get(launched_id) + job = job_repo.get(launched_id) agent_impl = get_agent(job.agent) log_file = f"{backend.workspace}/jobs/{launched_id}/{agent_impl.log_filename(job.runs)}" _follow_logs(backend, log_file, agent_impl, launched_id) @@ -780,12 +790,24 @@ def web( raise typer.Exit(1) # noqa: B904 console.print(f"Starting ptq web at http://{host}:{port}") - 
uvicorn.run( - create_app(debug=debug), - host=host, - port=port, - log_level="debug" if debug else "info", - ) + if debug: + # Use string import so uvicorn can enable auto-reload + uvicorn.run( + "ptq.web.app:create_debug_app", + factory=True, + host=host, + port=port, + log_level="debug", + reload=True, + reload_dirs=[str(Path(__file__).resolve().parent)], + ) + else: + uvicorn.run( + create_app(debug=False), + host=host, + port=port, + log_level="info", + ) @app.command() @@ -841,8 +863,12 @@ def worktree( bool, typer.Option("--verbose", "-v", help="Stream build output and show timings."), ] = False, + repo: Annotated[ + str, + typer.Option("--repo", help="Target repository name (see config.toml [repos])."), + ] = "pytorch", ) -> None: - """Create a named PyTorch worktree with a ready-to-use venv. + """Create a named worktree with a ready-to-use venv. Sets up a git worktree and per-worktree venv without launching an agent. Use `ptq run ` later to launch an agent in this worktree. @@ -858,10 +884,12 @@ def worktree( from ptq.application.worktree_service import provision_worktree, validate_workspace from ptq.domain.policies import make_job_id from ptq.infrastructure.backends import create_backend + from ptq.repo_profiles import get_profile from ptq.workspace import deploy_scripts - repo = _repo() - existing = repo.find_by_name(name) + profile = get_profile(repo) + job_repo = _repo() + existing = job_repo.find_by_name(name) if existing: console.print( f"[yellow]Worktree '{name}' already exists as {existing}[/yellow]" @@ -870,12 +898,12 @@ def worktree( backend = create_backend(machine=machine, local=local, workspace=workspace) try: - validate_workspace(backend, backend.workspace) + validate_workspace(backend, backend.workspace, repo=repo) except PtqError as e: _handle_error(e) - job_id = make_job_id(message=name) - repo.save( + job_id = make_job_id(message=name, repo=repo) + job_repo.save( JobRecord( job_id=job_id, runs=0, @@ -885,6 +913,7 @@ def worktree( 
local=local, workspace=backend.workspace, name=name, + repo=repo, ) ) @@ -895,21 +924,23 @@ def worktree( job_id, verbose=verbose, progress=lambda msg: console.print(msg), + repo=repo, ) except PtqError as e: _handle_error(e) ws = backend.workspace job_dir = f"{ws}/jobs/{job_id}" + dir_name = profile.dir_name console.print() console.print(f"[bold green]Worktree '{name}' ready.[/bold green]") console.print(f" Job ID: {job_id}") - console.print(f" Worktree: {job_dir}/pytorch") + console.print(f" Worktree: {job_dir}/{dir_name}") if local: - console.print(f"\n cd {job_dir}/pytorch && source ../.venv/bin/activate") + console.print(f"\n cd {job_dir}/{dir_name} && source ../.venv/bin/activate") else: console.print( - f"\n ssh -t {machine} 'cd {job_dir}/pytorch && " + f"\n ssh -t {machine} 'cd {job_dir}/{dir_name} && " f"source ../.venv/bin/activate && exec $SHELL'" ) console.print(f"\n To launch an agent: ptq run {name} -m 'your task'") diff --git a/ptq/web/app.py b/ptq/web/app.py index d4ce461..8d1e395 100644 --- a/ptq/web/app.py +++ b/ptq/web/app.py @@ -21,6 +21,11 @@ def setup_logging(*, debug: bool = False) -> None: logging.getLogger("uvicorn.access").setLevel(logging.WARNING) +def create_debug_app() -> FastAPI: + """Factory for uvicorn --reload (no args).""" + return create_app(debug=True) + + def create_app(*, debug: bool = False) -> FastAPI: setup_logging(debug=debug) app = FastAPI(title="ptq — PyTorch Job Queue") diff --git a/ptq/workspace.py b/ptq/workspace.py index 5dd2c27..8af7150 100644 --- a/ptq/workspace.py +++ b/ptq/workspace.py @@ -6,6 +6,7 @@ from rich.console import Console +from ptq.repo_profiles import RepoProfile, available_repos, get_profile from ptq.ssh import Backend, RemoteBackend _CUDA_VERSION_RE = re.compile(r"CUDA Version:\s*(\d+)\.(\d+)") @@ -68,7 +69,10 @@ def setup_workspace( backend.run(f"mkdir -p {workspace}/jobs {workspace}/scripts") _ensure_ccache_config(backend) - _clone_pytorch(backend, workspace) + + for name in available_repos(): + 
profile = get_profile(name) + _clone_repo(backend, workspace, profile) console.print("Installing Python 3.12 via uv...") backend.run("uv python install 3.12", check=False) @@ -97,29 +101,33 @@ def setup_workspace( console.print("[bold green]Workspace setup complete.[/bold green]") -def _clone_pytorch(backend: Backend, workspace: str) -> None: - existing = backend.run(f"test -d {workspace}/pytorch/.git", check=False) +def _clone_repo(backend: Backend, workspace: str, profile: RepoProfile) -> None: + repo_dir = f"{workspace}/{profile.dir_name}" + existing = backend.run(f"test -d {repo_dir}/.git", check=False) + if existing.returncode == 0: - console.print("PyTorch checkout already exists, resetting to latest...") - backend.run( - f"cd {workspace}/pytorch && git fetch origin && git reset --hard origin/main", - stream=True, - ) + console.print(f"{profile.name} checkout already exists, resetting to latest...") backend.run( - f"cd {workspace}/pytorch && git submodule sync && git submodule update --init --recursive --progress", + f"cd {repo_dir} && git fetch origin && git reset --hard origin/main", stream=True, ) + if profile.uses_custom_worktree_tool: + backend.run( + f"cd {repo_dir} && git submodule sync && git submodule update --init --recursive --progress", + stream=True, + ) return - console.print("Cloning pytorch (full clone with submodules)...") + console.print(f"Cloning {profile.name}...") backend.run( - f"git clone --progress https://github.com/pytorch/pytorch.git {workspace}/pytorch", - stream=True, - ) - backend.run( - f"cd {workspace}/pytorch && git submodule update --init --recursive --progress", + f"git clone --progress {profile.clone_url} {repo_dir}", stream=True, ) + if profile.uses_custom_worktree_tool: + backend.run( + f"cd {repo_dir} && git submodule update --init --recursive --progress", + stream=True, + ) def _install_triton(backend: Backend, workspace: str) -> CompletedProcess[str]: diff --git a/tests/test_torchtitan_launch.py 
b/tests/test_torchtitan_launch.py new file mode 100644 index 0000000..6f0d6a6 --- /dev/null +++ b/tests/test_torchtitan_launch.py @@ -0,0 +1,101 @@ +"""Tests for torchtitan launch: worktree creation, venv setup, repo persistence.""" + +from __future__ import annotations + +from subprocess import CompletedProcess +from unittest.mock import MagicMock, patch + +from ptq.application.run_service import launch +from ptq.domain.models import RunRequest +from ptq.ssh import LocalBackend + + +def _ok(cmd="", **kwargs): + return CompletedProcess(args="", returncode=0, stdout="", stderr="") + + +def _mock_backend(backend): + def run_side_effect(cmd: str, check: bool = True, **kw): + if "test -d" in cmd or "test -f" in cmd: + return CompletedProcess(args="", returncode=1, stdout="", stderr="") + return _ok() + + backend.run = MagicMock(side_effect=run_side_effect) + backend.copy_to = MagicMock() + backend.launch_background = MagicMock(return_value=12345) + backend.tail_log = MagicMock() + + +class TestLaunchTorchtitan: + @patch("ptq.application.run_service.deploy_scripts") + def test_torchtitan_uses_git_worktree(self, _deploy, repo, frozen_date): + """Torchtitan should use standard git worktree, not create_worktree.py.""" + backend = LocalBackend(workspace="/tmp/ws") + _mock_backend(backend) + + launch( + repo, + backend, + RunRequest(message="hello", local=True, follow=False, repo="torchtitan"), + ) + + run_cmds = [ + call.args[0] + for call in backend.run.call_args_list + if isinstance(call.args[0], str) + ] + assert not any("create_worktree.py" in c for c in run_cmds) + assert any("git worktree add" in c for c in run_cmds) + + @patch("ptq.application.run_service.deploy_scripts") + def test_torchtitan_repo_persisted(self, _deploy, repo, frozen_date): + backend = LocalBackend(workspace="/tmp/ws") + _mock_backend(backend) + + job_id = launch( + repo, + backend, + RunRequest(message="hello", local=True, follow=False, repo="torchtitan"), + ) + + job = repo.get(job_id) + assert 
job.repo == "torchtitan" + + @patch("ptq.application.run_service.deploy_scripts") + def test_torchtitan_worktree_path(self, _deploy, repo, frozen_date): + """Worktree should be under torchtitan/, not pytorch/.""" + backend = LocalBackend(workspace="/tmp/ws") + _mock_backend(backend) + + job_id = launch( + repo, + backend, + RunRequest(message="hello", local=True, follow=False, repo="torchtitan"), + ) + + run_cmds = [ + call.args[0] + for call in backend.run.call_args_list + if isinstance(call.args[0], str) + ] + worktree_cmds = [c for c in run_cmds if "git worktree add" in c] + assert any(f"/jobs/{job_id}/torchtitan" in c for c in worktree_cmds) + + @patch("ptq.application.run_service.deploy_scripts") + def test_pytorch_still_uses_create_worktree(self, _deploy, repo, frozen_date): + """Pytorch should still use create_worktree.py.""" + backend = LocalBackend(workspace="/tmp/ws") + _mock_backend(backend) + + launch( + repo, + backend, + RunRequest(message="hello", local=True, follow=False, repo="pytorch"), + ) + + run_cmds = [ + call.args[0] + for call in backend.run.call_args_list + if isinstance(call.args[0], str) + ] + assert any("create_worktree.py" in c for c in run_cmds) diff --git a/tests/test_worktree.py b/tests/test_worktree.py index c6444e9..e5b9dc5 100644 --- a/tests/test_worktree.py +++ b/tests/test_worktree.py @@ -110,19 +110,27 @@ def test_prints_enter_command_local(self, tmp_path, frozen_date): mock_backend.workspace = "/tmp/ws" mock_backend.run = MagicMock(return_value=_ok()) + from ptq.repo_profiles import _DEFAULT_PROFILES, _loaded_profiles + with ( patch("ptq.cli._repo", return_value=repo), patch( "ptq.infrastructure.backends.LocalBackend", return_value=mock_backend ), patch("ptq.config.load_config") as mock_cfg, + patch( + "ptq.repo_profiles._loaded_profiles", + return_value=_DEFAULT_PROFILES, + ), ): mock_cfg.return_value.build_env_prefix.return_value = "USE_NINJA=1 " result = runner.invoke(app, ["worktree", "my-fix", "--local"]) assert 
result.exit_code == 0, result.output - assert "source ../.venv/bin/activate" in result.output - assert "ptq run my-fix" in result.output + # Rich may wrap long paths across lines, so normalize whitespace + flat = " ".join(result.output.split()) + assert "source ../.venv/bin/activate" in flat + assert "ptq run my-fix" in flat def test_prints_ssh_command_remote(self, tmp_path, frozen_date): repo = _make_repo(tmp_path) @@ -130,12 +138,18 @@ def test_prints_ssh_command_remote(self, tmp_path, frozen_date): mock_backend.workspace = "/tmp/ws" mock_backend.run = MagicMock(return_value=_ok()) + from ptq.repo_profiles import _DEFAULT_PROFILES + with ( patch("ptq.cli._repo", return_value=repo), patch( "ptq.infrastructure.backends.RemoteBackend", return_value=mock_backend ), patch("ptq.config.load_config") as mock_cfg, + patch( + "ptq.repo_profiles.get_profile", + side_effect=lambda name: _DEFAULT_PROFILES[name], + ), ): mock_cfg.return_value.build_env_prefix.return_value = "USE_NINJA=1 " result = runner.invoke(app, ["worktree", "my-fix", "--machine", "gpu-dev"]) @@ -150,12 +164,18 @@ def test_no_agent_launched(self, tmp_path, frozen_date): mock_backend.run = MagicMock(return_value=_ok()) mock_backend.launch_background = MagicMock() + from ptq.repo_profiles import _DEFAULT_PROFILES, _loaded_profiles + with ( patch("ptq.cli._repo", return_value=repo), patch( "ptq.infrastructure.backends.LocalBackend", return_value=mock_backend ), patch("ptq.config.load_config") as mock_cfg, + patch( + "ptq.repo_profiles._loaded_profiles", + return_value=_DEFAULT_PROFILES, + ), ): mock_cfg.return_value.build_env_prefix.return_value = "USE_NINJA=1 " result = runner.invoke(app, ["worktree", "my-fix"]) @@ -186,6 +206,8 @@ def run_side(cmd: str, check: bool = True, **kw) -> CompletedProcess[str]: mock_backend.run = MagicMock(side_effect=run_side) + from ptq.repo_profiles import _DEFAULT_PROFILES + with ( patch("ptq.cli._repo", return_value=repo), patch.object(repo, "save", side_effect=tracked_save), 
@@ -193,6 +215,10 @@ def run_side(cmd: str, check: bool = True, **kw) -> CompletedProcess[str]: "ptq.infrastructure.backends.LocalBackend", return_value=mock_backend ), patch("ptq.config.load_config") as mock_cfg, + patch( + "ptq.repo_profiles._loaded_profiles", + return_value=_DEFAULT_PROFILES, + ), ): mock_cfg.return_value.build_env_prefix.return_value = "USE_NINJA=1 " result = runner.invoke(app, ["worktree", "my-fix"]) @@ -205,7 +231,10 @@ def run_side(cmd: str, check: bool = True, **kw) -> CompletedProcess[str]: class TestWorktreeReuse: @patch("ptq.application.run_service.deploy_scripts") - def test_run_adopts_precreated_worktree(self, _deploy, repo, frozen_date): + @patch("ptq.repo_profiles._loaded_profiles") + def test_run_adopts_precreated_worktree(self, mock_profiles, _deploy, repo, frozen_date): + from ptq.repo_profiles import _DEFAULT_PROFILES + mock_profiles.return_value = _DEFAULT_PROFILES backend = LocalBackend(workspace="/tmp/ws") _mock_backend(backend, worktree_exists=True) @@ -238,7 +267,10 @@ def test_run_adopts_precreated_worktree(self, _deploy, repo, frozen_date): backend.launch_background.assert_called_once() @patch("ptq.application.run_service.deploy_scripts") - def test_run_reuses_existing_worktree_no_rebuild(self, _deploy, repo, frozen_date): + @patch("ptq.repo_profiles._loaded_profiles") + def test_run_reuses_existing_worktree_no_rebuild(self, mock_profiles, _deploy, repo, frozen_date): + from ptq.repo_profiles import _DEFAULT_PROFILES + mock_profiles.return_value = _DEFAULT_PROFILES backend = LocalBackend(workspace="/tmp/ws") _mock_backend(backend, worktree_exists=True) From 6095554a9fcd86408f27c9f59b9daff3a7dd2429 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 11:59:38 +0200 Subject: [PATCH 4/8] Update web UI for multi-repo support - routes.py: pass profile objects to template for dynamic repo dropdown, repo column in job list, merge-base diff, dynamic issue links - templates: iterate repos from config, repo column, dynamic 
issue links --- ptq/web/routes.py | 44 ++++++++++++++++++++++--- ptq/web/templates/job_detail.html | 2 +- ptq/web/templates/job_list.html | 1 + ptq/web/templates/job_new.html | 8 +++++ ptq/web/templates/partials/job_row.html | 1 + 5 files changed, 51 insertions(+), 5 deletions(-) diff --git a/ptq/web/routes.py b/ptq/web/routes.py index 51fee91..3ea6898 100644 --- a/ptq/web/routes.py +++ b/ptq/web/routes.py @@ -26,6 +26,7 @@ from ptq.domain.models import PtqError, RebaseState, RunRequest from ptq.infrastructure.backends import backend_for_job from ptq.infrastructure.job_repository import JobRepository +from ptq.repo_profiles import available_repos, get_profile from ptq.web.deps import get_job_status_with_finalize, templates log = logging.getLogger("ptq.web") @@ -99,10 +100,15 @@ def _append_progress(msg: str) -> None: try: issue_data = None + repo_name = str(params.get("repo", "pytorch")) + if issue_number is not None: from ptq.issue import fetch_issue - issue_data = await asyncio.to_thread(fetch_issue, issue_number) + profile = get_profile(repo_name) + issue_data = await asyncio.to_thread( + fetch_issue, issue_number, repo=profile.github_repo + ) run_request = RunRequest( issue_data=issue_data, @@ -115,6 +121,7 @@ def _append_progress(msg: str) -> None: max_turns=int(params["max_turns"]), agent_type=str(params["agent"]), name=job_name, + repo=repo_name, ) log.info( @@ -247,6 +254,7 @@ async def job_list(request: Request, status_filter: str = "all"): { "id": job_id, "issue": job.issue, + "repo": job.repo, "agent": job.agent, "target": job.target, "runs": job.runs, @@ -270,11 +278,13 @@ def _form_context(error: str | None = None) -> dict: available = am.available or cached_models(name) agent_models[name] = {"available": available, "default": am.default} machines = list(dict.fromkeys(cfg.machines + discover_ssh_hosts())) + return { "agents": list(AGENTS.keys()), "machines": machines, "agent_models": agent_models, "prompt_presets": _prompt_presets(cfg), + "repos": 
[get_profile(r) for r in available_repos()], "defaults": { "agent": cfg.default_agent, "model": cfg.default_model, @@ -342,6 +352,7 @@ async def job_create( model: str = Form("opus"), max_turns: int = Form(100), name: str = Form(""), + repo: str = Form("pytorch"), ): if task_type == "issue" and not issue.strip(): return templates.TemplateResponse( @@ -378,6 +389,7 @@ async def job_create( "model": model, "max_turns": max_turns, "name": name.strip(), + "repo": repo.strip(), } ) return RedirectResponse(url=f"/jobs/launching/{launch_id}", status_code=303) @@ -451,6 +463,8 @@ async def job_detail(request: Request, job_id: str): get_pr_state, backend_for_job(job), job.pr_url ) + profile = get_profile(job.repo) + rb = job.rebase_info if rb.state == RebaseState.SUCCEEDED: repo.save_rebase(job_id, {}) @@ -481,6 +495,9 @@ async def job_detail(request: Request, job_id: str): "rebase_attempts": rb.attempts, "rebase_error": rb.error, "prompt_presets": _prompt_presets(cfg), + "github_repo": profile.github_repo, + "repo_name": job.repo, + "dir_name": profile.dir_name, }, ) @@ -529,7 +546,10 @@ async def job_rerun( if job.issue is not None: from ptq.issue import fetch_issue - issue_data = await asyncio.to_thread(fetch_issue, job.issue) + profile = get_profile(job.repo) + issue_data = await asyncio.to_thread( + fetch_issue, job.issue, repo=profile.github_repo + ) run_request = RunRequest( issue_data=issue_data, @@ -542,6 +562,7 @@ async def job_rerun( max_turns=cfg.default_max_turns, agent_type=agent_type, existing_job_id=job_id, + repo=job.repo, ) await asyncio.to_thread(run_service.launch, repo, backend, run_request) @@ -733,13 +754,28 @@ async def job_diff(job_id: str): with _catch_error(): job = repo.get(job_id) backend = backend_for_job(job) - worktree = f"{backend.workspace}/jobs/{job_id}/pytorch" + profile = get_profile(job.repo) + worktree = f"{backend.workspace}/jobs/{job_id}/{profile.dir_name}" + + # Show all changes: committed (branch vs merge-base with main) + 
uncommitted + # Use a single shell command to avoid multiple SSH round-trips result = backend.run( - f"git -C {worktree} -c color.diff=never diff --no-color --no-ext-diff", + f"cd {worktree} && " + f"mb=$(git merge-base HEAD origin/main 2>/dev/null) && " + f"git -c color.diff=never diff --no-color --no-ext-diff $mb", check=False, ) content = result.stdout.strip() if result.returncode == 0 else None if not content: + # merge-base failed or no committed changes — try plain diff (uncommitted only) + result = backend.run( + f"cd {worktree} && git -c color.diff=never diff --no-color --no-ext-diff", + check=False, + ) + content = result.stdout.strip() if result.returncode == 0 else None + if not content: + log.warning("Empty diff for job %s (worktree=%s, stderr=%s)", + job_id, worktree, result.stderr.strip() if result.stderr else "") return PlainTextResponse("") return PlainTextResponse(content) diff --git a/ptq/web/templates/job_detail.html b/ptq/web/templates/job_detail.html index e140c7c..719222c 100644 --- a/ptq/web/templates/job_detail.html +++ b/ptq/web/templates/job_detail.html @@ -15,7 +15,7 @@

- {% if issue %}#{{ issue }}{% else %}adhoc{% endif %} + {% if issue %}{{ github_repo }}#{{ issue }}{% else %}adhoc{% endif %} · {{ agent_name }} · {{ target }} · run {{ runs }} diff --git a/ptq/web/templates/job_list.html b/ptq/web/templates/job_list.html index e7804cd..c27613c 100644 --- a/ptq/web/templates/job_list.html +++ b/ptq/web/templates/job_list.html @@ -17,6 +17,7 @@

Jobs

Status Job ID Name + Repo Issue Agent Runs diff --git a/ptq/web/templates/job_new.html b/ptq/web/templates/job_new.html index b243d9a..8bf127f 100644 --- a/ptq/web/templates/job_new.html +++ b/ptq/web/templates/job_new.html @@ -10,6 +10,14 @@

Launch New Job

Task + + diff --git a/ptq/web/templates/partials/job_row.html b/ptq/web/templates/partials/job_row.html index 6eca86e..6ce267d 100644 --- a/ptq/web/templates/partials/job_row.html +++ b/ptq/web/templates/partials/job_row.html @@ -5,6 +5,7 @@ {{ job.id }} {% if job.name %}{{ job.name }}{% else %}-{% endif %} + {{ job.repo }} {% if job.issue %}#{{ job.issue }}{% else %}adhoc{% endif %} {{ job.agent }} {{ job.runs }} From edf160f519a73c36fb19391be4d6f19218b69b2e Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 11:59:43 +0200 Subject: [PATCH 5/8] Document how to add a new repo profile --- README.md | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index c48d575..23573d2 100644 --- a/README.md +++ b/README.md @@ -245,6 +245,32 @@ Removes job directories and prunes git worktrees. | `--keep` | clean | 0 | Number of recent jobs to keep | | `--log` | peek | 0 | Number of log lines to show | +## Adding a new repo + +1. Add a `[repos.<name>]` section to `~/.ptq/config.toml`: + +```toml +[repos.torchtitan] +github_repo = "pytorch/torchtitan" +clone_url = "https://github.com/pytorch/torchtitan.git" +dir_name = "torchtitan" +smoke_test_import = "torchtitan" +repro_import_hint = "import torchtitan" +``` + +2. Create prompt templates in `prompts/`: + - `prompts/investigate_<name>.md` — issue investigation prompt + - `prompts/adhoc_<name>.md` — freeform task prompt + +The prompt templates are where the real work is — they teach the agent about the repo's build system, directory layout, debugging tools, and testing conventions. See the existing `investigate.md` and `investigate_torchtitan.md` for examples. 
+ +Optional profile fields (all default to `false`/`null`): +| Field | Description | +|-------|-------------| +| `uses_custom_worktree_tool` | Use `tools/create_worktree.py` instead of `git worktree add` | +| `needs_cpp_build` | Run C++ rebuild after worktree creation | +| `lint_cmd` | Lint command to run before PRs | + ## Project layout ``` @@ -277,8 +303,10 @@ pt_job_queue/ │ ├── static/style.css # Dark-theme styles │ └── templates/ # Jinja2 templates (Pico CSS + htmx) ├── prompts/ -│ ├── investigate.md # Issue investigation prompt -│ └── adhoc.md # Freeform task prompt +│ ├── investigate.md # PyTorch issue investigation prompt +│ ├── adhoc.md # PyTorch freeform task prompt +│ ├── investigate_torchtitan.md # TorchTitan issue investigation prompt +│ └── adhoc_torchtitan.md # TorchTitan freeform task prompt └── scripts/ └── rebuild.sh ``` From c2311b1202d20c3d69a163f922d3dbe73c9d6372 Mon Sep 17 00:00:00 2001 From: Simon Fan Date: Thu, 9 Apr 2026 14:16:14 +0200 Subject: [PATCH 6/8] Reframe UX: torchtitan as an add-on repo, not a separate profile --- prompts/adhoc.md | 1 + prompts/adhoc_torchtitan.md | 4 ++++ prompts/investigate.md | 1 + prompts/investigate_torchtitan.md | 8 +++++++- ptq/cli.py | 6 +++--- ptq/web/templates/job_new.html | 2 +- 6 files changed, 17 insertions(+), 5 deletions(-) diff --git a/prompts/adhoc.md b/prompts/adhoc.md index e216ab3..a54c2ba 100644 --- a/prompts/adhoc.md +++ b/prompts/adhoc.md @@ -11,6 +11,7 @@ You are performing a task on a PyTorch codebase. 
- **PyTorch source** (edit here): `{workspace}/jobs/{job_id}/pytorch/` - **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/` - **Rebuild script** (after C++ changes): `bash {workspace}/scripts/rebuild.sh {workspace}/jobs/{job_id}/pytorch` +- **Add-on repos** (available for cross-referencing): `{workspace}/torchtitan/` ## Task diff --git a/prompts/adhoc_torchtitan.md b/prompts/adhoc_torchtitan.md index 1393b33..0a497db 100644 --- a/prompts/adhoc_torchtitan.md +++ b/prompts/adhoc_torchtitan.md @@ -58,5 +58,9 @@ Write these files to `{workspace}/jobs/{job_id}/`: ``` cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff ``` +If you also edited PyTorch source, generate a separate diff: +``` +cd {workspace}/pytorch && git diff > {workspace}/jobs/{job_id}/pytorch-fix.diff +``` IMPORTANT: Always generate report.md before finishing. Generate fix.diff if you made any code changes. diff --git a/prompts/investigate.md b/prompts/investigate.md index cd2d127..11d9b42 100644 --- a/prompts/investigate.md +++ b/prompts/investigate.md @@ -11,6 +11,7 @@ You are investigating a PyTorch bug. Your goal is to reproduce, understand, and - **PyTorch source** (edit here): `{workspace}/jobs/{job_id}/pytorch/` - **Job artifacts** (write output here): `{workspace}/jobs/{job_id}/` - **Rebuild script** (after C++ changes): `bash {workspace}/scripts/rebuild.sh {workspace}/jobs/{job_id}/pytorch` +- **Add-on repos** (available for cross-referencing): `{workspace}/torchtitan/` ## Issue Context diff --git a/prompts/investigate_torchtitan.md b/prompts/investigate_torchtitan.md index 7d8338d..073b9c6 100644 --- a/prompts/investigate_torchtitan.md +++ b/prompts/investigate_torchtitan.md @@ -70,7 +70,9 @@ CUDA_LAUNCH_BLOCKING=1 PYTORCH_NO_CUDA_MEMORY_CACHING=1 compute-sanitizer --tool ### 3. Fix - Edit source files in `{workspace}/jobs/{job_id}/torchtitan/` to fix the bug. 
-- If the root cause is in PyTorch, edit files in `{workspace}/pytorch/` instead (changes are picked up after C++ rebuild if needed). +- If the root cause is in PyTorch, edit files in `{workspace}/pytorch/` instead. + - **Python-only changes**: picked up automatically. + - **C++ changes**: rebuild with `bash {workspace}/scripts/rebuild.sh {workspace}/pytorch` - Make minimal, targeted changes. ### 4. Test @@ -91,5 +93,9 @@ Write these files to `{workspace}/jobs/{job_id}/`: ``` cd {workspace}/jobs/{job_id}/torchtitan && git diff > {workspace}/jobs/{job_id}/fix.diff ``` +If you also edited PyTorch source, generate a separate diff: +``` +cd {workspace}/pytorch && git diff > {workspace}/jobs/{job_id}/pytorch-fix.diff +``` IMPORTANT: Always generate both report.md and fix.diff before finishing. diff --git a/ptq/cli.py b/ptq/cli.py index b099015..33510de 100644 --- a/ptq/cli.py +++ b/ptq/cli.py @@ -13,7 +13,7 @@ from ptq.domain.models import JobRecord, JobStatus, PtqError, RebaseState, RunRequest app = typer.Typer( - name="ptq", help="PyTorch Job Queue — dispatch AI agents to fix PyTorch issues." + name="ptq", help="PyTorch Job Queue — dispatch AI agents to fix issues in PyTorch and add-on repos." ) console = Console() @@ -216,7 +216,7 @@ def run( ] = None, repo: Annotated[ str, - typer.Option("--repo", help="Target repository name (see config.toml [repos])."), + typer.Option("--repo", help="Repo the issue is filed in (default: pytorch)."), ] = "pytorch", ) -> None: """Launch an AI agent to investigate a GitHub issue or run an adhoc task. @@ -865,7 +865,7 @@ def worktree( ] = False, repo: Annotated[ str, - typer.Option("--repo", help="Target repository name (see config.toml [repos])."), + typer.Option("--repo", help="Repo to create a worktree for (default: pytorch)."), ] = "pytorch", ) -> None: """Create a named worktree with a ready-to-use venv. 
diff --git a/ptq/web/templates/job_new.html b/ptq/web/templates/job_new.html index 8bf127f..dbffcfb 100644 --- a/ptq/web/templates/job_new.html +++ b/ptq/web/templates/job_new.html @@ -10,7 +10,7 @@

Launch New Job

Task -