diff --git a/scripts/workflow_status/lib/gh.py b/scripts/workflow_status/lib/gh.py index d258ac5f..3559e2d5 100644 --- a/scripts/workflow_status/lib/gh.py +++ b/scripts/workflow_status/lib/gh.py @@ -118,7 +118,13 @@ def fetch_jobs(run_id: int, config: Config) -> list[dict]: def fetch_log(run_id: int, config: Config, failed_only: bool = True) -> str | None: - """Fetch the log for a run (``--log-failed`` or ``--log``).""" + """Fetch the log for a whole run (``--log-failed`` or ``--log``). + + NOTE: ``gh run view --log[-failed]`` is rejected by the CLI when the run has + too many jobs (``too many API requests needed to fetch logs; try narrowing + down to a specific job with the --job option``). For analysis use + :func:`fetch_job_log` instead, which fetches per-job and avoids the throttle. + """ flag = "--log-failed" if failed_only else "--log" out = run_gh_safe( "run", @@ -142,6 +148,43 @@ def fetch_log(run_id: int, config: Config, failed_only: bool = True) -> str | No return out +def fetch_job_log(job_id: int, config: Config) -> str | None: + """Fetch the log for a single job by ID. + + Tries ``gh run view --job {job_id} --log`` first, which returns the log with + the standard tab-separated (``\\t``) job/step prefixes used + elsewhere in the analyser. Falls back to the raw REST endpoint + ``/repos/{owner}/{repo}/actions/jobs/{job_id}/logs`` if the gh wrapper + returns nothing. + + Per-job fetching is the only reliable path for runs with many jobs (large + matrix builds): the run-level log endpoints are rejected by the gh CLI + with ``too many API requests needed to fetch logs; try narrowing down to a + specific job with the --job option``. 
+ """ + out = run_gh_safe( + "run", + "view", + "-R", + config.repo, + "--job", + str(job_id), + "--log", + config=config, + ) + if out: + return out + + raw = run_gh_safe( + "api", + "-H", + "Accept: application/vnd.github.v3.raw", + f"/repos/{config.repo}/actions/jobs/{job_id}/logs", + config=config, + ) + return raw + + def detect_repo(config: Config) -> str: """Auto-detect the repo via ``gh repo view``.""" out = run_gh_safe("repo", "view", "--json", "nameWithOwner", "-q", ".nameWithOwner", config=config) diff --git a/scripts/workflow_status/lib/logs.py b/scripts/workflow_status/lib/logs.py index 879d4028..eb429231 100644 --- a/scripts/workflow_status/lib/logs.py +++ b/scripts/workflow_status/lib/logs.py @@ -195,6 +195,49 @@ def filter_log_for_job(full_log: str, job_name: str) -> str: return full_log +def line_matches_error(line: str) -> bool: + """Return True if *line* looks like a real error/failure line.""" + return any(p.search(line) for p in _ERROR_PATTERNS) + + +def pick_display_snippet( + block: str, + *, + max_lines: int = 12, + context_before: int = 2, +) -> str: + """Pick the most informative slice of an extracted failure block. + + Algorithm: + 1. Strip empty lines and `...` block-internal separators. + 2. Locate lines matching an error pattern. The *last* match is the + anchor -- the deepest error is usually the actionable one (earlier + matches are often build-up / dependency chatter, and the GH + ``##[error]Process completed`` summary tends to immediately follow + the actual error so we want it inside the window). + 3. Return up to ``max_lines`` lines, with ``context_before`` lines of + lead-in before the anchor. + 4. If no error pattern hits, return the *tail* of the block, not the + head -- the tail contains the GH job-completion lines and is far + more useful than the first few lines (often pure context-before + lead-in produced by ``_extract_error_lines``). 
+ """ + lines = [ln for ln in block.splitlines() if ln.strip() and ln.strip() != "..."] + if not lines: + return "" + + err_idx = [i for i, ln in enumerate(lines) if line_matches_error(ln)] + if not err_idx: + return "\n".join(lines[-max_lines:]) + + anchor = err_idx[-1] + start = max(0, anchor - context_before) + end = min(len(lines), start + max_lines) + if end - start < max_lines: + start = max(0, end - max_lines) + return "\n".join(lines[start:end]) + + def split_into_blocks(content: str) -> list[str]: """Split *content* on ``BLOCK_SEP`` into individual failure blocks.""" blocks: list[str] = [] diff --git a/scripts/workflow_status/workflow_status.py b/scripts/workflow_status/workflow_status.py index 3dc7e8f7..83b56ce7 100644 --- a/scripts/workflow_status/workflow_status.py +++ b/scripts/workflow_status/workflow_status.py @@ -27,10 +27,11 @@ format_failure_detail, status_emoji, ) -from lib.gh import detect_repo, fetch_jobs, fetch_log, fetch_run +from lib.gh import detect_repo, fetch_job_log, fetch_jobs, fetch_run from lib.logs import ( extract_relevant_failures, filter_log_for_job, + pick_display_snippet, split_into_blocks, ) from lib.similarity import ( @@ -166,7 +167,7 @@ def _process_failed_job( rep_bidx = grp[0] block = blocks[rep_bidx] ai_st, cause, fix = ai_results.get(rep_bidx, ("", "", "")) - display_st = ai_st if ai_st else "\n".join([ln for ln in block.splitlines() if ln.strip()][:5]) + display_st = ai_st if ai_st else pick_display_snippet(block) stacktraces.append((display_st, cause, fix)) if len(grp) > 1: @@ -274,18 +275,43 @@ def main() -> None: else: out.print(f"*\U0001f534 Failure Details ({len(failed_jobs)} failed job(s)):*") - # Fetch logs once for the whole run - log_out: str | None = None - if config.analyze_cause or config.print_logs: - log_out = fetch_log(config.run_id, config, failed_only=True) - - # Group failed jobs by error similarity to deduplicate + # Fetch logs per-job in parallel. 
Logs are needed for the stacktrace + # display itself, similarity-based grouping, and (optionally) AI + # cause/fix analysis -- so we always fetch when there are failures, + # regardless of --no-cause / --no-fix / --print-logs. + # + # We use per-job ``gh run view --job {job_id} --log`` because the + # run-level ``--log[-failed]`` path is rejected for matrices large + # enough to trip gh's safety throttle: + # too many API requests needed to fetch logs; + # try narrowing down to a specific job with the --job option + # which used to leave every failure stuck on "(no logs available)". + job_log_by_id: dict[int, str] = {} + with ThreadPoolExecutor(max_workers=config.max_gh_workers) as log_pool: + log_futs = { + log_pool.submit(fetch_job_log, j.get("databaseId"), config): j + for j in failed_jobs + if j.get("databaseId") + } + for fut in as_completed(log_futs): + j = log_futs[fut] + try: + job_log_by_id[j["databaseId"]] = fut.result() or "" + except Exception as exc: + print( + f"WARN: failed to fetch log for job {j.get('databaseId')} ({j.get('name', '?')}): {exc}", + file=sys.stderr, + ) + job_log_by_id[j["databaseId"]] = "" + + # Per-job extracted failure lines, used downstream for similarity + # grouping. Each fetched log in ``job_log_by_id`` already covers a single job, so + # ``filter_log_for_job`` is a cheap no-op safety net. 
job_logs: list[str] = [] for job in failed_jobs: jname = job.get("name", "unknown") - jlog = "" - if log_out: - jlog = extract_relevant_failures(filter_log_for_job(log_out, jname)) + raw = job_log_by_id.get(job.get("databaseId", -1), "") + jlog = extract_relevant_failures(filter_log_for_job(raw, jname)) if raw else "" job_logs.append(jlog) job_token_sets = [compute_error_tokens(jl) for jl in job_logs] @@ -310,11 +336,13 @@ def main() -> None: for member_indices in error_groups: rep_idx = member_indices[0] display_idx += 1 + rep_job = failed_jobs[rep_idx] + rep_log = job_log_by_id.get(rep_job.get("databaseId", -1), "") or None fut = pool.submit( _process_failed_job, - failed_jobs[rep_idx], + rep_job, display_idx, - log_out, + rep_log, run_url, wf_name, config,