diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 36a52c2f..298242f1 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -39,7 +39,10 @@ jobs: run: uv sync --all-extras --dev && uv run playwright install chromium - name: Run unit tests with coverage - run: uv run pytest -m "not (tui or browser or benchmark)" --cov=claude_code_log --cov-report=xml --cov-report=html --cov-report=term + # -p no:playwright: unit tests don't use the browser fixtures; skipping the + # plugin avoids a ~1s playwright import per xdist worker, which Windows pays + # per spawned worker. See work/xdist-import-cost.md. + run: uv run pytest -p no:playwright -m "not (tui or browser or benchmark)" --cov=claude_code_log --cov-report=xml --cov-report=html --cov-report=term - name: Run TUI tests with coverage append run: uv run pytest -m tui --cov=claude_code_log --cov-append --cov-report=xml --cov-report=html --cov-report=term diff --git a/justfile b/justfile index aeb80f85..4a05796e 100644 --- a/justfile +++ b/justfile @@ -6,8 +6,11 @@ cli *ARGS: uv run claude-code-log {{ ARGS }} # Run unit + integration tests (excludes TUI, browser, and benchmark) +# `-p no:playwright`: unit runs never use the browser fixtures, so skip loading +# the pytest-playwright plugin โ€” it imports playwright (~1s) per xdist worker on +# Windows (spawned workers re-import). See work/xdist-import-cost.md. test: - uv run pytest -m "not (tui or browser or benchmark)" -v + uv run pytest -p no:playwright -m "not (tui or browser or benchmark)" -v # Run benchmark tests serially for stable measurements (outputs to GITHUB_STEP_SUMMARY in CI). DEBUG_TIMING enables coverage of renderer_timings.py test-benchmark: @@ -35,7 +38,7 @@ test-all: set -e # Exit on first failure echo "๐Ÿงช Running all tests in sequence..." echo "๐Ÿ“ฆ Running unit tests..." - uv run pytest -m "not (tui or browser or integration or benchmark)" -v + uv run pytest -p no:playwright -m "not (tui or browser or integration or benchmark)" -v echo "๐Ÿ–ฅ๏ธ Running TUI tests..." uv run pytest -m tui -v echo "๐ŸŒ Running browser tests..." @@ -52,7 +55,7 @@ test-cov: set -e # Exit on first failure echo "๐Ÿ“Š Running all tests with coverage..." echo "๐Ÿ“ฆ Running unit tests with coverage..." - uv run pytest -m "not (tui or browser or integration or benchmark)" --cov=claude_code_log --cov-report=xml --cov-report=html --cov-report=term -v + uv run pytest -p no:playwright -m "not (tui or browser or integration or benchmark)" --cov=claude_code_log --cov-report=xml --cov-report=html --cov-report=term -v echo "๐Ÿ–ฅ๏ธ Running TUI tests with coverage append..." uv run pytest -m tui --cov=claude_code_log --cov-append --cov-report=xml --cov-report=html --cov-report=term -v echo "๐ŸŒ Running browser tests with coverage append..." diff --git a/test/conftest.py b/test/conftest.py index 7df5e9ba..98d8143f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -1,11 +1,12 @@ """Pytest configuration and shared fixtures.""" from pathlib import Path -from typing import TYPE_CHECKING, Generator +from typing import TYPE_CHECKING, Generator, Optional import pytest if TYPE_CHECKING: + from _pytest.config import Config from claude_code_log.cache import CacheManager from test.snapshot_serializers import ( @@ -14,6 +15,61 @@ ) +# ========== Collection cost control ========== +# Browser test modules import `playwright` and TUI modules import `textual` +# at module top level. pytest imports every test module during collection and +# only *then* applies `-m` marker deselection โ€” so a plain unit run +# (`-m "not (tui or browser)"`) still pays to import playwright + textual for +# nothing. On Windows that tax is paid PER xdist worker, because workers spawn +# fresh interpreters and re-import the world (Linux forks instead). Skipping +# those modules at collection time when their marker can't be selected keeps the +# heavy deps out of unit collection entirely. See work/xdist-import-cost.md. + + +def pytest_ignore_collect(collection_path: Path, config: "Config") -> Optional[bool]: + """Don't import browser/TUI test modules when their marker is excluded. + + Uses pytest's own marker-expression evaluator so the decision exactly + tracks the active ``-m`` filter (no string-matching heuristics). + Returns ``True`` to skip a module, ``None`` to defer to default behaviour. + """ + markexpr: str = config.getoption("markexpr") or "" + if not markexpr: + return None # no -m filter -> collect everything as usual + + # Filename coupling: we can't read a module's markers without importing it + # (the very cost we're avoiding), so the heavy-dep modules are identified by + # naming convention. A browser/TUI-marked test in a differently-named file + # (e.g. test_index_timezone.py carries @pytest.mark.browser tests) is NOT + # skipped here โ€” still correct (it's deselected on unit runs anyway), just + # not optimised. If such a file ever grows a *top-level* playwright/textual + # import, rename it to *_browser.py / test_tui_* so this hook catches it. + name = collection_path.name + if name.endswith("_browser.py"): + marker = "browser" + elif name == "test_tui.py" or name.startswith("test_tui_"): + marker = "tui" + else: + return None + + from _pytest.mark.expression import Expression + + try: + expr = Expression.compile(markexpr) + except Exception: + return None # malformed expression: let pytest handle/report it + + def has_marker(tag: str, /, **_kwargs: "str | int | bool | None") -> bool: + # Signature matches pytest's ExpressionMatcher protocol. + return tag == marker + + # If a test bearing only this marker could never be selected, the whole + # module is dead weight for this run -> skip importing it. + if not expr.evaluate(has_marker): + return True + return None + + # ========== Cache Test Fixtures ========== # These fixtures use explicit db_path for true test isolation, # enabling parallel test execution without database conflicts. diff --git a/work/xdist-import-cost.md b/work/xdist-import-cost.md new file mode 100644 index 00000000..31f27359 --- /dev/null +++ b/work/xdist-import-cost.md @@ -0,0 +1,133 @@ +# Cutting pytest-xdist worker-spawn / re-import cost on Windows + +Status: WIP investigation (branch `dev/xdist-import-cost`). +Scope: conftest / test-module import cost + pyproject/CI xdist knobs. +Out of scope (parallel work by alice): `claude_code_log/cache.py`, +`test/test_integration_realistic.py`. + +## Problem + +Windows CI is ~3x slower than Linux CI, orthogonal to data volume. Linux +`fork()`s workers (copy-on-write, no re-import); Windows **spawns** a fresh +interpreter per xdist worker and **re-imports the whole world** in each. So any +module pulled in during collection is paid once per worker, every run. + +## Root cause found + +`pytest` imports **every** test module during collection and only *then* +applies `-m` marker deselection. The unit run (`-m "not (tui or browser ...)"`) +therefore imported the heavy optional deps for nothing: + +- 10 `*_browser.py` modules `import playwright` at top level +- 2 `test_tui*.py` modules `import textual` at top level + +Per-process cold import cost (fresh interpreter, measured locally): + +| dep | import cost | +|-----|-------------| +| `textual` / `textual.app` | ~730โ€“880 ms | +| `playwright.sync_api` | ~500 ms | +| `pytest_playwright` plugin (pulls playwright) | ~975 ms | + +Key nuance: **playwright is loaded by the `pytest_playwright` plugin on every +worker anyway** (entry-point autoload), independent of whether the browser test +modules are collected. So skipping the browser modules does *not* save the +playwright import โ€” the plugin already pays it. What the module-skip genuinely +removes from unit workers is **`textual`** (no always-on textual plugin). +Eliminating the playwright cost on unit runs is a *separate* lever: disabling +the unused plugin. + +## Change 1 โ€” don't import browser/TUI modules during unit collection (DONE) + +`test/conftest.py` adds a `pytest_ignore_collect` hook that skips +`*_browser.py` / `test_tui*.py` when their marker can't be selected by the +active `-m` expression. It uses pytest's own `Expression` evaluator, so the +decision exactly tracks `-m` (no string-matching heuristics). + +Verified selection is unchanged: + +| run | before | after | +|-----|--------|-------| +| `-m "not (tui or browser)"` collected | 2214 | 2214 (modules skipped, not deselected) | +| `-m browser` collected | 62 | 62 | +| `-m tui` collected | 68 | 68 | +| no `-m` (full) collected | 2344 | 2344 | + +Effect: removes the per-worker `textual` import (~0.7โ€“0.9 s/worker) from every +unit run. Saving scales with worker count and is paid on **every** CI run +because each Windows worker re-imports. + +## Change 2 โ€” disable the unused `pytest_playwright` plugin on unit runs (DONE) + +`-p no:playwright` added to the unit-test invocations (justfile `test`, +`test-all`, `test-cov`, and the CI "Run unit tests" step). Removes ~0.97 s/worker +of playwright import that unit tests never use. Safe because the browser fixtures +in conftest are lazy โ€” nothing requests `page`/`context` on a unit run (and +browser modules are now skipped). Deliberately NOT in `pyproject` `addopts` +(that would disable it for the browser run too); it lives at the unit call sites +only. Verified: full unit suite green and `-m browser` / `-m tui` runs still +execute (smoke-tested locally). + +## Change 3 โ€” xdist worker count (RECOMMENDATION, needs CI validation) + +`pyproject` forces `-n auto --dist=worksteal`. On a 12-core dev box `-n auto` +spawns 12 workers, exaggerating the spawn+reimport tax (1 trivial test: +`-n0` โ‰ˆ 1.4 s vs `-n auto` โ‰ˆ 6.4 s). **But GitHub-hosted `windows-latest` has +only ~4 vCPUs**, so CI already runs `-n auto` โ‰ˆ 4 workers โ€” modest. Cutting +worker count below the core count on CI would sacrifice real parallelism on the +slow integration tests for little spawn-cost gain. + +Recommendation: **keep `-n auto`**; close the gap via per-worker import cost +(Changes 1+2), not by capping workers. The 12-worker tax is a dev-machine +artifact, not the CI reality. + +> **Needs real-runner confirmation:** the definitive worker-count sweet spot +> must be measured on the actual GitHub Windows runner (fewer cores, different +> disk/AV profile than a dev box). Do not hard-commit a CI `-n` value from +> local numbers. If Windows CI is still slow after Changes 1+2, A/B `-n auto` +> vs a fixed `-n 2`/`-n 3` *on the runner* before changing the knob. + +## Before/after (local measurements) + +Box: Windows 11, 12 cores, warm caches. **Caveat:** every `uv run pytest` +invocation carries ~3 s of `uv` env-resolution overhead, which is included in +all wall numbers below and dilutes the measured delta. The deterministic +per-process import costs (above) are the cleaner signal for what each fresh CI +worker stops paying. + +Parallel **collect-only** wall (`pytest --co`, forces every worker to +spawn + import + collect, no test execution), best-of-N to suppress contention: + +| config | wall | +|--------|------| +| baseline (no hook, playwright loaded), `-n auto` | 4.2 s (best-of-5) | +| **complete change** (hook + `-p no:playwright`), `-n auto` | **3.1 s (best-of-5)** | +| baseline, `-n4` (CI-like core count) | 3.9 s (best-of-3) | +| after hook, `-n4` | 3.3 s (best-of-3) | + +~26% off collect-only wall at `-n auto` despite the fixed ~3 s `uv` floor โ€” i.e. +the spawn+import portion itself dropped substantially. `-p no:playwright` alone +accounts for ~1.4 s of it (`-n auto`: 5.0 s โ†’ 3.6 s in a separate best-of-3). + +**Full unit-suite wall** (`-m "not (tui or browser)"`, ~2200 tests) measured +182 s / 237 s / 295 s across runs โ€” dominated by the slow integration tests and +too contention-sensitive on this box to isolate a per-worker *startup* delta of +~1โ€“2 s. This is expected: the startup saving is a fixed offset, swamped by test +execution. It surfaces best on the cold, fewer-core GitHub Windows runner โ€” +hence the "needs CI confirmation" flag. + +## Validated locally vs needs CI + +- Validated locally: collection-selection equivalence; per-process import costs; + conftest hook correctness across `-m` variants; suite stays green + (except 3 pre-existing Windows symlink-privilege failures, below). +- Needs CI confirmation: real wall-clock delta on the GitHub Windows runner; + the worker-count recommendation. + +## Pre-existing unrelated failures + +`test/test_session_scan_characterization.py::TestIndexInlineAggregateLoop...` +(3 tests) fail locally with `OSError [WinError 1314] A required privilege is not +held by the client` from `os.symlink` โ€” a Windows Developer-Mode/admin +requirement, not related to import cost or this branch. Deselected in the A/B +measurements for a clean comparison.