diff --git a/.gitignore b/.gitignore index 2b5c8ef..2662484 100644 --- a/.gitignore +++ b/.gitignore @@ -38,3 +38,5 @@ Thumbs.db .vscode/ *.swp *.swo + +ignore/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 6e104ca..336dc1d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,14 @@ # Changelog +## Unreleased + +- Changed default `gua` state paths to `~/.gua/gua.db`, `~/.gua/gua.pid`, + and `~/.gua/gua.log`; the default database now acts as an appendable local + history database. +- Record daemon run intervals in SQLite and attach samples to a run, so + `gua report` uses recorded intervals by default. `--interval` is now an + override and a fallback for legacy rows without interval metadata. + ## 1.0.2 - 2026-05-15 - Hardened `gua status` and `gua stop` so stale PID files do not act on diff --git a/README.md b/README.md index d3941f5..6394efc 100644 --- a/README.md +++ b/README.md @@ -32,14 +32,14 @@ uv tool install gpu-usage-audit gua doctor gua daemon --interval 30s gua status -gua report --since 1h --interval 30s +gua report --since 1h gua stop ``` `gua doctor` is intentionally read-only. It checks only the current machine: OS/kernel/Python, `/dev/nvidia*`, `nvidia-smi -L`, NVML load/init/device count/driver version, and the database path the daemon -would write to. The default is `/tmp/gua.db`; pass `gua doctor --db PATH` +would write to. The default is `~/.gua/gua.db`; pass `gua doctor --db PATH` when you plan to use a different daemon database. Use `gua doctor --json` for the same report in a machine-readable form. @@ -77,7 +77,7 @@ uvx --from "./$WHEEL" gua doctor ## What you get ``` -$ gua report --since 1h --interval 30s +$ gua report --since 1h gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00 §1 Headline @@ -90,7 +90,7 @@ gua — lab-a100 (bare, driver 560.35.05) Window: 1:00:00 (51 samples) §2 Idle capacity - converted from card-ticks to GPU-hours using the report --interval + converted from card-ticks to GPU-hours using recorded daemon interval idle-held: ~0.31 GPU-hours, ~1.53 GPUs equivalently unavailable truly-idle: ~0.12 GPU-hours, ~1.00 GPUs equivalently free @@ -116,9 +116,9 @@ The 3-bar collapses every card × every tick over the window into the active / idle-held / truly-idle split. **`idle-held` rows are the embarrassing category**: a process is holding GPU memory but the SM utilization is below 10%. §2 converts those card-ticks into GPU-hours -with `--interval`; §4 groups process rows by identity, GPU, and tick -before ranking users, so multiple same-user processes on one GPU/tick -count once. +using the interval recorded by the daemon; §4 groups process rows by +identity, GPU, and tick before ranking users, so multiple same-user +processes on one GPU/tick count once. ## Demo (no GPU required) @@ -142,7 +142,7 @@ gua doctor ``` Doctor should show the current machine, visible `/dev/nvidia*` device -files, `nvidia-smi -L` GPUs, NVML device count, and `/tmp/gua.db` status. +files, `nvidia-smi -L` GPUs, NVML device count, and `~/.gua/gua.db` status. `nvidia-ml-py` is installed by default with `gpu-usage-audit`; if doctor reports that `pynvml` is not importable, reinstall the isolated tool environment: @@ -166,7 +166,7 @@ gua status Run the report: ```sh -gua report --since 1h --interval 30s +gua report --since 1h ``` Stop the background collector when the collection window is done: @@ -175,12 +175,12 @@ Stop the background collector when the collection window is done: gua stop ``` -If `--db` is omitted, both `daemon` and `report` use `/tmp/gua.db`. -`daemon` refuses to start when that database file already exists, so a -new collection run does not silently append to an old test database. If -`gua doctor` reports that the database already exists, either run -`gua report` against the existing data or choose a fresh `--db PATH` for -the next daemon run. +If `--db` is omitted, `daemon`, `report`, `status`, and `stop` use +`~/.gua/` for local user state: `gua.db`, `gua.pid`, and `gua.log`. The +default database is a local history database, so later daemon runs append +to it and reports read the accumulated data. If you pass a custom +`--db PATH`, `daemon` still refuses an existing file to avoid silently +appending to an ad hoc collection run. > The daemon requires the NVIDIA driver and `libnvidia-ml.so.1`. On a > driverless host it exits with a friendly NVML initialization error. For @@ -193,7 +193,7 @@ point remains installed for compatibility, but new examples use `gua`. | Command | What it does | | -------- | ----------------------------------------------------------- | -| `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to a new database. NVIDIA host required. | +| `daemon` | Starts the collector in the background. Samples real NVML telemetry on every tick and writes to the local history database. NVIDIA host required. | | `start` | Alias for `gua daemon`. | | `status` | Shows whether the background collector PID is still running. Also clears a stale PID file when it points to a missing or unrelated process. | | `stop` | Stops the background collector with SIGTERM. | @@ -208,13 +208,14 @@ gua start [--db PATH] [--interval D] [--pid-file PATH] [--log-file PATH] gua daemon --foreground [--db PATH] [--interval D] ``` -- `--db PATH` (default `/tmp/gua.db`) — SQLite file to create and write - to. The daemon exits with an error if the file already exists. WAL mode - is enabled automatically. +- `--db PATH` (default `~/.gua/gua.db`) — SQLite history database. The + default path is appended across daemon runs; a custom path still exits + with an error if the file already exists. WAL mode is enabled + automatically. - `--interval D` (default `30s`) — how often to sample. Accepts `30s`, `1m`, `200ms`, etc. -- `--pid-file PATH` (default `/tmp/gua.pid`) — background PID file. -- `--log-file PATH` (default `/tmp/gua.log`) — stdout/stderr from the +- `--pid-file PATH` (default `~/.gua/gua.pid`) — background PID file. +- `--log-file PATH` (default `~/.gua/gua.log`) — stdout/stderr from the background collector. - `--foreground` — keep the collector attached to the current process. Use this for systemd or debugging. @@ -232,15 +233,15 @@ managed collector before acting on it; stale PID files are cleared. gua report [--db PATH] [--since D] [--interval D] [--width N] ``` -- `--db PATH` (default `/tmp/gua.db`) — same SQLite file the daemon writes - to. The report exits with an error if the file does not exist. +- `--db PATH` (default `~/.gua/gua.db`) — same SQLite file the daemon + writes to. The report exits with an error if the file does not exist. - `--since D` (default `1h`) — the report window. **No upper bound** — `--since 365d` is accepted. The effective window is min(`--since`, age of oldest sample), so passing a huge `--since` is the same as "all data". Units: `ms`, `s`, `m`, `h`, `d` (no `w`; use `7d`). -- `--interval D` (default `30s`) — **must match what the daemon used**. - This is how §2 (Idle capacity) and §4 (Top identities) convert tick counts - to GPU-hours. Mismatched intervals → wrong GPU-hours. +- `--interval D` — optional override for §2 (Idle capacity) and §4 (Top + identities). By default, reports use the interval recorded by each daemon + run. Legacy rows without interval metadata fall back to 30s. - `--width N` (default `60`) — width of the §1 three-bar in characters. ### `demo` @@ -257,8 +258,10 @@ gua demo [--db PATH] [--ticks N] [--interval D] ### Operational notes -- **Same `--interval` on both sides.** If you ran the daemon with - `--interval 30s`, run `gua report --interval 30s` too. +- **Intervals are recorded.** New daemon runs store their sampling interval + in the database, so `gua report` can compute GPU-hours without repeating + `--interval`. Use report `--interval D` only to override or to interpret + legacy rows. - **Let it run for a while.** §1/§3 are meaningful after one tick; §4 (Top identities) needs hours; §5 (Heatmap) needs days. - **WAL leaves sidecar files** (`gua.db-wal`, `gua.db-shm`). They are diff --git a/projects/bare-metal-1.0/handoff.ko.md b/projects/bare-metal-1.0/handoff.ko.md index c32352a..394ca79 100644 --- a/projects/bare-metal-1.0/handoff.ko.md +++ b/projects/bare-metal-1.0/handoff.ko.md @@ -1,10 +1,10 @@ # Bare Metal 1.0 Handoff -갱신일: 2026-05-15 +갱신일: 2026-05-27 ## 이어받을 때 먼저 볼 것 -- `projects/bare-metal-1.0/status.ko.md`: 현재 완료 상태, 1.0.1 검증 결과, 1.0.2 release prep 상태. +- `projects/bare-metal-1.0/status.ko.md`: 현재 완료 상태, 1.0.2 release/publish 상태, 최신 로컬 검증 결과. - `README.md`: 실제 사용자 문서와 release/install/runbook/report 표면. - `src/gpu_usage_audit/__main__.py`: `gua` CLI, background daemon lifecycle, PID handling. - `src/gpu_usage_audit/report.py`: report SQL 집계. @@ -17,14 +17,14 @@ - Kubernetes, Slurm, Docker/Podman fallback, remote node, cluster-wide report는 1.0 범위 밖이다. - `nvidia-ml-py`는 기본 dependency다. - `gpu-usage-audit[nvml]` extra는 compatibility를 위해 빈 alias로 남긴다. -- DB schema는 v1을 유지한다: `host`, `gpu_sample`, `proc_sample`. -- 기본 DB는 `/tmp/gua.db`다. +- DB schema는 additive v1을 유지한다: `host`, `daemon_run`, `gpu_sample`, `proc_sample`. +- 기본 상태 경로는 `~/.gua/`이고, 기본 DB/PID/log는 `~/.gua/gua.db`, `~/.gua/gua.pid`, `~/.gua/gua.log`다. - `gua daemon`은 기본 백그라운드 실행이다. - `gua daemon --foreground`는 systemd/debugging 용도다. - `gua start`는 `gua daemon` alias다. - `gua status`와 `gua stop`은 pid file 기반 background collector 관리용이다. -- `daemon`은 기존 DB 파일이 있으면 실패한다. -- `report`는 DB 파일이 없으면 실패한다. +- `daemon`은 기본 DB에는 append하고, custom `--db PATH`가 기존 파일이면 실패한다. +- `report`는 DB 파일이 없으면 실패하고, 기본적으로 daemon_run에 기록된 interval로 GPU-hours를 계산한다. - `daemon`과 `demo`는 host row의 `env_kind`를 항상 `"bare"`로 기록한다. - auto-runtime proposal/project 문서는 삭제했다. Kubernetes/Slurm/Docker/Podman 확장을 다시 시작하려면 새 proposal로 시작한다. @@ -39,38 +39,40 @@ - GitHub Release `v1.0.1`: published. - PyPI `gpu-usage-audit 1.0.1`: published. - NVIDIA host acceptance: 사용자가 실제 host에서 수집 정상 동작을 확인했다. -- 1.0.2 release prep: 진행 중. #14 lifecycle/report cleanup을 patch release로 배포한다. - package version은 `1.0.2`로 bump했고 local build/wheel smoke는 통과했다. +- 1.0.2 lifecycle/report cleanup release: completed in PR #14/#15 and tag `v1.0.2`. +- GitHub Release `v1.0.2`: published. +- PyPI `gpu-usage-audit 1.0.2`: published. ## 마지막 로컬 검증 ```sh uv run ruff check uv run ruff format --check -uv run mypy -uv run pytest -uv build --out-dir /tmp/gua-dist-1.0.2-prep -bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.2-prep/gpu_usage_audit-1.0.2-py3-none-any.whl +uv run python -m mypy +uv run python -m pytest env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py +uv build --out-dir /tmp/gua-dist-current +bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-current/gpu_usage_audit-1.0.2-py3-none-any.whl ``` 결과는 `pytest` 124 passed, `mypy` 25 source files, `ruff format` 26 files 기준이다. +현재 로컬에서는 direct entrypoint인 `uv run mypy`, `uv run pytest` 대신 +`uv run python -m ...` 경로가 안정적으로 동작한다. -## 현재 cleanup PR 방향 +## 1.0.2 포함 cleanup - `/tmp/gua.pid`가 PID 재사용으로 다른 프로세스를 가리킬 수 있으므로 `status`/`stop` 전에 해당 PID가 실제 managed `gpu_usage_audit daemon` 프로세스인지 확인한다. -- report §2는 low-util 전체를 "waste"로 합치지 말고 `idle-held`와 `truly-idle`을 분리한다. +- report §2는 low-util 전체를 "waste"로 합치지 않고 `idle-held`와 `truly-idle`을 분리한다. - report §4는 process row가 아니라 identity/GPU/tick 단위로 먼저 접어서 사용자별 GPU-hours를 계산한다. - report 출력 자체에 sample 의미, classification rule, `--interval` 의존성, heatmap 의미를 짧게 노출한다. - NVML process list 조회 실패는 idle-held를 과소평가할 수 있으므로 warning으로 남긴다. -- 1.0.2 release prep에서는 package version, README release asset 예시, CHANGELOG를 `1.0.2`로 맞춘다. ## 주의할 점 - 현재 로컬 개발 머신은 NVIDIA host가 아니다. `gua doctor`가 unsupported를 내는 것은 정상이다. -- `/tmp/gua.db`가 이미 존재한다. 기본 경로 daemon 실행이 거부되는 것은 기대 동작이다. -- `report --interval`은 daemon 수집 interval과 같아야 GPU-hours가 맞다. +- 기본 DB는 `~/.gua/gua.db`로 이동 중이다. 기본 경로는 기존 DB에 append한다. +- `report --interval`은 선택적 override다. 새 샘플은 daemon_run interval을 기록하고, legacy row만 fallback이 필요하다. - SQLite WAL sidecar(`*.db-wal`, `*.db-shm`)는 마지막 connection이 닫히면 정리된다. - 1.0.2를 자를 경우 `env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py`가 통과해야 한다. @@ -78,6 +80,6 @@ env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py ## 다음 세션 추천 순서 1. `git status --short`로 사용자 변경 여부를 먼저 확인한다. -2. cleanup PR의 CI 결과와 review comments를 확인한다. -3. 필요하면 report wording을 실제 운영자가 읽기 쉬운 형태로 한 번 더 다듬는다. -4. merge 후 patch release가 필요하면 version bump와 changelog를 별도 PR로 처리한다. +2. untracked 파일(`package.json`, `package-lock.json`, `project_report.md`)의 의도를 먼저 확인한다. +3. 기본 검증은 `uv run python -m pytest`, `uv run python -m mypy` 경로를 우선 사용한다. +4. `~/.gua` 기본 경로와 recorded interval 변경을 1.0.3 patch 후보로 묶고, 기능 확장은 새 proposal로 분리한다. diff --git a/projects/bare-metal-1.0/status.ko.md b/projects/bare-metal-1.0/status.ko.md index 61a7302..08acb9e 100644 --- a/projects/bare-metal-1.0/status.ko.md +++ b/projects/bare-metal-1.0/status.ko.md @@ -1,16 +1,15 @@ # Bare Metal 1.0 Status -갱신일: 2026-05-15 +갱신일: 2026-05-27 ## 요약 -Bare Metal 1.0은 단일 NVIDIA 베어메탈 호스트만 대상으로 하는 형태로 1.0.1까지 -릴리스됐고, 현재 1.0.2 release prep을 진행 중이다. `v1.0.1` GitHub Release와 -PyPI publish는 완료됐고, 사용자가 실제 NVIDIA host에서 telemetry 수집이 정상 -동작하는 것도 확인했다. +Bare Metal 1.0은 단일 NVIDIA 베어메탈 호스트만 대상으로 하는 형태로 1.0.2까지 +릴리스됐다. `v1.0.2` GitHub Release와 PyPI publish가 완료됐고, 사용자가 실제 +NVIDIA host에서 telemetry 수집이 정상 동작하는 것도 확인했다. -1.0.2 후보는 1.0.1 이후 코드 퀄리티 cleanup을 배포하기 위한 patch release다. -주요 초점은 background daemon PID 안전성, report 의미 가시성, 내부 문서 정합성이다. +1.0.2는 1.0.1 이후 코드 퀄리티 cleanup을 배포한 patch release다. 주요 초점은 +background daemon PID 안전성, report 의미 가시성, 내부 문서 정합성이었다. ## 구현 상태 @@ -21,13 +20,38 @@ PyPI publish는 완료됐고, 사용자가 실제 NVIDIA host에서 telemetry | Packaging UX | 완료 | `nvidia-ml-py`가 기본 dependency이고 `nvml` extra는 빈 compatibility alias. | | `gua` command surface | 완료 | `doctor`, `daemon`, `start`, `status`, `stop`, `report`, `demo` 제공. | | Background daemon UX | 완료 | `gua daemon`은 기본 백그라운드 실행, `--foreground`는 systemd/debug용. | -| `daemon`/`report` DB UX | 완료 | 기본 DB는 `/tmp/gua.db`; daemon은 기존 DB를 거부하고 report는 없는 DB를 거부. | +| `daemon`/`report` DB UX | 진행 중 | 기본 상태 경로를 `~/.gua/`로 옮기고, 기본 DB는 append 가능한 history DB로 전환 중. custom `--db` 기존 파일은 계속 거부. | | README bare-metal 문서 | 완료 | install, runbook, systemd 예시, 운영 notes가 1.0.2 기준. | -| Release | 진행 중 | package version은 `1.0.2`; local build/wheel smoke 완료, release prep PR과 tag publish가 남음. | +| Release | 완료 | `v1.0.2` GitHub Release와 PyPI publish 완료. | | NVIDIA host acceptance | 완료 | 실제 NVIDIA host에서 수집 정상 동작 확인. | ## 마지막 확인 결과 +2026-05-27 현재 작업 트리 로컬 검증: + +```sh +uv run ruff check +uv run ruff format --check +uv run python -m mypy +uv run python -m pytest +env GITHUB_REF_NAME=v1.0.2 uv run python scripts/check-tag-version.py +uv build --out-dir /tmp/gua-dist-current +bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-current/gpu_usage_audit-1.0.2-py3-none-any.whl +``` + +결과: + +- `ruff check`: pass. +- `ruff format --check`: 26 files already formatted. +- `mypy`: no issues in 25 source files. +- `pytest`: 124 passed. +- tag-version check: `v1.0.2`와 `pyproject.toml` version 일치. +- `uv build`: sdist/wheel build 성공. +- wheel smoke: 성공. + +현재 로컬 환경에서는 `uv run mypy`, `uv run pytest` direct entrypoint가 꼬여 있어 +검증 명령은 `uv run python -m mypy`, `uv run python -m pytest` 경로를 사용했다. + 2026-05-15 1.0.2 release prep 로컬 검증: ```sh @@ -108,13 +132,13 @@ bash scripts/smoke-dist-wheel.sh /tmp/gua-dist-1.0.1-status/gpu_usage_audit-1.0. - `/dev/nvidia*` 없음. - `nvidia-smi`가 PATH에 없음. - NVML init 실패: `libnvidia-ml.so.1` 없음. -- `/tmp/gua.db`가 이미 있어 daemon은 기본 경로로 시작하지 않음. +- 기본 DB는 `~/.gua/gua.db`로 이동 중이며, 기본 경로에서는 기존 DB에 append한다. 이 결과는 로컬 환경 한계이며, 제품 regression으로 보지 않는다. ## 다음 작업 -1. 1.0.2 release prep PR에서 version, README release asset 예시, CHANGELOG를 갱신한다. -2. `uv run ruff check`, `uv run ruff format --check`, `uv run mypy`, `uv run pytest`, - `uv build`, wheel smoke, tag-version check를 다시 실행한다. -3. PR merge 후 `v1.0.2` tag를 push해 GitHub Release와 PyPI publish workflow를 실행한다. +1. 현재 작업 트리의 untracked 파일(`package.json`, `package-lock.json`, `project_report.md`)이 + 의도된 산출물인지 확인하고 track/delete 여부를 결정한다. +2. `~/.gua` 기본 경로와 recorded interval 변경을 1.0.3 patch 후보로 검증한다. +3. 기본 검증은 `ruff`, `mypy`, `pytest`, tag-version check, build, wheel smoke 순서로 유지한다. diff --git a/src/gpu_usage_audit/__main__.py b/src/gpu_usage_audit/__main__.py index c4f299a..f7ee1df 100644 --- a/src/gpu_usage_audit/__main__.py +++ b/src/gpu_usage_audit/__main__.py @@ -34,17 +34,17 @@ from . import __version__ from .daemon import install_signal_handlers, run_daemon from .db import open_db -from .doctor import ( - DEFAULT_DB_PATH as DOCTOR_DEFAULT_DB_PATH, -) -from .doctor import ( - build_doctor_report, - doctor_report_to_dict, - render_doctor, -) +from .doctor import build_doctor_report, doctor_report_to_dict, render_doctor from .identity import system_user_lookup from .model import HostMeta from .nvml import NVMLNotAvailableError, NVMLTier +from .paths import ( + DEFAULT_DB_PATH, + DEFAULT_LOG_PATH, + DEFAULT_PID_PATH, + expand_path, + is_default_db_path, +) from .render import ( render_headline, render_heatmap, @@ -70,9 +70,6 @@ "h": "hours", "d": "days", } -DEFAULT_DB_PATH = DOCTOR_DEFAULT_DB_PATH -DEFAULT_PID_PATH = Path("/tmp/gua.pid") -DEFAULT_LOG_PATH = Path("/tmp/gua.log") DISPLAY_COMMAND_ENV = "GPU_USAGE_AUDIT_DISPLAY_COMMAND" LOCAL_ENV_KIND = "bare" STARTUP_CHECK_SECONDS = 0.3 @@ -108,7 +105,7 @@ def build_parser() -> argparse.ArgumentParser: p_daemon.add_argument( "--db", default=str(DEFAULT_DB_PATH), - help=f"Path to a new SQLite database file [default: {DEFAULT_DB_PATH}]", + help=f"Path to SQLite database file [default: {DEFAULT_DB_PATH}]", ) p_daemon.add_argument( "--interval", @@ -136,8 +133,11 @@ def build_parser() -> argparse.ArgumentParser: p_report.add_argument( "--interval", type=_duration, - default=timedelta(seconds=30), - help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]", + default=None, + help=( + "Override recorded daemon interval for §2 Idle capacity / §4 time conversion " + "[default: read from DB; legacy rows fall back to 30s]" + ), ) p_report.add_argument( "--width", @@ -180,7 +180,7 @@ def _add_daemon_args(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--db", default=str(DEFAULT_DB_PATH), - help=f"Path to a new SQLite database file [default: {DEFAULT_DB_PATH}]", + help=f"Path to SQLite database file [default: {DEFAULT_DB_PATH}]", ) parser.add_argument( "--interval", @@ -205,8 +205,11 @@ def _add_report_args(parser: argparse.ArgumentParser) -> None: parser.add_argument( "--interval", type=_duration, - default=timedelta(seconds=30), - help="Daemon tick interval — for §2 Idle capacity / §4 time conversion [default: 30s]", + default=None, + help=( + "Override recorded daemon interval for §2 Idle capacity / §4 time conversion " + "[default: read from DB; legacy rows fall back to 30s]" + ), ) parser.add_argument( "--width", @@ -350,9 +353,9 @@ def _cmd_gua_daemon(args: argparse.Namespace) -> int: def _cmd_gua_start(args: argparse.Namespace) -> int: - db_path = Path(args.db) - pid_path = Path(args.pid_file) - log_path = Path(args.log_file) + db_path = expand_path(args.db) + pid_path = expand_path(args.pid_file) + log_path = expand_path(args.log_file) existing_pid = _read_pid(pid_path) if existing_pid is not None: @@ -366,10 +369,10 @@ def _cmd_gua_start(args: argparse.Namespace) -> int: ) _unlink_if_exists(pid_path) - if db_path.exists(): + if db_path.exists() and not is_default_db_path(db_path): print( f"gua daemon: {db_path} already exists; " - "run `gua report` for existing data or choose another --db path.", + "run `gua report --db PATH` for existing data or choose another --db path.", file=sys.stderr, ) return 2 @@ -417,8 +420,8 @@ def _cmd_gua_start(args: argparse.Namespace) -> int: def _cmd_gua_status(args: argparse.Namespace) -> int: - pid_path = Path(args.pid_file) - log_path = Path(args.log_file) + pid_path = expand_path(args.pid_file) + log_path = expand_path(args.log_file) pid = _read_pid(pid_path) if pid is None: print("gua daemon: not running") @@ -441,7 +444,7 @@ def _cmd_gua_status(args: argparse.Namespace) -> int: def _cmd_gua_stop(args: argparse.Namespace) -> int: - pid_path = Path(args.pid_file) + pid_path = expand_path(args.pid_file) pid = _read_pid(pid_path) if pid is None: print("gua daemon: not running") @@ -490,14 +493,16 @@ def _cmd_daemon(args: argparse.Namespace) -> int: "display_command", os.environ.get(DISPLAY_COMMAND_ENV, "gpu-usage-audit daemon"), ) - db_path = Path(args.db) - if db_path.exists(): + db_path = expand_path(args.db) + if db_path.exists() and not is_default_db_path(db_path): print( f"{display_command}: {db_path} already exists; " "choose another --db path or remove the existing file before starting.", file=sys.stderr, ) return 2 + if is_default_db_path(db_path): + db_path.parent.mkdir(parents=True, exist_ok=True) tier = NVMLTier() try: try: @@ -505,7 +510,7 @@ def _cmd_daemon(args: argparse.Namespace) -> int: except NVMLNotAvailableError as e: print(f"{display_command}: {e}", file=sys.stderr) return 1 - conn = open_db(args.db) + conn = open_db(db_path) try: host = HostMeta( hostname=socket.gethostname() or "unknown", @@ -534,7 +539,7 @@ def _cmd_daemon(args: argparse.Namespace) -> int: def _cmd_report(args: argparse.Namespace) -> int: display_command = getattr(args, "display_command", "gpu-usage-audit report") - db_path = Path(args.db) + db_path = expand_path(args.db) if not db_path.exists(): print( f"{display_command}: {db_path} does not exist; " @@ -542,7 +547,7 @@ def _cmd_report(args: argparse.Namespace) -> int: file=sys.stderr, ) return 2 - conn = open_db(args.db) + conn = open_db(db_path) try: cutoff = datetime.now(UTC) - args.since host = load_host(conn) @@ -573,7 +578,7 @@ def _cmd_demo(args: argparse.Namespace) -> int: db_path = str(Path(tmpdir) / "demo.db") print(f"(using temporary database: {db_path})", file=sys.stderr) else: - db_path = args.db + db_path = str(expand_path(args.db)) conn = open_db(db_path) try: diff --git a/src/gpu_usage_audit/daemon.py b/src/gpu_usage_audit/daemon.py index 5e1e81f..6cd6ce3 100644 --- a/src/gpu_usage_audit/daemon.py +++ b/src/gpu_usage_audit/daemon.py @@ -21,7 +21,7 @@ from datetime import UTC, datetime, timedelta from typing import TextIO -from .db import write_snapshot +from .db import start_daemon_run, write_snapshot from .model import HostMeta from .summarize import summarize from .tier import Tier @@ -59,6 +59,7 @@ def _tick( ts: datetime, n: int, out: TextIO, + run_id: int, ) -> None: """한 틱: tier.collect → loginuid 해석 → 적재 → 한 줄 로그.""" snap = tier.collect(ts) @@ -66,7 +67,7 @@ def _tick( for p in snap.procs: if p.loginuid_user is None: p.loginuid_user = lookup(p.pid) - write_snapshot(db, ts, host, snap) + write_snapshot(db, ts, host, snap, run_id=run_id) classes = " ".join(f"{c.uuid}={c.klass.value:<10}" for c in summarize(snap)) ts_short = ts.strftime("%H:%M:%S.") + f"{ts.microsecond // 1000:03d}" @@ -108,12 +109,16 @@ def run_daemon( next_at = time.monotonic() n = 0 + run_id: int | None = None while not stop.is_set(): if max_ticks is not None and n >= max_ticks: break + if run_id is None: + run_id = start_daemon_run(db, datetime.now(UTC), interval) + try: - _tick(tier, db, host, lookup, datetime.now(UTC), n, out) + _tick(tier, db, host, lookup, datetime.now(UTC), n, out, run_id) except Exception: logger.exception("tick %d failed; continuing", n) n += 1 diff --git a/src/gpu_usage_audit/db.py b/src/gpu_usage_audit/db.py index 427c14c..d6c86ea 100644 --- a/src/gpu_usage_audit/db.py +++ b/src/gpu_usage_audit/db.py @@ -13,7 +13,7 @@ from __future__ import annotations import sqlite3 -from datetime import datetime +from datetime import datetime, timedelta from pathlib import Path from .model import HostMeta, Snapshot @@ -29,10 +29,17 @@ last_seen DATETIME NOT NULL ); +CREATE TABLE IF NOT EXISTS daemon_run ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + started_at DATETIME NOT NULL, + interval_seconds REAL NOT NULL +); + CREATE TABLE IF NOT EXISTS gpu_sample ( ts DATETIME NOT NULL, gpu_uuid TEXT NOT NULL, - util_pct INTEGER NOT NULL + util_pct INTEGER NOT NULL, + run_id INTEGER REFERENCES daemon_run(id) ); CREATE TABLE IF NOT EXISTS proc_sample ( @@ -40,7 +47,8 @@ gpu_uuid TEXT NOT NULL, pid INTEGER NOT NULL, mem_used_mb INTEGER NOT NULL, - loginuid_user TEXT + loginuid_user TEXT, + run_id INTEGER REFERENCES daemon_run(id) ); CREATE INDEX IF NOT EXISTS idx_gpu_sample_uuid_ts ON gpu_sample(gpu_uuid, ts); @@ -74,9 +82,44 @@ def open_db(path: str | Path) -> sqlite3.Connection: raise RuntimeError(f"expected journal_mode=wal, got {mode!r}") conn.execute("PRAGMA busy_timeout=5000") conn.executescript(SCHEMA) + _migrate_schema(conn) return conn +def _migrate_schema(conn: sqlite3.Connection) -> None: + """Apply additive migrations for DBs created before interval metadata.""" + _ensure_column(conn, "gpu_sample", "run_id", "INTEGER") + _ensure_column(conn, "proc_sample", "run_id", "INTEGER") + + +def _ensure_column( + conn: sqlite3.Connection, + table: str, + column: str, + definition: str, +) -> None: + columns = {row[1] for row in conn.execute(f"PRAGMA table_info({table})")} + if column not in columns: + conn.execute(f"ALTER TABLE {table} ADD COLUMN {column} {definition}") + + +def start_daemon_run( + conn: sqlite3.Connection, + started_at: datetime, + interval: timedelta, +) -> int: + """Record one daemon run and return its row id for subsequent samples.""" + cur = conn.execute( + "INSERT INTO daemon_run(started_at, interval_seconds) VALUES(?, ?)", + (_ts(started_at), interval.total_seconds()), + ) + conn.commit() + run_id = cur.lastrowid + if run_id is None: + raise RuntimeError("failed to create daemon_run row") + return run_id + + def upsert_host(conn: sqlite3.Connection, host: HostMeta, last_seen: datetime) -> None: """단일 row 의 호스트 메타 유지. UPDATE → 0 영향이면 INSERT 패턴. @@ -109,6 +152,8 @@ def write_snapshot( ts: datetime, host: HostMeta, snap: Snapshot, + *, + run_id: int | None = None, ) -> None: """한 틱의 Snapshot + 호스트 메타를 *단일 트랜잭션* 으로 적재한다. @@ -124,12 +169,15 @@ def write_snapshot( upsert_host(conn, host, ts) if snap.gpus: conn.executemany( - "INSERT INTO gpu_sample(ts, gpu_uuid, util_pct) VALUES(?,?,?)", - [(ts_str, g.uuid, g.util_pct) for g in snap.gpus], + "INSERT INTO gpu_sample(ts, gpu_uuid, util_pct, run_id) VALUES(?,?,?,?)", + [(ts_str, g.uuid, g.util_pct, run_id) for g in snap.gpus], ) if snap.procs: conn.executemany( - "INSERT INTO proc_sample(ts, gpu_uuid, pid, mem_used_mb, loginuid_user) " - "VALUES(?,?,?,?,?)", - [(ts_str, p.gpu_uuid, p.pid, p.mem_used_mb, p.loginuid_user) for p in snap.procs], + "INSERT INTO proc_sample(ts, gpu_uuid, pid, mem_used_mb, loginuid_user, run_id) " + "VALUES(?,?,?,?,?,?)", + [ + (ts_str, p.gpu_uuid, p.pid, p.mem_used_mb, p.loginuid_user, run_id) + for p in snap.procs + ], ) diff --git a/src/gpu_usage_audit/doctor.py b/src/gpu_usage_audit/doctor.py index 2f60bd2..aaa6633 100644 --- a/src/gpu_usage_audit/doctor.py +++ b/src/gpu_usage_audit/doctor.py @@ -17,15 +17,15 @@ from typing import Literal from .nvml import NVMLNotAvailableError, _decode, _load_pynvml, nvml_init_error_message +from .paths import DEFAULT_DB_PATH, expand_path, is_default_db_path type CheckStatus = Literal["ok", "warning", "error", "skipped"] type ReadinessMode = Literal["host", "unsupported"] type Which = Callable[[str], str | None] DEFAULT_COMMAND_TIMEOUT_SECONDS = 3.0 -DEFAULT_DB_PATH = Path("/tmp/gua.db") COLLECT_COMMAND = "gua daemon --interval 30s" -REPORT_COMMAND = "gua report --since 1h --interval 30s" +REPORT_COMMAND = "gua report --since 1h" @dataclass(slots=True) @@ -392,10 +392,10 @@ def check_nvml(info: NVMLInfo) -> DoctorCheck: def probe_default_db(db_path: str | Path = DEFAULT_DB_PATH) -> tuple[DatabaseInfo, DoctorCheck]: - path = Path(db_path) + path = expand_path(db_path) display_path = str(path) parent = path.parent - is_default = path == DEFAULT_DB_PATH + is_default = is_default_db_path(path) try: exists = path.exists() is_file = path.is_file() if exists else False @@ -432,12 +432,18 @@ def probe_default_db(db_path: str | Path = DEFAULT_DB_PATH) -> tuple[DatabaseInf size_bytes=size_bytes, error=error, ) - if exists and is_file: - status: CheckStatus = "warning" + if exists and is_file and is_default: + status: CheckStatus = "ok" + summary = "present; daemon will append, report can read it" + elif exists and is_file: + status = "warning" summary = "present; daemon will refuse this path, report can read it" elif exists: status = "error" summary = "present but is not a regular file" + elif not parent_exists and is_default: + status = "ok" + summary = f"absent, parent directory will be created: {parent}" elif not parent_exists: status = "error" summary = f"absent, parent directory does not exist: {parent}" @@ -605,9 +611,9 @@ def _recommended_commands_for(report: DoctorReport) -> dict[str, str]: database = checks["default_db"].details db_path = str(database.get("path", DEFAULT_DB_PATH)) report_command = _report_command(db_path) - if database.get("exists") is True: + if database.get("exists") is True and database.get("is_default") is not True: return {"report": report_command} - if database.get("parent_writable") is False: + if database.get("parent_writable") is False and database.get("is_default") is not True: return {} return { "collect": _collect_command(db_path), @@ -616,15 +622,15 @@ def _recommended_commands_for(report: DoctorReport) -> dict[str, str]: def _collect_command(db_path: str) -> str: - if Path(db_path) == DEFAULT_DB_PATH: + if is_default_db_path(db_path): return COLLECT_COMMAND return f"gua daemon --db {shlex.quote(db_path)} --interval 30s" def _report_command(db_path: str) -> str: - if Path(db_path) == DEFAULT_DB_PATH: + if is_default_db_path(db_path): return REPORT_COMMAND - return f"gua report --db {shlex.quote(db_path)} --since 1h --interval 30s" + return f"gua report --db {shlex.quote(db_path)} --since 1h" def _short_error(result: CommandResult) -> str: @@ -660,16 +666,20 @@ def _unsupported_blockers(facts: DetectionFacts) -> list[str]: blockers.append("NVML initialized but reported zero GPUs.") if facts.database.exists and not facts.database.is_file: blockers.append(f"{facts.database.path} exists but is not a regular file.") - elif not facts.database.exists and not facts.database.parent_exists: + elif ( + not facts.database.exists + and not facts.database.parent_exists + and not facts.database.is_default + ): blockers.append(f"The parent directory for {facts.database.path} does not exist.") return blockers def _host_warnings(facts: DetectionFacts) -> list[str]: warnings: list[str] = [] - if facts.database.exists and facts.database.is_file: + if facts.database.exists and facts.database.is_file and not facts.database.is_default: warnings.append( - f"{facts.database.path} already exists; `gua daemon` will refuse " + f"{facts.database.path} already exists; `gua daemon --db PATH` will refuse " "this path until it is removed or another --db path is provided." ) elif ( diff --git a/src/gpu_usage_audit/paths.py b/src/gpu_usage_audit/paths.py new file mode 100644 index 0000000..842d934 --- /dev/null +++ b/src/gpu_usage_audit/paths.py @@ -0,0 +1,20 @@ +"""Default filesystem locations for gua runtime state.""" + +from __future__ import annotations + +from pathlib import Path + +DEFAULT_STATE_DIR = Path.home() / ".gua" +DEFAULT_DB_PATH = DEFAULT_STATE_DIR / "gua.db" +DEFAULT_PID_PATH = DEFAULT_STATE_DIR / "gua.pid" +DEFAULT_LOG_PATH = DEFAULT_STATE_DIR / "gua.log" + + +def expand_path(path: str | Path) -> Path: + """Expand user-facing filesystem path arguments.""" + return Path(path).expanduser() + + +def is_default_db_path(path: str | Path) -> bool: + """Return whether a path points at gua's default history database.""" + return expand_path(path) == DEFAULT_DB_PATH diff --git a/src/gpu_usage_audit/render.py b/src/gpu_usage_audit/render.py index b9a82c9..9721717 100644 --- a/src/gpu_usage_audit/render.py +++ b/src/gpu_usage_audit/render.py @@ -71,7 +71,10 @@ def render_headline( def render_idle_capacity(w: TextIO, idle_capacity: IdleCapacity) -> None: print(file=w) print("§2 Idle capacity", file=w) - print(" converted from card-ticks to GPU-hours using the report --interval", file=w) + print( + f" converted from card-ticks to GPU-hours using {idle_capacity.interval_source}", + file=w, + ) if idle_capacity.samples == 0: print(" (no samples in window)", file=w) return diff --git a/src/gpu_usage_audit/report.py b/src/gpu_usage_audit/report.py index 89b449d..788f4b5 100644 --- a/src/gpu_usage_audit/report.py +++ b/src/gpu_usage_audit/report.py @@ -20,6 +20,8 @@ from .db import _ts from .model import HostRow +LEGACY_INTERVAL_FALLBACK = timedelta(seconds=30) + def load_host(conn: sqlite3.Connection) -> HostRow: """단일 host row 를 읽는다. row 없으면 *빈* HostRow — 헤더가 "host @@ -42,13 +44,15 @@ def load_host(conn: sqlite3.Connection) -> HostRow: # truly-idle 로 집계됨. INNER JOIN 이었다면 통째로 사라짐. HEADLINE_QUERY = """ WITH s AS ( - SELECT gs.gpu_uuid, gs.ts, gs.util_pct, + SELECT gs.gpu_uuid, gs.ts, gs.run_id, gs.util_pct, COALESCE(SUM(ps.mem_used_mb), 0) AS proc_mem_mb FROM gpu_sample gs LEFT JOIN proc_sample ps - ON ps.gpu_uuid = gs.gpu_uuid AND ps.ts = gs.ts + ON ps.gpu_uuid = gs.gpu_uuid + AND ps.ts = gs.ts + AND (ps.run_id = gs.run_id OR (ps.run_id IS NULL AND gs.run_id IS NULL)) WHERE gs.ts >= ? - GROUP BY gs.gpu_uuid, gs.ts + GROUP BY gs.gpu_uuid, gs.ts, gs.run_id ) SELECT AVG(CASE WHEN util_pct >= 10 THEN 1.0 ELSE 0.0 END) AS active, @@ -91,24 +95,33 @@ def load_headline(conn: sqlite3.Connection, cutoff: datetime) -> Headline: # truly-idle 은 실제로 비어 있는 용량이다. # equiv_gpus = 상태 비율 × 카드 수. "8장 중 3.2장이 해당 상태였다" 식. # 카드 수는 *gpu_sample 에서 distinct* 로 추론 — v2 는 별도 gpu 인벤토리 -# 테이블이 없음 (단순화). interval 은 Python 에서 인자로 받는다 — -# 데몬과 report 가 *같은* interval 을 약속해야 의미가 맞음. +# 테이블이 없음 (단순화). 새 DB 는 daemon_run.interval_seconds 를 샘플별로 +# 조인해 GPU-hours 를 계산한다. run_id 가 없는 legacy row 는 report --interval +# 또는 30s fallback 을 사용한다. IDLE_CAPACITY_QUERY = """ WITH s AS ( - SELECT gs.gpu_uuid, gs.ts, gs.util_pct, + SELECT gs.gpu_uuid, gs.ts, gs.util_pct, gs.run_id, COALESCE(SUM(ps.mem_used_mb), 0) AS proc_mem_mb FROM gpu_sample gs LEFT JOIN proc_sample ps - ON ps.gpu_uuid = gs.gpu_uuid AND ps.ts = gs.ts + ON ps.gpu_uuid = gs.gpu_uuid + AND ps.ts = gs.ts + AND (ps.run_id = gs.run_id OR (ps.run_id IS NULL AND gs.run_id IS NULL)) WHERE gs.ts >= ? - GROUP BY gs.gpu_uuid, gs.ts + GROUP BY gs.gpu_uuid, gs.ts, gs.run_id ), gpu_count AS ( SELECT COUNT(DISTINCT gpu_uuid) AS n FROM s ) SELECT - SUM(CASE WHEN util_pct < 10 AND proc_mem_mb > 100 THEN 1 ELSE 0 END) * ? / 3600.0 AS idle_held_gpu_hours, - SUM(CASE WHEN util_pct < 10 AND proc_mem_mb <= 100 THEN 1 ELSE 0 END) * ? / 3600.0 AS truly_idle_gpu_hours, + SUM( + CASE WHEN util_pct < 10 AND proc_mem_mb > 100 + THEN COALESCE(?, dr.interval_seconds, ?) ELSE 0 END + ) / 3600.0 AS idle_held_gpu_hours, + SUM( + CASE WHEN util_pct < 10 AND proc_mem_mb <= 100 + THEN COALESCE(?, dr.interval_seconds, ?) ELSE 0 END + ) / 3600.0 AS truly_idle_gpu_hours, CASE WHEN COUNT(*) = 0 THEN 0.0 ELSE SUM(CASE WHEN util_pct < 10 AND proc_mem_mb > 100 THEN 1.0 ELSE 0.0 END) / COUNT(*) * (SELECT n FROM gpu_count) @@ -119,6 +132,7 @@ def load_headline(conn: sqlite3.Connection, cutoff: datetime) -> Headline: END AS truly_idle_equiv_gpus, COUNT(*) AS samples FROM s +LEFT JOIN daemon_run dr ON dr.id = s.run_id """ @@ -129,15 +143,20 @@ class IdleCapacity: idle_held_equiv_gpus: float = 0.0 truly_idle_equiv_gpus: float = 0.0 samples: int = 0 + interval_source: str = "recorded daemon interval (legacy rows fall back to 30s)" def load_idle_capacity( conn: sqlite3.Connection, cutoff: datetime, - interval: timedelta, + interval: timedelta | None = None, ) -> IdleCapacity: - interval_s = interval.total_seconds() - row = conn.execute(IDLE_CAPACITY_QUERY, (_ts(cutoff), interval_s, interval_s)).fetchone() + override_s = interval.total_seconds() if interval is not None else None + fallback_s = LEGACY_INTERVAL_FALLBACK.total_seconds() + row = conn.execute( + IDLE_CAPACITY_QUERY, + (_ts(cutoff), override_s, fallback_s, override_s, fallback_s), + ).fetchone() if row is None: return IdleCapacity() idle_held_h, truly_idle_h, idle_held_equiv, truly_idle_equiv, samples = row @@ -147,6 +166,7 @@ def load_idle_capacity( idle_held_equiv_gpus=idle_held_equiv or 0.0, truly_idle_equiv_gpus=truly_idle_equiv or 0.0, samples=samples, + interval_source=_interval_source(interval), ) @@ -163,10 +183,12 @@ def load_idle_capacity( COUNT(*) AS samples FROM gpu_sample gs LEFT JOIN ( - SELECT gpu_uuid, ts, SUM(mem_used_mb) AS proc_mem + SELECT gpu_uuid, ts, run_id, SUM(mem_used_mb) AS proc_mem FROM proc_sample - GROUP BY gpu_uuid, ts -) ps ON ps.gpu_uuid = gs.gpu_uuid AND ps.ts = gs.ts + GROUP BY gpu_uuid, ts, run_id +) ps ON ps.gpu_uuid = gs.gpu_uuid + AND ps.ts = gs.ts + AND (ps.run_id = gs.run_id OR (ps.run_id IS NULL AND gs.run_id IS NULL)) WHERE gs.ts >= ? GROUP BY gs.gpu_uuid ORDER BY gs.gpu_uuid @@ -208,18 +230,22 @@ def load_per_gpu(conn: sqlite3.Connection, cutoff: datetime) -> list[PerGPU]: COALESCE(loginuid_user, 'unknown') AS identity, gpu_uuid, ts, + run_id, SUM(mem_used_mb) AS mem_used_mb FROM proc_sample WHERE ts >= ? - GROUP BY identity, gpu_uuid, ts + GROUP BY identity, gpu_uuid, ts, run_id ) SELECT owned.identity AS identity, - COUNT(*) * ? / 3600.0 AS gpu_hours, + SUM(COALESCE(?, dr.interval_seconds, ?)) / 3600.0 AS gpu_hours, AVG(CASE WHEN gs.util_pct < 10 AND owned.mem_used_mb > 100 THEN 1.0 ELSE 0.0 END) AS idle_held, COUNT(*) AS samples FROM owned -JOIN gpu_sample gs ON gs.gpu_uuid = owned.gpu_uuid AND gs.ts = owned.ts +JOIN gpu_sample gs ON gs.gpu_uuid = owned.gpu_uuid + AND gs.ts = owned.ts + AND (owned.run_id = gs.run_id OR (owned.run_id IS NULL AND gs.run_id IS NULL)) +LEFT JOIN daemon_run dr ON dr.id = COALESCE(owned.run_id, gs.run_id) GROUP BY identity ORDER BY gpu_hours DESC LIMIT 10 @@ -237,11 +263,13 @@ class TopIdentity: def load_top_identities( conn: sqlite3.Connection, cutoff: datetime, - interval: timedelta, + interval: timedelta | None = None, ) -> list[TopIdentity]: out: list[TopIdentity] = [] + override_s = interval.total_seconds() if interval is not None else None + fallback_s = LEGACY_INTERVAL_FALLBACK.total_seconds() for identity, gpu_hours, idle_held, samples in conn.execute( - TOP_IDENTITIES_QUERY, (_ts(cutoff), interval.total_seconds()) + TOP_IDENTITIES_QUERY, (_ts(cutoff), override_s, fallback_s) ): out.append( TopIdentity( @@ -254,6 +282,12 @@ def load_top_identities( return out +def _interval_source(interval: timedelta | None) -> str: + if interval is None: + return "recorded daemon interval (legacy rows fall back to 30s)" + return f"report --interval ({interval})" + + # ── §5 Heatmap ────────────────────────────────────────────────── # # ts 의 *요일×시간* 으로 그루핑. substr(ts, 1, 19) 로 nano/timezone 떼고 diff --git a/tests/test_daemon.py b/tests/test_daemon.py index 94db06f..894035b 100644 --- a/tests/test_daemon.py +++ b/tests/test_daemon.py @@ -52,7 +52,10 @@ def test_run_daemon_runs_max_ticks_and_loads_rows(db: sqlite3.Connection, host: # FakeTier 는 GPU 3개를 매 틱 반환 → 3 ticks * 3 GPUs = 9 gpu_sample 행. assert db.execute("SELECT COUNT(*) FROM gpu_sample").fetchone()[0] == 9 - # host upsert 한 행. + # 한 daemon run 과 host upsert 한 행. + assert db.execute("SELECT COUNT(*) FROM daemon_run").fetchone()[0] == 1 + assert db.execute("SELECT interval_seconds FROM daemon_run").fetchone()[0] == 0.02 + assert db.execute("SELECT COUNT(DISTINCT run_id) FROM gpu_sample").fetchone()[0] == 1 assert db.execute("SELECT COUNT(*) FROM host").fetchone()[0] == 1 # 콘솔 출력: 3 틱 모두 "Tick N" 줄. diff --git a/tests/test_db.py b/tests/test_db.py index 7538e47..a8e9aee 100644 --- a/tests/test_db.py +++ b/tests/test_db.py @@ -7,12 +7,12 @@ import sqlite3 from collections.abc import Iterator -from datetime import UTC, datetime +from datetime import UTC, datetime, timedelta from pathlib import Path import pytest -from gpu_usage_audit.db import open_db, write_snapshot +from gpu_usage_audit.db import open_db, start_daemon_run, write_snapshot from gpu_usage_audit.model import GPUSample, HostMeta, ProcSample, Snapshot @@ -47,12 +47,37 @@ def test_open_db_enables_wal_and_creates_indexes(db: sqlite3.Connection) -> None assert {"idx_gpu_sample_uuid_ts", "idx_proc_sample_uuid_ts"} <= idx -def test_open_db_creates_three_tables(db: sqlite3.Connection) -> None: +def test_open_db_creates_runtime_tables(db: sqlite3.Connection) -> None: tables = { row[0] for row in db.execute("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name") } - assert {"host", "gpu_sample", "proc_sample"} <= tables + assert {"host", "daemon_run", "gpu_sample", "proc_sample"} <= tables + + +def test_start_daemon_run_and_write_snapshot_store_run_id( + db: sqlite3.Connection, + host: HostMeta, +) -> None: + run_id = start_daemon_run( + db, + datetime(2026, 5, 11, 12, 0, 0, tzinfo=UTC), + timedelta(seconds=17), + ) + write_snapshot( + db, + datetime(2026, 5, 11, 12, 0, 1, tzinfo=UTC), + host, + Snapshot( + gpus=[GPUSample(uuid="GPU-0", util_pct=2)], + procs=[ProcSample(gpu_uuid="GPU-0", pid=100, mem_used_mb=70000)], + ), + run_id=run_id, + ) + + assert db.execute("SELECT interval_seconds FROM daemon_run").fetchone()[0] == 17.0 + assert db.execute("SELECT run_id FROM gpu_sample").fetchone()[0] == run_id + assert db.execute("SELECT run_id FROM proc_sample").fetchone()[0] == run_id def test_write_snapshot_inserts_rows(db: sqlite3.Connection, host: HostMeta) -> None: diff --git a/tests/test_doctor.py b/tests/test_doctor.py index 8aabdd6..45c2650 100644 --- a/tests/test_doctor.py +++ b/tests/test_doctor.py @@ -260,9 +260,7 @@ def test_custom_db_path_is_rendered_and_shell_quoted(tmp_path: Path) -> None: quoted = f"'{db_path}'" assert f"target: {db_path}" in rendered assert f"collect: gua daemon --db {quoted} --interval 30s" in rendered - assert ( - f"report after collecting: gua report --db {quoted} --since 1h --interval 30s" in rendered - ) + assert f"report after collecting: gua report --db {quoted} --since 1h" in rendered def test_nvidia_smi_counts_mig_instances(tmp_path: Path) -> None: diff --git a/tests/test_report.py b/tests/test_report.py index 3b2aaef..5b93519 100644 --- a/tests/test_report.py +++ b/tests/test_report.py @@ -20,7 +20,7 @@ import pytest -from gpu_usage_audit.db import open_db, write_snapshot +from gpu_usage_audit.db import open_db, start_daemon_run, write_snapshot from gpu_usage_audit.model import GPUSample, HostMeta, HostRow, ProcSample, Snapshot from gpu_usage_audit.report import ( load_headline, @@ -150,6 +150,65 @@ def test_load_idle_capacity(db_loaded: sqlite3.Connection) -> None: assert _close(idle_capacity.truly_idle_equiv_gpus, 0.5) +def test_load_idle_capacity_uses_recorded_interval_by_default(tmp_path: Path) -> None: + conn = open_db(tmp_path / "recorded-interval.db") + try: + host = _fixture_host() + run_a = start_daemon_run(conn, BASE, timedelta(seconds=5)) + run_b = start_daemon_run(conn, BASE + timedelta(minutes=1), timedelta(seconds=20)) + write_snapshot( + conn, + BASE, + host, + Snapshot( + gpus=[GPUSample(uuid="GPU-0", util_pct=2)], + procs=[ProcSample(gpu_uuid="GPU-0", pid=100, mem_used_mb=70000)], + ), + run_id=run_a, + ) + write_snapshot( + conn, + BASE + timedelta(minutes=1), + host, + Snapshot( + gpus=[GPUSample(uuid="GPU-0", util_pct=2)], + procs=[ProcSample(gpu_uuid="GPU-0", pid=100, mem_used_mb=70000)], + ), + run_id=run_b, + ) + + idle_capacity = load_idle_capacity(conn, BASE) + assert idle_capacity.samples == 2 + assert _close(idle_capacity.idle_held_gpu_hours, (5 + 20) / 3600) + + identities = load_top_identities(conn, BASE) + assert len(identities) == 1 + assert _close(identities[0].gpu_hours, (5 + 20) / 3600) + finally: + conn.close() + + +def test_load_idle_capacity_interval_override_wins_over_recorded_interval(tmp_path: Path) -> None: + conn = open_db(tmp_path / "override-interval.db") + try: + run_id = start_daemon_run(conn, BASE, timedelta(seconds=5)) + write_snapshot( + conn, + BASE, + _fixture_host(), + Snapshot( + gpus=[GPUSample(uuid="GPU-0", util_pct=2)], + procs=[ProcSample(gpu_uuid="GPU-0", pid=100, mem_used_mb=70000)], + ), + run_id=run_id, + ) + + idle_capacity = load_idle_capacity(conn, BASE, timedelta(seconds=30)) + assert _close(idle_capacity.idle_held_gpu_hours, 30 / 3600) + finally: + conn.close() + + # ── load_per_gpu ──────────────────────────────────────────────── diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 61b7fb5..0951611 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -18,7 +18,6 @@ from gpu_usage_audit import __version__ from gpu_usage_audit.__main__ import ( - DEFAULT_DB_PATH, DISPLAY_COMMAND_ENV, _duration, _pid_is_managed_daemon, @@ -30,6 +29,7 @@ ) from gpu_usage_audit.doctor import DoctorCheck, DoctorPlan, DoctorReport from gpu_usage_audit.nvml import NVMLNotAvailableError +from gpu_usage_audit.paths import DEFAULT_DB_PATH def test_version_string_is_nonempty() -> None: @@ -45,10 +45,12 @@ def test_parser_registers_subcommands() -> None: assert ns.command == cmd -def test_daemon_and_report_default_to_tmp_gua_db() -> None: +def test_daemon_and_report_default_to_home_gua_db() -> None: p = build_parser() + assert Path.home() / ".gua" / "gua.db" == DEFAULT_DB_PATH assert p.parse_args(["daemon"]).db == str(DEFAULT_DB_PATH) assert p.parse_args(["report"]).db == str(DEFAULT_DB_PATH) + assert p.parse_args(["report"]).interval is None def test_pyproject_registers_gua_entry_point() -> None: @@ -74,6 +76,7 @@ def test_gua_parser_registers_command_surface() -> None: for cmd in ("daemon", "start", "status", "stop", "report", "demo", "version", "help"): ns = p.parse_args([cmd]) assert ns.command == cmd + assert p.parse_args(["report"]).interval is None def _required_args_for(cmd: str) -> list[str]: