From f589c78da465666c0cd0d02454bc38409bc484f0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Mondaini=20Calv=C3=A3o?= Date: Tue, 26 May 2026 09:36:48 -0300 Subject: [PATCH 1/2] fix(deps): bump zspec to v0.9.2 to init std.testing.io_instance (closes #583) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CI hang under registry_scan_spec on x86_64-linux had nothing to do with `Io.Threaded` worker-pool exhaustion or `prefab_cache.scanDir` itself — the engine's pinned `zspec` was v0.9.1, the release *before* apotema/zspec#45 ("init std.testing.io_instance"). With v0.9.1 the spec runner never ran `std.testing.io_instance = .init(allocator, .{})`, so the global stayed as Zig's `undefined`-pattern `0xaaaaaaaa…` bytes. The first `std.testing.io.*` call any spec made — concretely `tmpDir()`'s opening `io.random(...)` — deadlocked deterministically: 1. `random(userdata=&io_instance, …)` casts the uninitialized bytes to `*Threaded` and hands them to `randomMainThread`. 2. `randomMainThread` calls `mutexLock(&t.mutex)`. 3. `t.mutex.state.raw == 0xaaaaaaaa` — neither `.unlocked` (0) nor `.locked_once` (1) nor `.contended` (2). The opening `cmpxchgStrong(.unlocked, .locked_once, …)` fails; the `swap(.contended, .acquire)` returns the garbage value, which compares `!= .unlocked`, so the thread enters `Thread.futexWaitUncancelable` on a futex no other thread will ever wake. The process has one TID parked in `futex_wait_queue` forever (confirmed via `/proc/$pid/task` under `docker --platform=linux/amd64 --cpus=2 ubuntu:24.04`). macOS and Windows masked the bug because their memory-init paths gave the mutex bytes a value `mutexLock` could recover from — the exact same bug, but only deterministic on x86_64-linux Debug. apotema/zspec#45 fixes it by initializing `io_instance` once in the runner's `main()`. The first release containing it is v0.9.2; this commit bumps the dependency hash to point there. 
Verification: - `docker --platform=linux/amd64 --cpus=2 -m 7g`: `zig build spec` now passes 28/28 in 102 ms (was: hang → CI timeout / exit 124). - macOS arm64: `zig build spec` 28/28 in 46 ms; `zig build test` full suite green in 16 s. - Docker amd64 `zig build test` full suite green in 1 m 22 s — well within the 10-min CI timeout that #585 added. Drops the Linux gate that #585 introduced as an emergency workaround; re-exports `RegistryScanSpec` unconditionally so every platform now runs the full RFC #561 / #577 registry-scan coverage. Also corrects the `io_helper.io()` comment, which had attributed the original hang to dual-pool `sigaction` racing — the real cause is documented above. --- build.zig.zon | 13 +++++++++++-- spec/spec_tests.zig | 42 ++---------------------------------------- src/io_helper.zig | 22 ++++++++++------------ 3 files changed, 23 insertions(+), 54 deletions(-) diff --git a/build.zig.zon b/build.zig.zon index 686704a5..355db3ee 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -20,8 +20,17 @@ // the engine module never fetch it — it is reached only by the // `spec`/`test` steps via `b.lazyDependency` in build.zig. .zspec = .{ - .url = "https://github.com/apotema/zspec/archive/v0.9.1.tar.gz", - .hash = "zspec-0.9.1-jaKLbbX4AwBKANdetxzzWc3UTO0UY0lcJJzTagQHlt5K", + // v0.9.2 is the first release that initializes + // `std.testing.io_instance` in the runner (apotema/zspec#45). + // The previously-pinned v0.9.1 left it as `undefined`, so the + // first `std.testing.io.*` call in a spec — e.g. `tmpDir()`'s + // inner `io.random()` — deadlocked on `mutexLock` because the + // mutex's state was the `0xaaaaaaaa` undefined-memory pattern + // instead of `.unlocked`. macOS/Windows happened to mask the + // bug (different memory-init paths); the x86_64-linux runner + // hit it deterministically. Closes #583. 
+ .url = "https://github.com/apotema/zspec/archive/v0.9.2.tar.gz", + .hash = "zspec-0.9.1-jaKLbXgMBACFwbNjflhbMyP113leoYjboxn-1UOP-FGw", .lazy = true, }, }, diff --git a/spec/spec_tests.zig b/spec/spec_tests.zig index 36b414d8..1912b815 100644 --- a/spec/spec_tests.zig +++ b/spec/spec_tests.zig @@ -4,56 +4,18 @@ //! every `pub const`-exported spec struct is discovered by //! `zspec.runAll`. Add new spec files as imports + re-exports below. -const builtin = @import("builtin"); const zspec = @import("zspec"); pub const unified_format_spec = @import("unified_format_spec.zig"); pub const override_merge_spec = @import("override_merge_spec.zig"); pub const tree_walker_spec = @import("tree_walker_spec.zig"); +pub const registry_scan_spec = @import("registry_scan_spec.zig"); // Re-export the spec structs so zspec discovers them. pub const UnifiedFormatSpec = unified_format_spec.UnifiedFormatSpec; pub const OverrideMergeSpec = override_merge_spec.OverrideMergeSpec; pub const TreeWalkerSpec = tree_walker_spec.TreeWalkerSpec; - -// ── registry_scan_spec — gated off on Linux while #585 investigates ───── -// -// The spec_tests binary deadlocks on the GitHub Actions ubuntu-latest -// runner the first time a `registry_scan_spec` test's `tests:before` -// runs — concretely, just after the last `tree_walker_spec` test -// prints. Every Ubuntu CI run since #582 / #577 hung at exactly that -// boundary, blowing through 30+ minutes of runner minutes before -// timing out; Windows runs (which never reach this binary) succeeded -// in <1 minute. Reproduced under `docker --platform=linux/amd64 -// --cpus=2 -m 7g ubuntu:24.04` (exit code 124 at the same boundary); -// did NOT reproduce on macOS/arm64 Docker, so the trigger is -// architecture- or scheduler-specific to the x86_64-linux runner. 
-// -// The spec exercises real-filesystem walks (`Bridge.loadScene` -> -// `prefab_cache.scanDir`) layered on `std.testing.io` setup files -// (each `tests:before` calls `tmpDir().createDir(io, ...)` then -// `writeFile(io, ...)` then later `Bridge.loadScene` which fires -// `io_helper.io()` for the first time). That combination is what -// hangs. Until the root cause is pinned down (separate ticket), -// gate the spec out of the aggregator on Linux only so: -// - Ubuntu CI is unblocked immediately, -// - macOS / Windows local dev still runs the full registry-scan -// coverage from RFC #561 / #577, -// - the spec file stays in the tree (kept import-able from -// other entry points so the new behaviour is still reachable). -// -// Conditional `@import` keeps the file out of `builtin.test_functions` -// on Linux — Zig only walks `test` blocks in files reachable from -// the test root, so an unreferenced `@import` doesn't pull them in. -// -// TODO(#585-followup): repro under a tighter `act` setup, identify -// which `std.Io.Dir` op (`tmpDir`, `createDir`, `writeFile`, -// `realPath`, or the engine-side `walker.next`) blocks forever on -// x86_64-linux runners with 2 CPUs, and remove this gate. -pub const RegistryScanSpec = if (builtin.os.tag == .linux) - struct {} -else - @import("registry_scan_spec.zig").RegistryScanSpec; +pub const RegistryScanSpec = registry_scan_spec.RegistryScanSpec; test { zspec.runAll(@This()); diff --git a/src/io_helper.zig b/src/io_helper.zig index c76bbd7e..a0c23ca6 100644 --- a/src/io_helper.zig +++ b/src/io_helper.zig @@ -26,18 +26,16 @@ pub fn io() std.Io { return std.Io.failing; } // Inside a test binary, reuse the test runner's `std.testing.io_instance` - // rather than spinning up a second `Io.Threaded` pool. 
Two pools in the - // same process meant two `sigaction(.IO, ...)` installs racing on the - // same global signal slot — the second wins, leaving the first pool's - // worker threads unable to be interrupted out of blocking syscalls. - // On a constrained Linux runner (e.g. GitHub Actions ubuntu-latest, - // 2 CPUs → 1 worker per pool) that manifested as a hard deadlock - // the first time a test exercised both `std.testing.io` (e.g. - // `tmpDir().createDir(io, ...)`) and an engine codepath that calls - // `loadScene` → `prefab_cache.scanDir` → `io_helper.io()`. The CI - // hang reproduced exactly there (registry_scan_spec, post-#577). - // Sharing one pool also removes the non-atomic lazy-init race that - // was sitting in this file regardless. + // rather than spinning up a second `Io.Threaded` pool. This keeps engine + // codepaths that hit `io_helper.io()` (e.g. `loadScene` -> + // `prefab_cache.scanDir`) on the same pool the test-side `std.testing.io` + // calls use, sidestepping a second `sigaction(.IO, ...)` install and the + // non-atomic lazy-init race that was sitting in this file regardless. + // The CI hang investigated under #583 turned out to be unrelated to the + // dual-pool concern (the zspec v0.9.1 runner never initialized + // `std.testing.io_instance`, so the first `std.testing.io.*` call + // deadlocked on a `0xaaaaaaaa` mutex); the upgrade to v0.9.2 fixes it. + // This shared-pool path remains as defence-in-depth. if (builtin.is_test) { return std.testing.io_instance.io(); } From 0b3250a7771bd7267345b1600b162219bd5cf527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexandre=20Mondaini=20Calv=C3=A3o?= Date: Tue, 26 May 2026 10:42:07 -0300 Subject: [PATCH 2/2] ci: retrigger after engine v1.45.0 tag