diff --git a/build.zig.zon b/build.zig.zon index 686704a5..355db3ee 100644 --- a/build.zig.zon +++ b/build.zig.zon @@ -20,8 +20,17 @@ // the engine module never fetch it — it is reached only by the // `spec`/`test` steps via `b.lazyDependency` in build.zig. .zspec = .{ - .url = "https://github.com/apotema/zspec/archive/v0.9.1.tar.gz", - .hash = "zspec-0.9.1-jaKLbbX4AwBKANdetxzzWc3UTO0UY0lcJJzTagQHlt5K", + // v0.9.2 is the first release that initializes + // `std.testing.io_instance` in the runner (apotema/zspec#45). + // The previously-pinned v0.9.1 left it as `undefined`, so the + // first `std.testing.io.*` call in a spec — e.g. `tmpDir()`'s + // inner `io.random()` — deadlocked on `mutexLock` because the + // mutex's state was the `0xaaaaaaaa` undefined-memory pattern + // instead of `.unlocked`. macOS/Windows happened to mask the + // bug (different memory-init paths); the x86_64-linux runner + // hit it deterministically. Closes #583. + .url = "https://github.com/apotema/zspec/archive/v0.9.2.tar.gz", + .hash = "zspec-0.9.1-jaKLbXgMBACFwbNjflhbMyP113leoYjboxn-1UOP-FGw", .lazy = true, }, }, diff --git a/spec/spec_tests.zig b/spec/spec_tests.zig index 36b414d8..1912b815 100644 --- a/spec/spec_tests.zig +++ b/spec/spec_tests.zig @@ -4,56 +4,18 @@ //! every `pub const`-exported spec struct is discovered by //! `zspec.runAll`. Add new spec files as imports + re-exports below. -const builtin = @import("builtin"); const zspec = @import("zspec"); pub const unified_format_spec = @import("unified_format_spec.zig"); pub const override_merge_spec = @import("override_merge_spec.zig"); pub const tree_walker_spec = @import("tree_walker_spec.zig"); +pub const registry_scan_spec = @import("registry_scan_spec.zig"); // Re-export the spec structs so zspec discovers them. 
pub const UnifiedFormatSpec = unified_format_spec.UnifiedFormatSpec; pub const OverrideMergeSpec = override_merge_spec.OverrideMergeSpec; pub const TreeWalkerSpec = tree_walker_spec.TreeWalkerSpec; - -// ── registry_scan_spec — gated off on Linux while #585 investigates ───── -// -// The spec_tests binary deadlocks on the GitHub Actions ubuntu-latest -// runner the first time a `registry_scan_spec` test's `tests:before` -// runs — concretely, just after the last `tree_walker_spec` test -// prints. Every Ubuntu CI run since #582 / #577 hung at exactly that -// boundary, blowing through 30+ minutes of runner minutes before -// timing out; Windows runs (which never reach this binary) succeeded -// in <1 minute. Reproduced under `docker --platform=linux/amd64 -// --cpus=2 -m 7g ubuntu:24.04` (exit code 124 at the same boundary); -// did NOT reproduce on macOS/arm64 Docker, so the trigger is -// architecture- or scheduler-specific to the x86_64-linux runner. -// -// The spec exercises real-filesystem walks (`Bridge.loadScene` -> -// `prefab_cache.scanDir`) layered on `std.testing.io` setup files -// (each `tests:before` calls `tmpDir().createDir(io, ...)` then -// `writeFile(io, ...)` then later `Bridge.loadScene` which fires -// `io_helper.io()` for the first time). That combination is what -// hangs. Until the root cause is pinned down (separate ticket), -// gate the spec out of the aggregator on Linux only so: -// - Ubuntu CI is unblocked immediately, -// - macOS / Windows local dev still runs the full registry-scan -// coverage from RFC #561 / #577, -// - the spec file stays in the tree (kept import-able from -// other entry points so the new behaviour is still reachable). -// -// Conditional `@import` keeps the file out of `builtin.test_functions` -// on Linux — Zig only walks `test` blocks in files reachable from -// the test root, so an unreferenced `@import` doesn't pull them in. 
-// -// TODO(#585-followup): repro under a tighter `act` setup, identify -// which `std.Io.Dir` op (`tmpDir`, `createDir`, `writeFile`, -// `realPath`, or the engine-side `walker.next`) blocks forever on -// x86_64-linux runners with 2 CPUs, and remove this gate. -pub const RegistryScanSpec = if (builtin.os.tag == .linux) - struct {} -else - @import("registry_scan_spec.zig").RegistryScanSpec; +pub const RegistryScanSpec = registry_scan_spec.RegistryScanSpec; test { zspec.runAll(@This()); diff --git a/src/io_helper.zig b/src/io_helper.zig index c76bbd7e..a0c23ca6 100644 --- a/src/io_helper.zig +++ b/src/io_helper.zig @@ -26,18 +26,16 @@ pub fn io() std.Io { return std.Io.failing; } // Inside a test binary, reuse the test runner's `std.testing.io_instance` - // rather than spinning up a second `Io.Threaded` pool. Two pools in the - // same process meant two `sigaction(.IO, ...)` installs racing on the - // same global signal slot — the second wins, leaving the first pool's - // worker threads unable to be interrupted out of blocking syscalls. - // On a constrained Linux runner (e.g. GitHub Actions ubuntu-latest, - // 2 CPUs → 1 worker per pool) that manifested as a hard deadlock - // the first time a test exercised both `std.testing.io` (e.g. - // `tmpDir().createDir(io, ...)`) and an engine codepath that calls - // `loadScene` → `prefab_cache.scanDir` → `io_helper.io()`. The CI - // hang reproduced exactly there (registry_scan_spec, post-#577). - // Sharing one pool also removes the non-atomic lazy-init race that - // was sitting in this file regardless. + // rather than spinning up a second `Io.Threaded` pool. This keeps engine + // codepaths that hit `io_helper.io()` (e.g. `loadScene` -> + // `prefab_cache.scanDir`) on the same pool the test-side `std.testing.io` + // calls use, sidestepping a second `sigaction(.IO, ...)` install and the + // non-atomic lazy-init race that was sitting in this file regardless. 
+ // The CI hang investigated under #583 turned out to be unrelated to the + // dual-pool concern (the zspec v0.9.1 runner never initialized + // `std.testing.io_instance`, so the first `std.testing.io.*` call + // deadlocked on a mutex whose state was the `0xaaaaaaaa` undefined-memory pattern); the upgrade to v0.9.2 fixes it. + // This shared-pool path remains as defence-in-depth. if (builtin.is_test) { return std.testing.io_instance.io(); }