diff --git a/.github/workflows/guix-reproducibility.yml b/.github/workflows/guix-reproducibility.yml new file mode 100644 index 000000000..81b613a1c --- /dev/null +++ b/.github/workflows/guix-reproducibility.yml @@ -0,0 +1,253 @@ +name: Guix reproducible build smoke + +# Verify that the contrib/guix pipeline still produces byte-identical +# cuprated artifacts across two independent runs. Runs on: +# - every PR touching the pipeline itself or workspace Cargo +# metadata (catches a regression in the toolchain pin, scripts, +# or any vendored crate that affects determinism) +# - workflow_dispatch (manual; use this when changing crate source +# code without touching the pipeline) +# +# We do NOT run on every source-tree change. The smoke job takes +# ~25-35 min on a stock ubuntu-24.04 runner and we'd burn that budget +# on every PR otherwise; the mechanical native-flag grep below catches +# the most common regression class anyway. If a change touches a crate +# that ends up linking into cuprated and might affect determinism, +# trigger the workflow manually before merging. + +on: + pull_request: + paths: + - 'contrib/guix/**' + - 'Cargo.toml' + - 'Cargo.lock' + - '.github/workflows/guix-reproducibility.yml' + workflow_dispatch: + schedule: + # Weekly smoke run on the default branch. The path-filtered PR + # trigger above intentionally skips source-only PRs (to keep CI + # cost bounded); this catches drift those PRs would introduce. + - cron: '0 7 * * 1' # Mondays 07:00 UTC + +env: + # Guix binary tarball: pin BOTH the SHA256 of the bytes and the GPG + # signer fingerprint. The SHA256 is the primary trust anchor (a + # mismatched tarball fails the workflow before any key import). The + # signature verification is defense in depth and forces a maintainer + # to update both pins together when bumping Guix versions. 
+ GUIX_VER: '1.5.0' + GUIX_ARCH: 'x86_64-linux' + GUIX_TARBALL_SHA256: 'aa41025489c5061543e9c48873eaa829b900b2da75d40f9648913622f5f47817' + GUIX_SIGNER_FPR: 'A28BF40C3E551372662D14F741AAE7DCCA3D8351' # Efraim Flashner, Guix release signer (expires 2029-01-18) + +jobs: + smoke: + name: smoke-reproducible.sh + runs-on: ubuntu-24.04 + timeout-minutes: 90 + permissions: + contents: read + steps: + # Pinned by full commit SHA (NOT tag) - tags are mutable, commit SHAs + # are not. v6.0.2 -> de0fac2e4500dabe0009e67214ff5f5447ce83dd + # Bump together with any deliberate action update. + - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + # The build needs ~20 GiB of writable disk for Guix substitutes + # plus two cargo build trees. The default runner ships with only + # ~14 GiB free; reclaim the rest by removing pre-installed + # toolchains we don't use. Inline to avoid a third-party action. + - name: Reclaim runner disk + run: | + set -euxo pipefail + df -h / + sudo rm -rf \ + /usr/share/dotnet \ + /opt/ghc \ + /usr/local/lib/android \ + /usr/local/share/boost \ + /opt/hostedtoolcache/CodeQL \ + /opt/hostedtoolcache/Java_* \ + /opt/hostedtoolcache/Ruby \ + /opt/hostedtoolcache/PyPy \ + /opt/hostedtoolcache/go \ + /opt/hostedtoolcache/node \ + "$AGENT_TOOLSDIRECTORY" || true + sudo docker system prune -af || true + df -h / + + - name: Install Guix + run: | + set -euxo pipefail + export DEBIAN_FRONTEND=noninteractive + sudo apt-get update -qq + sudo apt-get install -y --no-install-recommends \ + xz-utils gpg gpg-agent ca-certificates curl jq + + tarball="guix-binary-${GUIX_VER}.${GUIX_ARCH}.tar.xz" + base="https://ftp.gnu.org/gnu/guix" + + # 1. Fetch tarball + detached sig. + curl -fsSL "$base/$tarball" -o "/tmp/$tarball" + curl -fsSL "$base/$tarball.sig" -o "/tmp/$tarball.sig" + + # 2. SHA256 check first - this is the primary trust anchor and + # runs before any key import. 
+ echo "$GUIX_TARBALL_SHA256 /tmp/$tarball" > /tmp/expected.sha256 + sha256sum -c /tmp/expected.sha256 + + # 3. GPG verification against the pinned signer fingerprint. + # + # Public keyservers flake on GitHub runners (we saw an empty + # response from the dirmngr default within 70ms in a previous + # run). Try a list of well-known keyservers in order, and do + # the FULL verification (--status-fd + VALIDSIG check) inside + # each iteration with a per-keyserver scratch homedir. This + # way, a malformed-but-importable key from server A doesn't + # poison the homedir and block trying server B - we only + # commit the homedir on a successful pinned VALIDSIG. + verified=0 + for ks in \ + hkps://keyserver.ubuntu.com \ + hkps://keys.openpgp.org \ + hkps://pgp.mit.edu; do + echo "Trying keyserver: $ks" + try_home="$(mktemp -d /tmp/gpg-try.XXXXXX)" + chmod 0700 "$try_home" + if gpg --homedir "$try_home" --batch --no-tty --quiet \ + --keyserver "$ks" --recv-keys "$GUIX_SIGNER_FPR" \ + >/dev/null 2>&1 \ + && gpg --homedir "$try_home" --batch --no-tty --status-fd 1 --verify \ + "/tmp/$tarball.sig" "/tmp/$tarball" 2>/dev/null \ + | tee "$try_home/status" >/dev/null \ + && grep -q "^\[GNUPG:\] VALIDSIG $GUIX_SIGNER_FPR " "$try_home/status"; then + # Commit this homedir as the trusted one. + rm -rf /tmp/gpghome + mv "$try_home" /tmp/gpghome + echo "Verified Guix tarball signature with $GUIX_SIGNER_FPR via $ks" + verified=1 + break + fi + rm -rf "$try_home" + done + if [[ "$verified" -ne 1 ]]; then + echo "FATAL: no keyserver yielded a key that verifies the Guix tarball signature against pinned fingerprint $GUIX_SIGNER_FPR" >&2 + exit 1 + fi + + # 4. Only now extract. + sudo tar --warning=no-timestamp -xJf "/tmp/$tarball" -C / + + # Create build users (Guix daemon needs an isolated UID pool). 
+ sudo groupadd --system guixbuild || true + for i in $(seq -w 1 10); do + id "guixbuilder$i" >/dev/null 2>&1 || sudo useradd \ + -g guixbuild -G guixbuild -d /var/empty -s /usr/sbin/nologin \ + -c "Guix build user $i" --system "guixbuilder$i" + done + + # Profile symlink for root + sudo mkdir -p /root/.config/guix + sudo ln -sf /var/guix/profiles/per-user/root/current-guix /root/.config/guix/current + + # Authorize substitute keys (the daemon will only accept + # substitutes signed by these). These come out of the + # verified tarball we extracted above. No `|| true`: a + # failure here would leave the daemon with no authorized + # keys, which is a real problem worth surfacing loudly. + GUIX_BIN=/var/guix/profiles/per-user/root/current-guix/bin + key_count=0 + for key in /var/guix/profiles/per-user/root/current-guix/share/guix/*.pub; do + [[ -f "$key" ]] || continue + # `cat | sudo` rather than `sudo ... < "$key"`: the + # redirect form is SC2024 (the redirect is opened by the + # unprivileged shell, not by sudo). The `cat` here is also + # unprivileged - this is purely about shellcheck-clean + # piping, not about reading a root-only key. Today the .pub + # files in the extracted tarball are world-readable, so a + # plain unprivileged read works. + cat "$key" | sudo "$GUIX_BIN/guix" archive --authorize + key_count=$((key_count + 1)) + done + if [[ "$key_count" -eq 0 ]]; then + echo "FATAL: no substitute keys authorized from verified tarball" >&2 + exit 1 + fi + echo "Authorized $key_count substitute key(s) from verified tarball" + + # Start the daemon. The `> /tmp/guix-daemon.log 2>&1` redirect + # has to happen INSIDE the sudo shell - otherwise the redirect + # is opened by the unprivileged shell (SC2024). The + # `setsid /tmp/guix-daemon.log 2>&1" > "$GITHUB_PATH" + + - name: Run smoke-reproducible.sh + run: | + set -euxo pipefail + sudo -E PATH="$PATH" ./contrib/guix/smoke-reproducible.sh + + # Mechanical regression guard. 
smoke-reproducible.sh already does + # this check itself; running it again at the workflow level makes + # the regression render directly in the job summary. The smoke + # script exports its per-run build logs to contrib/guix/smoke-logs/ + # before its cleanup runs (and `chmod a+rX` them so this non-sudo + # step can read them even when the smoke script ran under sudo). + - name: Assert no host-CPU-native flags in build logs + if: always() + run: | + set -uo pipefail + shopt -s globstar nullglob + pat='-march=native|-mcpu=native|target-cpu=native' + # Positive self-test: a known-bad line MUST trip the grep. Without + # this, a future regression in `pat` or the grep invocation could + # silently turn the guard into a no-op (the regex starts with + # `-m`, which grep parses as an option unless `--` or `-e` is + # used - exactly the trap this self-test catches). + selftest="$(mktemp)" + printf '%s\n' 'cc -march=native foo.c' > "$selftest" + if ! grep -nE -- "$pat" "$selftest" >/dev/null 2>&1; then + echo "::error::native-flag guard self-test failed; regex is broken" >&2 + rm -f "$selftest" + exit 2 + fi + rm -f "$selftest" + logs=( contrib/guix/smoke-logs/build-*.log ) + if [[ "${#logs[@]}" -eq 0 ]]; then + echo "::warning::no exported smoke logs found at contrib/guix/smoke-logs/build-*.log; native-flag scan not run" + exit 0 + fi + found=0 + for f in "${logs[@]}"; do + if grep -nE -- "$pat" "$f" >/dev/null 2>&1; then + echo "::error file=$f::native-arch flag detected" + grep -nE -- "$pat" "$f" | head -5 + found=1 + fi + done + exit "$found" + + - name: Surface daemon log on failure + if: failure() + run: tail -200 /tmp/guix-daemon.log || true + + # On failure, upload every log smoke-reproducible.sh's EXIT trap + # exported (cargo-verbose + guix-build wrapper output + mk-distsrc + # stderr, per run a/b). 
Without this the runner is torn down and + # the only artifact left is the timestamped grep at the end of the + # job log, which is rarely enough to root-cause a silent failure + # like "guix-build returned 0 but no cuprated tarball appeared". + # Pinned by full SHA (v5.0.0 -> 330a01c490aca151604b8cf639adc76d48f6c5d4). + - name: Upload smoke logs on failure + if: failure() + uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: smoke-logs + path: contrib/guix/smoke-logs/ + if-no-files-found: warn + retention-days: 7 diff --git a/.gitignore b/.gitignore index 533eaa4c7..d6507a8f9 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,6 @@ fast_sync_hashes.bin /books/user/Cuprated.toml fuzz/corpus fuzz/artifacts +/contrib/guix/out/ +/contrib/guix/.work/ +/contrib/guix/smoke-logs/ diff --git a/contrib/guix/README.md b/contrib/guix/README.md new file mode 100644 index 000000000..96ee41afb --- /dev/null +++ b/contrib/guix/README.md @@ -0,0 +1,258 @@ +# Guix reproducible build flow (cuprated) + +This directory provides a Guix-first reproducible release pipeline for +`cuprated` on Linux. The goal is that anyone with the right toolchain +can rebuild the published `cuprated--x86_64-unknown-linux-gnu.tar.gz` +from this repository and get **byte-identical** output, then verify the +release's SHA256 against the upstream publication. + +## Scope + +| | | +|---|---| +| Supported target today | `x86_64-unknown-linux-gnu` | +| Supported Guix system | `x86_64-linux` | +| Other architectures | not yet — see [Roadmap](#roadmap) | + +This is a node binary, not a miner. RandomX is built in CMake's +`ARCH=default` mode (compiler-capability-gated `-maes -mssse3 -mavx2`, +no `-march=native`), so the binary is identical across x86_64 CPUs +under the same toolchain. 
Block-verification uses RandomX's "light" +mode, which doesn't depend on CPU-specific codegen for correctness; +miners using `randomx-rs` directly are meant to opt in to host-specific +instructions via the `RANDOMX_ARCH` env var (no effect today; see [RandomX](#randomx)). + +## Build flow + +1. Create a deterministic source archive (vendored Cargo deps, fixed + mtimes/uid/gid/path-prefix), inside a hermetic Guix shell: + + ```bash + ./contrib/guix/guix-mk-distsrc x86_64-linux + ``` + +2. Build `cuprated` from the source archive using `guix time-machine` + pinned to the channel in `channels.scm`: + + ```bash + ./contrib/guix/guix-build \ + --guix-system x86_64-linux \ + --target x86_64-unknown-linux-gnu \ + --package cuprated \ + --distsrc contrib/guix/out/cuprate-<version>-<commit>-src.tar.gz + ``` + +3. Aggregate checksums + GPG-signed JSON attestation: + + ```bash + ./contrib/guix/guix-checksums contrib/guix/out + ./contrib/guix/guix-attest contrib/guix/out <builder-identity> <version> + ``` + +4. Sidecar SHA256 integrity check against a published `.SHA256SUM`: + + ```bash + ./contrib/guix/guix-verify contrib/guix/out/cuprated-<version>-x86_64-unknown-linux-gnu.tar.gz + ``` + +5. End-to-end reproducibility self-check (builds twice and compares + every determinism-sensitive output): + + ```bash + ./contrib/guix/smoke-reproducible.sh + ``` + +### Concurrency + +Each script uses per-run `mktemp` working directories, so concurrent +invocations cannot trample each other's *intermediate* state. **Final +outputs share fixed names by default** (e.g. `cuprated-<version>-<target>.tar.gz`, +`build-metadata.json`, `build-<target>.log`), so two concurrent runs +writing into the same `contrib/guix/out` would overwrite each other's +results. Use `--out-dir <dir>` (on `guix-mk-distsrc` and `guix-build`) +to give each parallel run its own output directory; the path is +required to live inside the repository so the Guix container can see +it. `smoke-reproducible.sh` sidesteps this entirely by performing each +run in its own `git clone` of the working tree. 
+ +## Output files + +`contrib/guix/out/` after a full run: + +- `cuprate-<version>-<commit>-src.tar.gz` (+ `.SHA256SUM`) — deterministic source +- `cuprated-<version>-<target>.tar.gz` (+ `.SHA256SUM`) — release artifact +- `SHA256SUMS` — aggregate sum over the two tarballs only +- `build-metadata.json` — package, version, target, `SOURCE_DATE_EPOCH`, + git commit, distsrc sha256, RandomX arch mode +- `guix-describe.json` — channel + commit metadata for the Guix instance + used to build (captured on the *outer* host before entering the + container, since the container doesn't ship `guix` itself) +- `rustc-version.txt`, `cargo-version.txt` — toolchain proof +- `build-<target>.log` — full verbose build log (used by the + smoke test's native-flag regression guard) +- `ldd-<target>.diag.txt` — **diagnostic only**; host-loader output + varies by system and is intentionally excluded from `SHA256SUMS` and + the signed attestation + +## Determinism inputs + +Everything that goes into a build is pinned at one of these layers: + +| Layer | Pin | Where | +|---|---|---| +| Guix instance | commit sha + channel introduction | `channels.scm` | +| Build profile (rust, gcc-toolchain, cmake, make, openssl, …) | by Guix package name; transitively pinned via Guix commit | `manifest.scm` | +| Rust source tree | git commit, verified at build time against `git archive` of the same commit (`libexec/build.sh`'s distsrc-equivalence check) | `.cuprate-distsrc.json` | +| Rust deps | `Cargo.lock`, then `cargo vendor --locked --versioned-dirs` | `Cargo.lock`, `vendor/` | +| Compiler flags (rustc + C/C++) | `--remap-path-prefix`, `-ffile-prefix-map`, `-C codegen-units=1` | `libexec/build.sh` | +| Time | `SOURCE_DATE_EPOCH` = git-commit time | `libexec/build.sh` | +| Tar metadata | `--sort=name --mtime=@EPOCH --owner=0 --group=0 --numeric-owner --mode='a=rX,u+w'` | `mk-distsrc`, `libexec/package.sh` | +| Gzip metadata | `gzip -n` (no name, no timestamp) | same | + +## Distsrc content equivalence + +`libexec/build.sh` does **not** trust 
`.cuprate-distsrc.json`'s +`git_commit` field on its own. After extracting the distsrc tarball, +it runs `git archive` on that commit from the outer checkout. The +three paths `mk-distsrc` legitimately adds (`vendor/`, `.cargo/`, +`.cuprate-distsrc.json`) are moved aside to a temporary directory +under `$build_root`, then `diff -rq` runs over the rest of the tree +with no excludes — so a nested file or directory that happens to +share one of those basenames (a hypothetical future `tests/vendor/`, +for example) is still compared. After the diff succeeds, the three +paths are restored so the build can use them; on diff failure, the +build root is preserved with the paths restored so the operator can +inspect the as-extracted distsrc. A distsrc that claims a git commit +but contains modified sources fails this check before the build +starts. + +## Threat model + +**Trust roots** (compromise of any of these defeats reproducibility +verification): + +1. **The Guix substitute key set authorised on the building host.** + Guix accepts substitutes only when they validate under one of the + keys passed to `guix archive --authorize`. The CI workflow loads + those keys from the verified Guix binary tarball (see #2 below); + whoever holds the private half of an authorised key controls what + binary artifacts the Guix daemon will accept as cached toolchain pieces. +2. **The Guix binary bootstrap tarball.** The CI workflow downloads + `guix-binary-<version>.<arch>.tar.xz` from `ftp.gnu.org`, then verifies + it against both a pinned SHA256 *and* a pinned OpenPGP fingerprint + (`A28BF40C…3D8351`, Efraim Flashner). Compromise of *both* trust + anchors at the same time would let a malicious tarball through. +3. **The Guix channel commit + introduction.** `channels.scm` pins the + channel by commit sha and supplies the canonical + `make-channel-introduction` for the official Guix channel. `guix + time-machine` authenticates the channel head against that + introduction before evaluating anything. +4. 
**The release signing key + publication channel.** Reproducibility + proves *what got shipped matches the source*; it does not prove + that the shipped binary is the one users should run. Users still + need to verify the published `cuprated-*.tar.gz.SHA256SUM` (or the + signed JSON attestation under `contrib/guix/sigs/`) against + maintainer-controlled distribution. +5. **The git tree itself.** A malicious commit landing on `main` + yields a malicious-but-reproducibly-built binary. Reproducibility + shifts the trust requirement onto code review, not away from it. + +**This pipeline protects against:** + +- A future toolchain regression silently changing artifact contents + (Guix pin + lockfile + vendor make any drift a load-bearing diff). +- A maintainer (or attacker with access to one) shipping a binary + that doesn't correspond to the published source. +- Supply-chain bit-rot in transitive Cargo deps — the lockfile + + vendor freezes everything; new upstream versions only enter via a + reviewable commit. +- A tampered distsrc tarball that claims a git commit but contains + modified sources (see [Distsrc content + equivalence](#distsrc-content-equivalence)). + +**This pipeline does NOT protect against:** + +- Compromise of any of the trust roots listed above. +- Hardware-level attacks on the build host (compromised microcode, + flashed firmware, etc.). +- A bug in `cuprated` itself. Reproducibility is about *what got + shipped*, not whether what got shipped is correct. + +## RandomX + +This pipeline depends on `randomx-rs` building RandomX in a +host-CPU-independent mode. As of `Cuprate/randomx-rs@567bdca`, this +happens **by accident** — a useful one to be aware of: + +- `randomx-rs/build.rs` calls `cmake::Config::new(…).define("DARCH", + "native")`, which emits `-DDARCH=native` to CMake. +- Upstream [`tevador/RandomX`](https://github.com/tevador/RandomX)'s + `CMakeLists.txt` has **zero** references to `DARCH`. 
It reads + `ARCH`, which defaults to `"default"` when unset. +- So the `.define("DARCH", …)` line is a years-old silent typo. The + actual ARCH value that takes effect is the CMake default, + `"default"`, regardless of what `DARCH` is set to. +- `ARCH="default"` produces a build with `-maes -mssse3 -mavx2` + flags, but each of those is gated on a *compiler-capability* check + (`check_c_compiler_flag`), not on the build host's CPU. With the + Guix toolchain pinned, all three flags resolve identically across + hosts, so the produced object code is identical. + +If `randomx-rs` ever corrects the typo (legitimately, since the line +is meaningless as written), the *named-by-intent* behaviour +(`-march=native`) would kick in and the build would silently stop +being reproducible across CPUs. The CI smoke job grep-fails on +`-march=native` / `-mcpu=native` / `target-cpu=native` to catch that +regression before merge. + +Miners using `randomx-rs` directly and wanting host-specific +performance can set `RANDOMX_ARCH=native` in their environment — but +that has no effect today because of the same typo. Filing a one-line +fix upstream at `Cuprate/randomx-rs` would unblock both that and our +regression guard simultaneously. + +## Known workarounds (remove when upstream fixes land) + +- **`_GLIBCXX_HAVE_FENV_H` / `_GLIBCXX_USE_C99_FENV` defined in + `CXXFLAGS`** (`libexec/build.sh`). Guix's gcc-15.2 libstdc++ ships + with these undefined in `bits/c++config.h`, so `<cfenv>` doesn't + pull in `<fenv.h>` and `fesetround` is absent from the global + namespace. Any C++ caller of `<cfenv>` (RandomX, + `src/instructions_portable.cpp`) then fails to link. Setting both + macros restores the expected `<cfenv>` behaviour. To check whether + this is still needed, run with `GUIX_SKIP_FENV_WORKAROUND=1` and + see if RandomX still compiles. + +- **`OPENSSL_NO_VENDOR=1`** to force `openssl-sys` to link the audited + openssl from `manifest.scm` instead of recompiling its bundled + `openssl-src` from source. 
This is a behavioural improvement, not a + workaround for a bug; it's recorded here as a knob. + +## CI + +`.github/workflows/guix-reproducibility.yml` runs `smoke-reproducible.sh` +on every PR that touches `contrib/guix/**`, `Cargo.toml`, `Cargo.lock`, +or the workflow itself. It can also be triggered manually via +`workflow_dispatch` for PRs that change crate source without touching +the pipeline, and runs weekly on the default branch via `schedule` to +catch drift those source-only PRs would otherwise miss. The workflow: + +- pins `actions/checkout` by commit sha (not tag) +- pins the Guix binary tarball by SHA256 AND by GPG signer + fingerprint, and fails before extracting if either check fails +- runs the smoke script, which itself fails on any divergence in + distsrc / artifact / metadata / toolchain versions / guix-describe + output, and on any `-march=native` / `-mcpu=native` / + `target-cpu=native` flag appearing in the build log + +## Roadmap + +- aarch64-linux target +- macOS (`x86_64-darwin`, `aarch64-darwin`) — needs an alternative + hermetic-build story; Guix doesn't run natively on macOS +- A signed-attestation flow compatible with sigstore / SLSA + provenance (the current `guix-attest` produces a stable, sorted-key + signed JSON attestation; SLSA-format export is a future step) +- Re-pinning `channels.scm` to a tagged Guix release once one ships + with rust ≥ 1.91 (v1.5.0 only carries 1.88; we currently pin to a + recent commit on master) diff --git a/contrib/guix/channels.scm b/contrib/guix/channels.scm new file mode 100644 index 000000000..7ea7ad469 --- /dev/null +++ b/contrib/guix/channels.scm @@ -0,0 +1,23 @@ +(list + (channel + (name 'guix) + (url "https://git.savannah.gnu.org/git/guix.git") + (branch "master") + ;; Commit pin policy: pin to a tagged Guix release whenever the rust + ;; available there is >= the maximum rustc version pulled in by + ;; cuprate's workspace deps. 
Today (May 2026) v1.5.0 (230aa373f3) only + ;; ships rust up to 1.88, but several deps (fjall, lsm-tree, + ;; typed-index-collections, monero-daemon-rpc, ...) require up to 1.91, + ;; so we pin to a recent master commit that ships rust-1.93. Repin to + ;; the next stable Guix tag as soon as one carrying rust >= 1.91 lands. + (commit "7041be9c117cbae2a5238bb22a0ff93ef11ca91a") + ;; Guix v1.5.0 requires every channel to carry an `introduction` with the + ;; commit + OpenPGP fingerprint that started the chain of trust; without + ;; this `guix time-machine` aborts with "channel 'guix' lacks an + ;; introduction and cannot be authenticated". These values are the + ;; canonical introduction for the official Guix channel. + (introduction + (make-channel-introduction + "9edb3f66fd807b096b48283debdcddccfea34bad" + (openpgp-fingerprint + "BBB0 2DDF 2CEA F6A8 0D1D E643 A2A0 6DF2 A33A 54FA"))))) diff --git a/contrib/guix/guix-attest b/contrib/guix/guix-attest new file mode 100755 index 000000000..907043d73 --- /dev/null +++ b/contrib/guix/guix-attest @@ -0,0 +1,209 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Write a signed release attestation. +# +# The attestation is a stable, sorted-key JSON document (Python json.dumps +# with sort_keys=True, indent=2) containing every fact a downstream +# verifier needs to bind a binary to its inputs: +# +# - package, version +# - rust_target, guix_system +# - source_date_epoch, git_commit +# - distsrc filename + sha256 +# - artifact filename + sha256 + length +# - guix channel commit (from guix-describe.json) +# - rustc / cargo versions (from rustc-version.txt / cargo-version.txt) +# - builder identity (passed in) +# - attestation timestamp +# +# We then GPG-sign that JSON. Signing concatenated text was the previous +# shape and has weak filename/section boundaries; a single signed JSON +# closes that gap. 
The JSON itself is intentionally NOT byte-stable across +# runs (attested_at and the detached GPG sig are both fresh per run); the +# binding is via the sha256 fields, not via byte-identity of the payload. +# +# Fails closed: if gpg is not on PATH, or the signing key is unavailable, +# the script exits non-zero rather than writing an unsigned attestation - +# an unsigned attestation that "looks signed" is a footgun. + +repo_root="$(git rev-parse --show-toplevel)" +out_dir="${1:-$repo_root/contrib/guix/out}" +if [[ ! -d "$out_dir" ]]; then + echo "output directory not found: $out_dir" >&2 + exit 1 +fi + +identity_raw="${2:-}" +version="${3:-}" +if [[ -z "$identity_raw" || -z "$version" ]]; then + echo "usage: $0 " >&2 + exit 1 +fi +# Sanitize the raw builder identity to a path-safe form. The keep-set +# matches what `safe_path_component` (defined below) will accept, so a +# common email-shaped builder id like `kim0@example.org` becomes +# `kim0_example.org` rather than surviving sanitization and then being +# rejected by the validator. Anything outside the keep-set collapses to +# underscore. +identity="$(printf '%s' "$identity_raw" | tr -c 'A-Za-z0-9_.+-' '_')" + +# Path-component validator. Used on every metadata value that ends up as +# a filename or directory name below. The character regex allows `.`, +# which means `.` and `..` would slip through it - those are valid path +# traversal in a way slashes are not, so reject them explicitly. +safe_path_component() { + local s="$1" + [[ -n "$s" ]] || return 1 + [[ "$s" != "." && "$s" != ".." ]] || return 1 + [[ "$s" =~ ^[A-Za-z0-9._+-]+$ ]] || return 1 + return 0 +} + +# Identity is sanitized via `tr -c '...' '_'` above, but the surviving +# charset still admits "." and ".." - explicitly reject those. +if ! safe_path_component "$identity"; then + echo "ERROR: builder identity sanitizes to unsafe path component: '$identity'" >&2 + exit 1 +fi + +if ! 
command -v gpg >/dev/null 2>&1; then + echo "ERROR: gpg not found; refusing to write unsigned attestation" >&2 + exit 1 +fi + +# Collect required inputs. +for f in SHA256SUMS build-metadata.json guix-describe.json rustc-version.txt cargo-version.txt; do + if [[ ! -f "$out_dir/$f" ]]; then + echo "ERROR: missing $out_dir/$f - run guix-checksums before guix-attest" >&2 + exit 1 + fi +done + +# Derive the artifact and distsrc filenames from build-metadata.json rather +# than globbing the directory. A `find | sort | head` approach can attest +# the lexicographically first stale tarball from a previous build/version/ +# target if out_dir is not pristine; binding to the metadata file the build +# just wrote eliminates that class of mistake. +# +# Use one-field-per-line + `mapfile -t` so a value containing whitespace +# (none today, but cheap insurance) can't corrupt later fields. +mapfile -t meta_fields < <( + python3 - "$out_dir/build-metadata.json" <<'PY' +import json, sys +m = json.load(open(sys.argv[1])) +for k in ("version", "rust_target", "distsrc", "distsrc_sha256"): + print(m[k]) +PY +) +if [[ "${#meta_fields[@]}" -ne 4 ]]; then + echo "ERROR: could not extract version/rust_target/distsrc/distsrc_sha256 from build-metadata.json" >&2 + exit 1 +fi +meta_version="${meta_fields[0]}" +meta_rust_target="${meta_fields[1]}" +meta_distsrc_name="${meta_fields[2]}" +meta_distsrc_sha="${meta_fields[3]}" +if [[ -z "$meta_version" || -z "$meta_rust_target" || -z "$meta_distsrc_name" || -z "$meta_distsrc_sha" ]]; then + echo "ERROR: build-metadata.json has empty version/rust_target/distsrc/distsrc_sha256" >&2 + exit 1 +fi +if [[ "$meta_version" != "$version" ]]; then + echo "ERROR: build-metadata version ($meta_version) != passed version ($version)" >&2 + exit 1 +fi +# Reject path-traversal or shell-meaningful characters in every metadata +# value that becomes a filename or directory name below. 
The CLI version +# arg already had to match meta_version above, so this also gates the CLI +# input transitively. meta_rust_target and meta_distsrc_name are also +# checked because they end up in the artifact / distsrc paths and a +# malicious/corrupted build-metadata.json could otherwise smuggle slashes +# or `..` segments into those paths. +if ! safe_path_component "$meta_version"; then + echo "ERROR: build-metadata version is not a safe path component: '$meta_version'" >&2 + exit 1 +fi +if ! safe_path_component "$meta_rust_target"; then + echo "ERROR: build-metadata rust_target is not a safe path component: '$meta_rust_target'" >&2 + exit 1 +fi +if ! safe_path_component "$meta_distsrc_name"; then + echo "ERROR: build-metadata distsrc is not a safe path component: '$meta_distsrc_name'" >&2 + exit 1 +fi + +# Now that the metadata version is validated, it's safe to compose the +# signatures directory. The CLI `version` arg is no longer used past this +# point; meta_version is the authoritative value. +sig_dir="$repo_root/contrib/guix/sigs/${meta_version}/${identity}" +mkdir -p "$sig_dir" + +artifact="$out_dir/cuprated-${meta_version}-${meta_rust_target}.tar.gz" +distsrc="$out_dir/${meta_distsrc_name}" +[[ -f "$artifact" ]] || { echo "ERROR: expected artifact missing: $artifact" >&2; exit 1; } +[[ -f "$distsrc" ]] || { echo "ERROR: expected distsrc missing: $distsrc" >&2; exit 1; } + +# Reject a stale distsrc with the same filename: build-metadata.json +# records the sha256 the build saw; if the file on disk now hashes +# differently, somebody swapped it under us. The `--` separator on +# sha256sum is belt-and-braces against a filename starting with `-` +# (already excluded by safe_path_component, but cheap). 
+actual_distsrc_sha="$(sha256sum -- "$distsrc" | awk '{print $1}')" +if [[ "$actual_distsrc_sha" != "$meta_distsrc_sha" ]]; then + echo "ERROR: distsrc sha mismatch for $distsrc" >&2 + echo " build-metadata.json: $meta_distsrc_sha" >&2 + echo " on disk now: $actual_distsrc_sha" >&2 + exit 1 +fi + +artifact_sha="$(sha256sum -- "$artifact" | awk '{print $1}')" +distsrc_sha="$actual_distsrc_sha" +artifact_size="$(wc -c <"$artifact" | tr -d ' ')" +attest_iso="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + +# Emit stable, sorted-key JSON (Python json.dumps with sort_keys=True). +python3 - "$out_dir" "$artifact" "$distsrc" "$artifact_sha" "$distsrc_sha" "$artifact_size" "$identity_raw" "$attest_iso" > "$sig_dir/attestation.json" <<'PY' +import json, sys, pathlib +out_dir, artifact, distsrc, asha, dsha, asize, identity, when = sys.argv[1:9] +meta = json.loads(pathlib.Path(out_dir, "build-metadata.json").read_text()) +describe = json.loads(pathlib.Path(out_dir, "guix-describe.json").read_text()) +rustc = pathlib.Path(out_dir, "rustc-version.txt").read_text().strip() +cargo = pathlib.Path(out_dir, "cargo-version.txt").read_text().strip() +# Extract the official-guix channel commit, if present. 
+guix_commit = None +if isinstance(describe, list): + for ch in describe: + if ch.get("name") == "guix": + guix_commit = ch.get("commit") + break +elif isinstance(describe, dict): + guix_commit = describe.get("commit") +doc = { + "schema": "cuprate-guix-attestation/v1", + "package": meta["package"], + "version": meta["version"], + "rust_target": meta["rust_target"], + "guix_system": meta["guix_system"], + "source_date_epoch": meta["source_date_epoch"], + "git_commit": meta["git_commit"], + "distsrc": { + "filename": pathlib.Path(distsrc).name, + "sha256": dsha, + }, + "artifact": { + "filename": pathlib.Path(artifact).name, + "sha256": asha, + "size": int(asize), + }, + "guix_channel_commit": guix_commit, + "rustc_version": rustc.splitlines()[0] if rustc else None, + "cargo_version": cargo.splitlines()[0] if cargo else None, + "builder_id": identity, + "attested_at": when, +} +print(json.dumps(doc, sort_keys=True, indent=2)) +PY + +# Detached ASCII-armored signature over the JSON payload; on failure, drop the unsigned JSON (fail closed). +(cd "$sig_dir" && gpg --detach-sign --armor --output attestation.json.asc attestation.json) || { rm -f -- "$sig_dir/attestation.json"; echo "ERROR: gpg signing failed; removed unsigned attestation.json" >&2; exit 1; } +echo "Wrote $sig_dir/attestation.json + attestation.json.asc" diff --git a/contrib/guix/guix-build b/contrib/guix/guix-build new file mode 100755 index 000000000..01560e22e --- /dev/null +++ b/contrib/guix/guix-build @@ -0,0 +1,84 @@ +#!/usr/bin/env bash +set -euo pipefail + +script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)" +# realpath the repo root so the containment check below compares two +# physical paths. Without this, a checkout reached via a symlink would +# return a logical path from `git rev-parse` while `realpath` further +# down returns the physical path, and an in-repo --out-dir would be +# falsely rejected. +repo_root="$(realpath "$(git -C "$script_dir/../.." 
rev-parse --show-toplevel)")" +cd "$repo_root" + +guix_system="x86_64-linux" +rust_target="x86_64-unknown-linux-gnu" +package_name="cuprated" +distsrc="" +out_dir="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --guix-system) guix_system="$2"; shift 2 ;; + --target) rust_target="$2"; shift 2 ;; + --distsrc) distsrc="$2"; shift 2 ;; + --package) package_name="$2"; shift 2 ;; + --out-dir) out_dir="$2"; shift 2 ;; + *) echo "unknown argument: $1" >&2; exit 1 ;; + esac +done + +[[ "$package_name" == "cuprated" ]] || { echo "only --package cuprated is supported" >&2; exit 1; } +[[ -n "$distsrc" ]] || { echo "missing required --distsrc <path>" >&2; exit 1; } +[[ -f "$distsrc" ]] || { echo "distsrc not found: $distsrc" >&2; exit 1; } + +dist_src="$(realpath "$distsrc")" +case "$dist_src" in + "$repo_root"/*) ;; + *) echo "distsrc must be inside the repository" >&2; exit 1 ;; +esac + +if [[ -z "$out_dir" ]]; then + out_dir="$repo_root/contrib/guix/out" +fi +# Normalize (`-m`: the directory may not exist yet) and constrain to inside +# the repo before creating anything. `guix shell --container --pure` does +# not bind-mount arbitrary paths; an --out-dir resolving outside the repo +# would silently fail or land in a non-persisted overlay in the container. +out_dir="$(realpath -m "$out_dir")" +case "$out_dir" in + "$repo_root"/*) ;; + *) echo "ERROR: --out-dir must resolve inside the repository ($repo_root); got $out_dir" >&2; exit 1 ;; +esac +mkdir -p "$out_dir" + +# Capture the exact Guix instance that will be used BEFORE entering the +# hermetic container. `guix` itself isn't in manifest.scm (and shouldn't be - +# the whole point of `guix shell --container --pure` is no host tools), so +# `guix describe` cannot run inside the container. Running it here, on the +# outer host, against the same channels.scm gives the authoritative pinned +# instance metadata for build-metadata.json / attestation. 
if command -v guix >/dev/null 2>&1; then
  guix time-machine -C "$script_dir/channels.scm" -- describe --format=json \
    > "$out_dir/guix-describe.json"
else
  echo "ERROR: guix not on PATH; cannot capture guix-describe.json" >&2
  exit 1
fi

# See guix-mk-distsrc for the rationale: redirect TMPDIR inside the
# container to a bind-mounted host directory so the cargo target tree
# isn't constrained by the container's private tmpfs (~50% of RAM ~=
# 8 GB on a 16 GB GHA runner, smaller than a release build of cuprated).
work_root="$repo_root/contrib/guix/.work"
mkdir -p "$work_root"
# `--pure` drops the host environment, so every knob build.sh needs is
# passed explicitly through `env`.
guix time-machine -C "$script_dir/channels.scm" \
  -- shell --system="$guix_system" -m "$script_dir/manifest.scm" --container --pure -- \
  env \
    GUIX_BUILD_SYSTEM="$guix_system" \
    GUIX_RUST_TARGET="$rust_target" \
    GUIX_DIST_SRC="$dist_src" \
    GUIX_OUT_DIR="$out_dir" \
    GUIX_ALLOW_COMMIT_MISMATCH="${GUIX_ALLOW_COMMIT_MISMATCH:-0}" \
    GUIX_SKIP_FENV_WORKAROUND="${GUIX_SKIP_FENV_WORKAROUND:-0}" \
    TMPDIR="$work_root" \
    bash "$script_dir/libexec/build.sh"
diff --git a/contrib/guix/guix-checksums b/contrib/guix/guix-checksums
new file mode 100755
index 000000000..9a407eb63
--- /dev/null
+++ b/contrib/guix/guix-checksums
@@ -0,0 +1,83 @@
#!/usr/bin/env bash
set -euo pipefail

# Aggregate the two release tarballs (distsrc + artifact) into a single
# SHA256SUMS file. We deliberately do NOT glob *.tar.gz: a stale tarball
# left over from a previous build/version/target could end up in the
# aggregate and be subsequently signed by guix-attest. Instead, read the
# canonical filenames from build-metadata.json, sha each, and require
# both to exist.

out_dir="${1:-contrib/guix/out}"
if [[ ! -d "$out_dir" ]]; then
  echo "output directory not found: $out_dir" >&2
  exit 1
fi
if [[ ! -f "$out_dir/build-metadata.json" ]]; then
  echo "ERROR: $out_dir/build-metadata.json not found - run a build first" >&2
  exit 1
fi

# One-field-per-line + `mapfile -t` so a metadata value containing
# whitespace (none today, but cheap insurance) can't corrupt later fields.
# A Python failure inside the process substitution is not seen by `set -e`;
# the field-count check right below is what catches it.
mapfile -t meta_fields < <(
  python3 - "$out_dir/build-metadata.json" <<'PY'
import json, sys
m = json.load(open(sys.argv[1]))
for k in ("version", "rust_target", "distsrc"):
    print(m[k])
PY
)
if [[ "${#meta_fields[@]}" -ne 3 ]]; then
  echo "ERROR: could not extract version/rust_target/distsrc from build-metadata.json" >&2
  exit 1
fi
meta_version="${meta_fields[0]}"
meta_rust_target="${meta_fields[1]}"
meta_distsrc_name="${meta_fields[2]}"
if [[ -z "$meta_version" || -z "$meta_rust_target" || -z "$meta_distsrc_name" ]]; then
  echo "ERROR: build-metadata.json has empty version/rust_target/distsrc" >&2
  exit 1
fi

# Each metadata value below ends up as a filename component fed to
# sha256sum. The non-empty check above is necessary but not sufficient:
# the regex permits `.` and `..` (which are valid path-traversal
# segments) as well as a leading `-` (which a tool could parse as an
# option), and the field could in principle contain slashes from a
# corrupted/malicious build-metadata.json. Reject everything we don't
# want to see in an output filename.
#
# Arguments: $1 - candidate path component
# Returns:   0 if the component is safe to use as a filename, 1 otherwise
safe_path_component() {
  local s="$1"
  [[ -n "$s" ]] || return 1
  [[ "$s" != "." && "$s" != ".." ]] || return 1
  # The character class below contains `-`, so a leading dash has to be
  # rejected separately.
  [[ "$s" != -* ]] || return 1
  [[ "$s" =~ ^[A-Za-z0-9._+-]+$ ]] || return 1
  return 0
}
for v in "$meta_version" "$meta_rust_target" "$meta_distsrc_name"; do
  if ! safe_path_component "$v"; then
    echo "ERROR: build-metadata.json contains unsafe path component: '$v'" >&2
    exit 1
  fi
done

artifact_name="cuprated-${meta_version}-${meta_rust_target}.tar.gz"
distsrc_name="$meta_distsrc_name"

[[ -f "$out_dir/$artifact_name" ]] || { echo "ERROR: missing $out_dir/$artifact_name" >&2; exit 1; }
[[ -f "$out_dir/$distsrc_name" ]] || { echo "ERROR: missing $out_dir/$distsrc_name" >&2; exit 1; }

(
  cd "$out_dir"
  # Sort the two filenames into a stable order before hashing. Use a
  # `while read` loop rather than `xargs -d` so this stays portable
  # across BSD/macOS hosts that don't ship GNU xargs. `sha256sum --` is
  # belt-and-braces against any filename starting with `-` (already
  # excluded by safe_path_component above).
  printf '%s\n%s\n' "$distsrc_name" "$artifact_name" \
    | LC_ALL=C sort \
    | while IFS= read -r f; do
        sha256sum -- "$f"
      done > SHA256SUMS
)

echo "Wrote $out_dir/SHA256SUMS"
diff --git a/contrib/guix/guix-mk-distsrc b/contrib/guix/guix-mk-distsrc
new file mode 100755
index 000000000..3ef40fbed
--- /dev/null
+++ b/contrib/guix/guix-mk-distsrc
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
set -euo pipefail
script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
# Match guix-build: realpath so the containment check below compares
# physical paths consistently with `realpath "$out_dir"`.
repo_root="$(realpath "$(git -C "$script_dir/../.." rev-parse --show-toplevel)")"
cd "$repo_root"

guix_system="x86_64-linux"
out_dir=""

# Usage: guix-mk-distsrc [--out-dir DIR] [GUIX_SYSTEM]
# A bare positional argument overrides the Guix system; `--` ends parsing.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --out-dir)
      # Explicit check: under `set -u` a trailing `--out-dir` would
      # otherwise abort with bash's opaque "$2: unbound variable".
      if [[ $# -lt 2 ]]; then
        echo "missing value for --out-dir" >&2
        exit 1
      fi
      out_dir="$2"
      shift 2
      ;;
    --) shift; break ;;
    -*) echo "unknown argument: $1" >&2; exit 1 ;;
    *) guix_system="$1"; shift ;;
  esac
done

if [[ -z "$out_dir" ]]; then
  out_dir="$repo_root/contrib/guix/out"
fi
mkdir -p "$out_dir"
# Same constraint as guix-build: out_dir must live inside the repo so
# `guix shell --container --pure` can see it.
out_dir="$(realpath "$out_dir")"
case "$out_dir" in
  "$repo_root"/*) ;;
  *) echo "ERROR: --out-dir must resolve inside the repository ($repo_root); got $out_dir" >&2; exit 1 ;;
esac

# `guix shell --container` mounts a private tmpfs over /tmp inside the
# container, sized at the kernel's tmpfs default (~50% of RAM). On a
# 16 GB GHA runner that's ~8 GB - smaller than the full cuprate cargo-
# vendor tree (windows-0.62.2 alone unpacks to ~2 GB, and the whole
# tree easily exceeds 10 GB). Redirect TMPDIR inside the container to
# a bind-mounted work directory on the host filesystem so mktemp and
# cargo vendor target the host volume (typically 100+ GB free) rather
# than the constrained tmpfs.
work_root="$repo_root/contrib/guix/.work"
mkdir -p "$work_root"
# Unlike guix-build, this invocation passes --network to the container.
guix time-machine -C "$script_dir/channels.scm" \
  -- shell --system="$guix_system" -m "$script_dir/manifest.scm" --container --pure --network -- \
  env \
    GUIX_OUT_DIR="$out_dir" \
    TMPDIR="$work_root" \
    bash "$script_dir/mk-distsrc"
diff --git a/contrib/guix/guix-verify b/contrib/guix/guix-verify
new file mode 100755
index 000000000..e7e53af21
--- /dev/null
+++ b/contrib/guix/guix-verify
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
set -euo pipefail

# Integrity check (NOT authentication) of a release artifact against the
# sidecar SHA256SUM that lives next to it. Use this to check that the
# tarball matches the sum it was published with - it does NOT verify who
# produced the tarball or that the sum itself is trustworthy. For
# authenticity, fetch and verify the GPG-signed attestation
# (attestation.json.asc) stored under contrib/guix/sigs/.

archive="${1:-}"
if [[ -z "$archive" ]]; then
  echo "usage: $0 <archive>" >&2
  echo " integrity check vs the sidecar <archive>.SHA256SUM" >&2
  exit 1
fi

dir="$(cd -- "$(dirname -- "$archive")" && pwd)"
base="$(basename -- "$archive")"
sum_file="$dir/${base}.SHA256SUM"
if [[ ! -f "$sum_file" ]]; then
  echo "missing checksum file: $sum_file" >&2
  exit 1
fi

# `sha256sum --check` verifies whatever files the sidecar happens to list.
# Make sure one of its entries is actually this archive; otherwise a
# sidecar naming some other file would report OK without the archive ever
# being hashed. Each line is "<hash> <mode-char><name>" where mode-char is
# a space (text) or `*` (binary). The name is passed via the environment
# so awk does not interpret backslashes in it.
if ! want="$base" awk '
  {
    name = substr($0, index($0, " ") + 1)
    sub(/^[ *]/, "", name)
    if (name == ENVIRON["want"]) found = 1
  }
  END { exit(found ? 0 : 1) }
' "$sum_file"; then
  echo "checksum file does not list $base: $sum_file" >&2
  exit 1
fi

( cd "$dir" && sha256sum --check "${base}.SHA256SUM" )
diff --git a/contrib/guix/libexec/build.sh b/contrib/guix/libexec/build.sh
new file mode 100755
index 000000000..e3724822d
--- /dev/null
+++ b/contrib/guix/libexec/build.sh
@@ -0,0 +1,202 @@
#!/usr/bin/env bash
set -euo pipefail

repo_root="$(git rev-parse --show-toplevel)"

export LC_ALL=C
export TZ=UTC

# Hold all the "no network after distsrc creation" invariants up front so a
# later `cargo metadata` or anything else can't accidentally hit the network.
export CARGO_INCREMENTAL=0
export CARGO_NET_OFFLINE=true

dist_src="${GUIX_DIST_SRC:-}"
if [[ -z "$dist_src" || ! -f "$dist_src" ]]; then
  echo "GUIX_DIST_SRC must point to a source archive produced by contrib/guix/mk-distsrc" >&2
  exit 1
fi

# Use a per-run build root so two concurrent invocations in the same checkout
# don't trample each other. Inside `guix shell --container --pure`, `$TMPDIR`
# is private to the container, so collisions across simultaneous outer
# invocations are also avoided.
build_root="$(mktemp -d "${TMPDIR:-/tmp}/guix-build-src.XXXXXX")"
src_dir="$build_root/src"
aside_dir="$build_root/distsrc-aside"

# EXIT trap: remove the build root on success, keep it on failure.
# Globals: build_root, src_dir, aside_dir (read)
cleanup_build_root() {
  # On success, drop the tree. On failure keep it so the user can inspect
  # state - this is hugely valuable when distsrc verification or CMake
  # configure fails inside a sealed container.
  # NOTE: `$?` must be captured by the very first command in the handler.
  local code=$?
  # If the distsrc-equivalence check exited mid-way (between move-aside
  # and the explicit restore), put the three legitimately-added paths
  # back so the preserved tree reflects the original distsrc contents.
  # Idempotent: no-op if aside_dir was already cleaned up on success.
  # Each restore is best-effort (`|| true`): a failed mv must not mask the
  # exit status being reported.
  if [[ -d "$aside_dir" ]]; then
    [[ -e "$aside_dir/vendor" ]] && mv "$aside_dir/vendor" "$src_dir/vendor" 2>/dev/null || true
    [[ -e "$aside_dir/cargo" ]] && mv "$aside_dir/cargo" "$src_dir/.cargo" 2>/dev/null || true
    [[ -e "$aside_dir/distsrc-meta.json" ]] && mv "$aside_dir/distsrc-meta.json" "$src_dir/.cuprate-distsrc.json" 2>/dev/null || true
    rmdir "$aside_dir" 2>/dev/null || true
  fi
  if [[ $code -ne 0 ]]; then
    echo "build failed; preserving build root for inspection: $build_root" >&2
  else
    rm -rf "$build_root"
  fi
}
trap cleanup_build_root EXIT

export CARGO_HOME="$build_root/cargo-home"
mkdir -p "$src_dir" "$CARGO_HOME"

# See mk-distsrc: guix shell --container runs as a mapped non-root user that
# cannot honor the archive's stored uid/gid; pass --no-same-owner so tar
# accepts the unprivileged extraction.
tar -xf "$dist_src" --no-same-owner --no-same-permissions -C "$src_dir"

# Parse the manifest we just extracted. cargo can pick up its own
# vendor config now that CARGO_HOME and CARGO_NET_OFFLINE are set.
version="$({ cd "$src_dir" && cargo metadata --locked --format-version=1 --no-deps | python3 -c 'import json,sys; print(next(p["version"] for p in json.load(sys.stdin)["packages"] if p["name"]=="cuprated"))'; })"
: "${version:?unable to parse version}"

# Read the distsrc metadata. The file path is handed to Python as argv
# rather than spliced into the program text, so a build root containing
# quotes or backslashes cannot break (or alter) the Python source.
SOURCE_DATE_EPOCH="$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["source_date_epoch"])' "$src_dir/.cuprate-distsrc.json")"
git_commit="$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["git_commit"])' "$src_dir/.cuprate-distsrc.json")"
# Both values come out of the tarball and are used as command arguments
# below (`git archive "$git_commit"`, package.sh), so insist on their
# expected shape: a decimal timestamp and a full lowercase hex object id
# (40 hex digits, or 64 for a SHA-256 repository). In particular this keeps
# a value starting with `-` from being parsed as a git option.
if [[ ! "$SOURCE_DATE_EPOCH" =~ ^[0-9]+$ ]]; then
  echo "ERROR: distsrc metadata has a non-numeric source_date_epoch: '$SOURCE_DATE_EPOCH'" >&2
  exit 1
fi
if [[ ! "$git_commit" =~ ^([0-9a-f]{40}|[0-9a-f]{64})$ ]]; then
  echo "ERROR: distsrc metadata has a malformed git_commit: '$git_commit'" >&2
  exit 1
fi
distsrc_sha256="$(sha256sum "$dist_src" | awk '{print $1}')"
outer_commit="$(git -C "$repo_root" rev-parse HEAD)"
if [[ "$outer_commit" != "$git_commit" && "${GUIX_ALLOW_COMMIT_MISMATCH:-0}" != "1" ]]; then
  echo "outer checkout $outer_commit differs from distsrc commit $git_commit" >&2
  exit 1
fi
export SOURCE_DATE_EPOCH

# cuprate-constants/build.rs hardcodes the embedded git commit by running
# `git show -s --format=%H` in CARGO_MANIFEST_DIR (relying on git walking up
# to find a .git/). The distsrc tarball legitimately contains no .git/, so
# `git show` finds nothing and the build script's assert(commit.len() == 40)
# trips. constants/build.rs also honors a GITHUB_SHA env var as an override -
# set it from the distsrc's authoritative git_commit so the embedded commit
# is bound to the distsrc, not to whatever outer checkout the build happens
# to run inside.
export GITHUB_SHA="$git_commit"

# Distsrc content equivalence: verify the extracted source tree matches
# `git archive $git_commit` from the outer checkout. The three paths
# `mk-distsrc` legitimately adds (vendor/, .cargo/, .cuprate-distsrc.json)
# are moved aside before the diff and restored afterwards, so any nested
# directory or file that *happens* to share one of those basenames is
# still compared. This catches a tampered distsrc that lies about its
# git_commit but contains modified source files. The git_commit equality
# check above is metadata-only and not sufficient on its own.
#
# The cleanup_build_root EXIT trap installed near the top of this script
# also restores the aside-d paths on any failure that lands here mid-way,
# so the preserved $build_root reflects the as-extracted distsrc.
verify_dir="$build_root/git-baseline"
mkdir -p "$verify_dir"
git -C "$repo_root" archive --format=tar "$git_commit" | \
  tar -xf - --no-same-owner --no-same-permissions -C "$verify_dir"
mkdir -p "$aside_dir"
mv "$src_dir/vendor" "$aside_dir/vendor"
mv "$src_dir/.cargo" "$aside_dir/cargo"
mv "$src_dir/.cuprate-distsrc.json" "$aside_dir/distsrc-meta.json"
if ! diff -rq "$verify_dir" "$src_dir" > "$build_root/distsrc-diff.log" 2>&1; then
  echo "ERROR: distsrc contents diverge from git commit $git_commit" >&2
  echo "see diff log:" >&2
  cat "$build_root/distsrc-diff.log" >&2
  exit 1
fi
mv "$aside_dir/vendor" "$src_dir/vendor"
mv "$aside_dir/cargo" "$src_dir/.cargo"
mv "$aside_dir/distsrc-meta.json" "$src_dir/.cuprate-distsrc.json"
rmdir "$aside_dir"
rm -rf "$verify_dir"

cd "$src_dir"

# RandomX builds with CMake's ARCH default ("default"), which uses
# compiler-capability-gated -maes -mssse3 -mavx2 - all compiler-deterministic,
# none host-CPU-specific. The bundled randomx-rs build.rs passes
# .define("DARCH", "native"), but cmake reads ARCH, not DARCH, so that line
# is a silent no-op (a years-old typo that fortuitously keeps the build
# portable). See contrib/guix/README.md > "RandomX" for the longer story.
#
# The prefix maps rewrite the per-run $src_dir (a mktemp path) to a fixed
# /cuprate in the paths the compilers record.
export RUSTFLAGS="--remap-path-prefix=$src_dir=/cuprate -C codegen-units=1"
export CFLAGS="-ffile-prefix-map=$src_dir=/cuprate"

# Workaround: Guix gcc-15.2 libstdc++ ships with both _GLIBCXX_HAVE_FENV_H
# and _GLIBCXX_USE_C99_FENV undefined in bits/c++config.h, so <cfenv> never
# pulls in glibc's <fenv.h> and `fesetround` is absent in the global
# namespace - even though the underlying glibc 2.41 obviously has it. Define
# both macros so any C++ caller of <cfenv> (RandomX, plus future C++ deps)
# compiles cleanly.
#
# This block is REMOVABLE once Guix's libstdc++ packaging is fixed upstream.
# To check: build with GUIX_SKIP_FENV_WORKAROUND=1 and see if RandomX still
# compiles. If yes, this whole `if` block can be deleted.
fenv_workaround=""
if [[ "${GUIX_SKIP_FENV_WORKAROUND:-0}" != "1" ]]; then
  fenv_workaround=" -D_GLIBCXX_HAVE_FENV_H=1 -D_GLIBCXX_USE_C99_FENV=1"
fi
export CXXFLAGS="-ffile-prefix-map=$src_dir=/cuprate${fenv_workaround}"
# Guix's gcc-toolchain profile only provides `gcc`/`g++`, not the legacy `cc`
# alias; cc-rs (used by -sys crates such as libsqlite3-sys, openssl-sys,
# randomx-rs, ring, etc.) defaults to `cc` and fails with
# ToolNotFound: failed to find tool "cc": No such file or directory
# Pointing CC/CXX/AR/AS at the actual binaries fixes every -sys crate.
export CC=gcc
export CXX=g++
export AR=ar
export AS=as
export LD=ld
export RANLIB=ranlib
export STRIP=strip
# Force openssl-sys to use the system openssl provided by the manifest via
# pkg-config rather than building it from source via openssl-src (which would
# also need `make`, but is wasteful when we already ship openssl).
export OPENSSL_NO_VENDOR=1
# Some -sys crates use $GUIX_ENVIRONMENT to find headers/libs when pkg-config
# is not available; provide explicit hints.
if [[ -n "${GUIX_ENVIRONMENT:-}" ]]; then
  export OPENSSL_DIR="$GUIX_ENVIRONMENT"
  # Append the previous value only when there is one, so the search path
  # never ends in a stray `:` (an empty entry).
  export PKG_CONFIG_PATH="$GUIX_ENVIRONMENT/lib/pkgconfig${PKG_CONFIG_PATH:+:$PKG_CONFIG_PATH}"
fi

rust_target="${GUIX_RUST_TARGET:-x86_64-unknown-linux-gnu}"

out_dir="${GUIX_OUT_DIR:-$repo_root/contrib/guix/out}"
mkdir -p "$out_dir"

# Stream verbose-makefile + verbose-build output to a file so smoke tests
# can mechanically grep for -march=native / -mcpu=native / target-cpu=native
# regressions (RandomX or any future cc-rs crate could re-introduce them).
+build_log="$out_dir/build-${rust_target}.log" +{ + cargo build --frozen --release --package cuprated --target "$rust_target" --verbose +} 2>&1 | tee "$build_log" + +bash "$repo_root/contrib/guix/libexec/package.sh" "$version" "$rust_target" "$SOURCE_DATE_EPOCH" "$out_dir" "$src_dir" + +binary="$src_dir/target/${rust_target}/release/cuprated" +# ldd output is host/loader-dependent and varies by system; keep it as a +# diagnostic file only - it MUST NOT be included in SHA256SUMS or +# attestation. guix-checksums excludes it explicitly. +if command -v ldd >/dev/null 2>&1; then + ldd "$binary" > "$out_dir/ldd-${rust_target}.diag.txt" 2>&1 || true +fi + +rustc --version --verbose > "$out_dir/rustc-version.txt" +cargo --version --verbose > "$out_dir/cargo-version.txt" + +cat > "$out_dir/build-metadata.json" < "$out_dir/$archive" +) + +( + cd "$out_dir" + sha256sum "$archive" > "$archive.SHA256SUM" +) diff --git a/contrib/guix/manifest.scm b/contrib/guix/manifest.scm new file mode 100644 index 000000000..a1e61c375 --- /dev/null +++ b/contrib/guix/manifest.scm @@ -0,0 +1,20 @@ +(specifications->manifest + (list + "bash" + "coreutils" + "git" + "gcc-toolchain" + "cmake" + "make" + "pkg-config" + "openssl" + "perl" + "python" + "rust" + "rust:cargo" + "gzip" + "tar" + "findutils" + "diffutils" + "gawk" + "nss-certs")) diff --git a/contrib/guix/mk-distsrc b/contrib/guix/mk-distsrc new file mode 100755 index 000000000..29763c95a --- /dev/null +++ b/contrib/guix/mk-distsrc @@ -0,0 +1,80 @@ +#!/usr/bin/env bash +set -euo pipefail + +repo_root="$(git rev-parse --show-toplevel)" +cd "$repo_root" +export LC_ALL=C +export TZ=UTC + +# Refresh the stat cache first: guix shell --container can serve files with +# stat metadata that differs from what was indexed on the host, which makes +# `git diff-index --quiet` report dirty without an actual content diff. +git update-index -q --refresh >/dev/null 2>&1 || true +if ! 
git diff-index --quiet HEAD --; then + echo "working tree is dirty; commit changes before creating distsrc" >&2 + echo "--- git status --porcelain ---" >&2 + git status --porcelain >&2 + echo "--- git diff-index --name-only HEAD ---" >&2 + git diff-index --name-only HEAD -- >&2 + exit 1 +fi + +out_dir="${GUIX_OUT_DIR:-$repo_root/contrib/guix/out}" +mkdir -p "$out_dir" + +# Per-run work dir so two concurrent invocations don't trample each other's +# extracted source / cargo vendor output. +work_dir="$(mktemp -d "${TMPDIR:-/tmp}/guix-distsrc.XXXXXX")" +cleanup_work_dir() { + local code=$? + if [[ $code -ne 0 ]]; then + echo "mk-distsrc failed; preserving work dir for inspection: $work_dir" >&2 + else + rm -rf "$work_dir" + fi +} +trap cleanup_work_dir EXIT + +mkdir -p "$work_dir/src" +export CARGO_HOME="$work_dir/cargo-home" +mkdir -p "$CARGO_HOME" + +version="$({ cargo metadata --locked --format-version=1 --no-deps | python3 -c 'import json,sys; print(next(p["version"] for p in json.load(sys.stdin)["packages"] if p["name"]=="cuprated"))'; })" +commit="$(git rev-parse HEAD)" +short_commit="$(git rev-parse --short=12 HEAD)" +source_date_epoch="$(git log -1 --pretty=%ct)" +archive="cuprate-${version}-${short_commit}-src.tar.gz" + +if [[ -n "${GUIX_ENVIRONMENT:-}" ]]; then + cert_file="$GUIX_ENVIRONMENT/etc/ssl/certs/ca-certificates.crt" + cert_dir="$GUIX_ENVIRONMENT/etc/ssl/certs" + [[ -f "$cert_file" ]] && export SSL_CERT_FILE="$cert_file" GIT_SSL_CAINFO="$cert_file" CURL_CA_BUNDLE="$cert_file" + [[ -d "$cert_dir" ]] && export SSL_CERT_DIR="$cert_dir" +fi + +# --no-same-owner: inside guix shell --container we run as a mapped user +# without privilege to chown to uid 0/gid 0; without this flag, tar fails +# with "Cannot change ownership ... Invalid argument" on every entry. +# --no-same-permissions: similarly avoid setting setuid/setgid bits. 
+git archive --format=tar HEAD | tar -xf - --no-same-owner --no-same-permissions -C "$work_dir/src" +( + cd "$work_dir/src" + mkdir -p .cargo + cargo vendor --locked --versioned-dirs vendor > .cargo/config.toml + # No post-vendor patching needed. The bundled randomx-rs has a years-old + # build.rs typo (`.define("DARCH", ...)` instead of `ARCH`) that means + # RandomX's CMake never sees the override and falls through to + # ARCH=default, which is exactly the portable, compiler-deterministic + # build we want. The libstdc++ /fesetround workaround is handled + # in libexec/build.sh via CXXFLAGS, not by mutating vendored sources. + cat > .cuprate-distsrc.json < "$out_dir/$archive" +) +(cd "$out_dir" && sha256sum "$archive" > "$archive.SHA256SUM") +printf '%s\n' "$out_dir/$archive" diff --git a/contrib/guix/smoke-reproducible.sh b/contrib/guix/smoke-reproducible.sh new file mode 100755 index 000000000..91ed8c27e --- /dev/null +++ b/contrib/guix/smoke-reproducible.sh @@ -0,0 +1,215 @@ +#!/usr/bin/env bash +set -euo pipefail +# Without inherit_errexit, set -e is silently disabled inside `$(...)` +# command substitutions - so a failing `distsrc="$(./guix-mk-distsrc ...)"` +# would set $distsrc to "" and the script would soldier on with an empty +# `--distsrc` arg, producing the confusing "guix-build returned 0 but no +# artifact" tail-of-the-iceberg failure mode. With inherit_errexit, the +# substitution exits non-zero and set -e fires at the assignment, so the +# real error (e.g. disk-full in cargo vendor) surfaces immediately. +shopt -s inherit_errexit + +# End-to-end reproducibility self-check. +# +# Builds cuprated twice in two independent checkouts (different paths, +# different temp dirs) and asserts that ALL determinism-sensitive outputs +# match - not just the final tarball. 
Specifically: +# +# - cuprate---src.tar.gz (deterministic source archive) +# - cuprated--.tar.gz (release artifact) +# - build-metadata.json (records SOURCE_DATE_EPOCH, distsrc hash) +# - rustc-version.txt (proves manifest pinned the same rustc) +# - cargo-version.txt (proves manifest pinned the same cargo) +# - guix-describe.json (proves time-machine pinned the same +# channel instance) +# +# Also greps the build log for `-march=native`, `-mcpu=native`, and +# `target-cpu=native` and FAILS if any of them appear - protection against a +# future cc-rs / rustc / CMake change quietly re-introducing host-CPU codegen. + +repo_root="$(git rev-parse --show-toplevel)" +tmp="$(mktemp -d "${TMPDIR:-/tmp}/cuprate-smoke.XXXXXX")" + +# Where to drop the per-run build logs so a later CI step (or a curious +# operator) can scan them. The smoke script is typically invoked under +# `sudo` in CI, which makes $tmp root-owned mode 0700 - so the later +# non-sudo workflow step that greps for native-arch flags can't read +# anything in /tmp/cuprate-smoke.*. Exporting the logs to a workspace- +# relative path and chmod a+rX fixes that for both success and failure +# without changing the failure-preserves-tmp behaviour. +LOG_EXPORT_DIR="${LOG_EXPORT_DIR:-$repo_root/contrib/guix/smoke-logs}" + +# Preserve the working tree on failure so the user can dig through logs and +# intermediate artifacts. On success, clean up. Always export the build +# logs first so the assert step (in this script and at the workflow level) +# has something to scan even on success. 
#
# Export failures (mkdir/cp/chmod) emit warnings rather than fail the
# trap, because:
# - the trap is firing on exit; its job is to preserve forensic state,
#   not to surface a NEW failure that obscures the original one;
# - the native-flag scan in this script runs on `$tmp/$sub/...` BEFORE
#   cleanup, so the primary guard does not depend on the workspace
#   copy; the export is purely for the workflow-level duplicate scan;
# - we still want a visible signal if the workflow-level scan is about
#   to silently degrade to "no logs found".
#
# Globals: tmp, LOG_EXPORT_DIR (read)
on_exit() {
  # `$?` must be captured by the very first command in the handler.
  local code=$?
  # Declare every loop/scratch variable local so the trap does not leak
  # globals into the (exiting) script.
  local sub inner cargo_log src_log
  if [[ -d "$tmp" ]]; then
    if ! mkdir -p "$LOG_EXPORT_DIR"; then
      echo "warning: could not create smoke log export dir: $LOG_EXPORT_DIR" >&2
    else
      for sub in a b; do
        # cargo's --verbose build log (this is what the workflow scans
        # for native-arch flags).
        cargo_log="$tmp/$sub/contrib/guix/out/build-x86_64-unknown-linux-gnu.log"
        if [[ -f "$cargo_log" ]]; then
          if ! cp "$cargo_log" "$LOG_EXPORT_DIR/build-$sub.log"; then
            echo "warning: could not export smoke log $cargo_log -> $LOG_EXPORT_DIR/build-$sub.log" >&2
          fi
        fi
        # The smoke script also captures guix-build's wrapper output and
        # mk-distsrc's stderr - these contain the actual failure
        # explanation when the build dies before producing the cargo
        # log (e.g. distsrc-equivalence diff, OOM-killed rustc,
        # substitute fetch failure). Export them too so a workflow
        # `upload-artifact` step can collect the full forensic set.
        for inner in build.log mk-distsrc.log; do
          src_log="$tmp/$sub/$inner"
          if [[ -f "$src_log" ]]; then
            if ! cp "$src_log" "$LOG_EXPORT_DIR/$sub-$inner"; then
              echo "warning: could not export smoke log $src_log -> $LOG_EXPORT_DIR/$sub-$inner" >&2
            fi
          fi
        done
      done
      if ! chmod -R a+rX "$LOG_EXPORT_DIR" 2>/dev/null; then
        echo "warning: could not chmod a+rX $LOG_EXPORT_DIR (workflow log scan may be unable to read it)" >&2
      fi
    fi
  fi
  if [[ $code -ne 0 ]]; then
    echo "smoke FAILED; preserving working dir for inspection: $tmp" >&2
  else
    rm -rf "$tmp"
  fi
}
trap on_exit EXIT

# Clone the current HEAD into a fresh directory, build it end to end, and
# print the sha256 of the resulting release tarball on stdout.
# Arguments: $1 - directory to clone into (must not exist yet)
# Outputs:   artifact sha256 on stdout; diagnostics on stderr
# Returns:   non-zero if any stage fails or no artifact was produced
run_once() {
  local src="$1"
  local commit
  commit="$(git -C "$repo_root" rev-parse HEAD)"

  # stdout is reserved for the hash, so git's chatter goes to stderr.
  git clone --quiet --no-local "$repo_root" "$src" >&2
  git -C "$src" checkout --quiet --detach "$commit" >&2

  (
    cd "$src"
    local distsrc artifact
    # guix-mk-distsrc prints the archive path as its last stdout line.
    distsrc="$(./contrib/guix/guix-mk-distsrc x86_64-linux 2>"$src/mk-distsrc.log" | tail -n1)"
    ./contrib/guix/guix-build \
      --guix-system x86_64-linux \
      --target x86_64-unknown-linux-gnu \
      --package cuprated \
      --distsrc "$distsrc" >"$src/build.log" 2>&1

    artifact="$({ find contrib/guix/out -maxdepth 1 -type f -name 'cuprated-*-x86_64-unknown-linux-gnu.tar.gz' | LC_ALL=C sort | tail -n1; })"
    if [[ -z "$artifact" || ! -f "$artifact" ]]; then
      # guix-build returned 0 (we're past `set -e`) but didn't write an
      # artifact - print enough state to diagnose without leaving the
      # caller to guess. This is the classic "build silently succeeded
      # without producing output" failure mode; the immediate diagnostic
      # is the tail of build.log, which the on_exit trap also exports to
      # LOG_EXPORT_DIR for the workflow to archive.
      echo "FAIL: guix-build exited 0 but no cuprated artifact found in $src/contrib/guix/out" >&2
      echo "--- contents of $src/contrib/guix/out ---" >&2
      ls -la "$src/contrib/guix/out" >&2 || true
      echo "--- last 100 lines of $src/build.log ---" >&2
      tail -100 "$src/build.log" >&2 || true
      echo "--- last 50 lines of $src/mk-distsrc.log ---" >&2
      tail -50 "$src/mk-distsrc.log" >&2 || true
      return 1
    fi
    sha256sum "$artifact" | awk '{print $1}'
  )
}

# Mechanical regression guard: native-arch flags should never appear in the
# build log. Catches both the obvious (-march=native, -mcpu=native) and the
# rustc form (target-cpu=native) that any future cc-rs crate or rustc config
# could re-introduce.
#
# NOTE: the `--` before "$pat" is load-bearing. The regex begins with `-m`,
# which grep otherwise parses as the `-m` (max-count) option, resulting in
# `grep: invalid max count` and a non-zero exit that the `if` block
# silently treats as "not found" - i.e. the guard becomes fail-open.
NATIVE_FLAG_REGEX='-march=native|-mcpu=native|target-cpu=native'

# Fail if the build log of one run mentions a host-CPU-native flag.
# Arguments: $1 - checkout directory of the run (e.g. "$tmp/a")
# Returns:   0 if the log exists and is clean; 1 if the log is missing or
#            empty, a flag was found, or the scan itself could not run
assert_no_native_flags() {
  local src="$1"
  local log="$src/contrib/guix/out/build-x86_64-unknown-linux-gnu.log"
  local rc=0
  if [[ ! -s "$log" ]]; then
    echo "FAIL: build log is missing or empty: $log" >&2
    return 1
  fi
  # grep exits 0 on a match, 1 on no match and >1 on error. Tell the last
  # two apart so that a scan that could not run is reported as a failure
  # instead of being mistaken for a clean log.
  grep -E -- "$NATIVE_FLAG_REGEX" "$log" >/dev/null 2>&1 || rc=$?
  case "$rc" in
    0)
      echo "FAIL: host-CPU-native build flag detected in $src build log:" >&2
      # `head -5 >&2`: redirect head's output to stderr, not grep's
      # (otherwise head reads from an empty pipe and prints nothing).
      # `|| true`: head closing the pipe early must not turn into a
      # different exit status under pipefail.
      grep -nE -- "$NATIVE_FLAG_REGEX" "$log" | head -5 >&2 || true
      return 1
      ;;
    1)
      return 0
      ;;
    *)
      echo "FAIL: could not scan build log for native-arch flags (grep exit $rc): $log" >&2
      return 1
      ;;
  esac
}

# Positive self-test for the regex/guard: a known-bad line MUST trip the
# grep. If this ever stops firing, the regression guard is broken (the
# leading `-m` parses as an option without `--`/`-e`).
native_flag_selftest() {
  local tmpfile
  tmpfile="$(mktemp "${TMPDIR:-/tmp}/native-flag-selftest.XXXXXX")"
  printf '%s\n' 'cc -march=native foo.c' > "$tmpfile"
  if ! grep -E -- "$NATIVE_FLAG_REGEX" "$tmpfile" >/dev/null 2>&1; then
    echo "FAIL: native-flag guard self-test failed; regex is broken" >&2
    rm -f "$tmpfile"
    return 1
  fi
  rm -f "$tmpfile"
}

native_flag_selftest

h1="$(run_once "$tmp/a")"
h2="$(run_once "$tmp/b")"

# Compare every reproducibility-sensitive output, not just the final binary.
# Arguments: $1 - file name relative to contrib/guix/out in both runs
# Returns:   0 if both copies exist and are byte-identical, 1 otherwise
compare() {
  local rel="$1"
  local a="$tmp/a/contrib/guix/out/$rel"
  local b="$tmp/b/contrib/guix/out/$rel"
  if [[ ! -f "$a" || ! -f "$b" ]]; then
    echo "FAIL: missing comparison file '$rel' in one of the runs" >&2
    return 1
  fi
  if ! cmp -s "$a" "$b"; then
    echo "FAIL: '$rel' differs between runs:" >&2
    diff -u "$a" "$b" | head -50 >&2 || true
    return 1
  fi
}

# Sort before picking so the choice is deterministic (same idiom as the
# artifact lookup in run_once) instead of depending on directory order.
distsrc_basename="$(basename "$(find "$tmp/a/contrib/guix/out" -maxdepth 1 -type f -name 'cuprate-*-src.tar.gz' | LC_ALL=C sort | tail -n1)")"
artifact_basename="cuprated-$(awk -F'\"' '/"version"/{print $4; exit}' "$tmp/a/contrib/guix/out/build-metadata.json")-x86_64-unknown-linux-gnu.tar.gz"

compare "$distsrc_basename"
compare "$artifact_basename"
compare "build-metadata.json"
compare "rustc-version.txt"
compare "cargo-version.txt"
compare "guix-describe.json"

assert_no_native_flags "$tmp/a"
assert_no_native_flags "$tmp/b"

[[ "$h1" == "$h2" ]] || { echo "FAIL: final artifact sha mismatch ($h1 vs $h2)" >&2; exit 1; }
echo "reproducibility smoke test: PASS ($h1)"