Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/tier1-benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ jobs:
--memory=4Gi \
--cpu=2 \
--service-account="$SERVICE_ACCOUNT" \
--set-env-vars="INTERACTION_DB_REPO=${INTERACTION_DB_REPO},INTERACTION_DB_TAG=${INTERACTION_DB_TAG},INTERACTION_DB_SHA256=${INTERACTION_DB_SHA256},BENCHMARK_FORCE_OS_EXIT=1" \
--set-env-vars="INTERACTION_DB_REPO=${INTERACTION_DB_REPO},INTERACTION_DB_TAG=${INTERACTION_DB_TAG},INTERACTION_DB_SHA256=${INTERACTION_DB_SHA256},BENCHMARK_FORCE_OS_EXIT=1,BENCHMARK_GIT_COMMIT=${GITHUB_SHA}" \
--set-secrets="HF_TOKEN=HF_TOKEN:latest" \
--args="$ARGS"

Expand Down Expand Up @@ -285,7 +285,7 @@ jobs:
"run_id": manifest.get("run_id") == os.environ["RUN_ID"],
"dataset_revision": manifest.get("dataset_revision") == os.environ["DATASET_REVISION"],
"sample_size": 0 < sample_size <= expected_limit,
"git_commit": bool(manifest.get("git_commit")),
"git_commit": manifest.get("git_commit") not in ("", "0000000", None),
"model_ids": bool(manifest.get("model_ids", {}).get("ner")),
"ddinter_repo": manifest.get("ddinter_db", {}).get("repo") == os.environ["INTERACTION_DB_REPO"].strip("\r\n"),
"ddinter_tag": manifest.get("ddinter_db", {}).get("tag") == os.environ["INTERACTION_DB_TAG"].strip("\r\n"),
Expand Down
3 changes: 3 additions & 0 deletions eval/run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -787,6 +787,9 @@ def ddinter_metadata_from_args(args: argparse.Namespace) -> dict:


def _git_commit() -> str:
env_commit = os.environ.get("BENCHMARK_GIT_COMMIT") or os.environ.get("GITHUB_SHA")
if env_commit:
return env_commit
try:
completed = subprocess.run(
["git", "rev-parse", "HEAD"],
Expand Down
7 changes: 7 additions & 0 deletions tests/eval/test_run_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,13 @@ def test_manifest_includes_ddinter_release_metadata():
assert manifest["concurrency"] == 8


def test_git_commit_uses_benchmark_env_when_git_metadata_missing(monkeypatch):
    """Check that ``_git_commit`` returns the ``BENCHMARK_GIT_COMMIT`` value.

    ``subprocess.run`` is replaced with a stub that raises ``RuntimeError``
    whenever it is called, and the environment variable is set to a known
    SHA; the test then asserts that ``_git_commit`` returns that exact SHA.
    """
    expected_sha = "e9f24645baf2d64a0504ee5b03c7af9f767b74cb"

    def _raise_runtime_error(*args, **kwargs):
        # Stand-in for subprocess.run: any invocation raises RuntimeError.
        raise RuntimeError

    monkeypatch.setenv("BENCHMARK_GIT_COMMIT", expected_sha)
    monkeypatch.setattr(run_benchmark.subprocess, "run", _raise_runtime_error)

    resolved = run_benchmark._git_commit()
    assert resolved == expected_sha


@pytest.mark.asyncio
async def test_ensure_ddinter_db_downloads_from_github_release(monkeypatch, tmp_path: Path):
checks = iter([False, True])
Expand Down
2 changes: 2 additions & 0 deletions tests/test_tier1_benchmark_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,15 @@ def test_tier1_benchmark_workflow_contract():
assert "INTERACTION_DB_TAG" in workflow
assert "INTERACTION_DB_SHA256" in workflow
assert "BENCHMARK_FORCE_OS_EXIT=1" in workflow
assert "BENCHMARK_GIT_COMMIT=${GITHUB_SHA}" in workflow
assert 'tr -d "\\r\\n"' in workflow
assert 'INTERACTION_DB_REPO="$(printf' in workflow
assert 'INTERACTION_DB_TAG="$(printf' in workflow
assert "--record-timeout-seconds" in workflow
assert "hf://buckets/SPerva/pillchecker-experiments" in workflow
assert 'bucket = os.environ["BENCHMARK_BUCKET"]' in workflow
assert "0 < sample_size <= expected_limit" in workflow
assert 'manifest.get("git_commit") not in ("", "0000000", None)' in workflow
assert "len(predictions) == sample_size" in workflow

for artifact in ("manifest.json", "results.json", "predictions.jsonl", "errors.jsonl", "summary.md"):
Expand Down
Loading