From 53babe65b15b7ee0ca48739fff78a38412349e38 Mon Sep 17 00:00:00 2001 From: Jamie Lentin Date: Fri, 29 May 2026 15:45:52 +0000 Subject: [PATCH 1/2] dvc: Don't track data/js_output, always regenerate The .gz files in data/js_output/ aren't stable, so we can't use them as part of a pipeline, as we'll constantly regenerate and upload new ones. Instead, mark the stage as always_changed, meaning it'll get re-run even on repro --pull. --- dvc.lock | 6 ------ dvc.yaml | 3 +-- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/dvc.lock b/dvc.lock index a94607e..cc7581b 100644 --- a/dvc.lock +++ b/dvc.lock @@ -262,12 +262,6 @@ stages: md5: 81dfa9c3747bbe94c57161cc8e22d5c9.dir size: 1158499655 nfiles: 7 - outs: - - path: data/js_output/ - hash: md5 - md5: 9782411b79a64d6da09afacae2f3f047.dir - size: 8297912 - nfiles: 6 discover_latest_wikidata_dump_url: cmd: discover_latest_wikidata_dump_url > data/Wiki/wd_JSON/latest-all-json-bz2-url.txt diff --git a/dvc.yaml b/dvc.yaml index 6e2a357..94480a2 100644 --- a/dvc.yaml +++ b/dvc.yaml @@ -197,5 +197,4 @@ stages: - make_js_treefiles --outdir data/js_output deps: - data/output_files/ - outs: - - data/js_output/ + always_changed: true From 2b95e00bc7d09cb9983a79baa3bee750e6ad44f5 Mon Sep 17 00:00:00 2001 From: Jamie Lentin Date: Fri, 29 May 2026 16:15:03 +0000 Subject: [PATCH 2/2] .github: Freeze make_js_treefiles before checking status make_js_treefiles is always_changed, so unless we ignore it the test will always fail. --- .github/workflows/tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index f78dba0..26dd334 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -43,6 +43,7 @@ jobs: [ -n "${{ secrets.DVC_SECRET_ACCESS_KEY }}" ] || { echo "DVC_SECRET_ACCESS_KEY secret is not set"; exit 1; } dvc remote modify onezoom-r2 access_key_id ${{ secrets.DVC_ACCESS_KEY_ID }} dvc remote modify onezoom-r2 secret_access_key ${{ 
secrets.DVC_SECRET_ACCESS_KEY }} + dvc freeze make_js_treefiles dvc repro --allow-missing --dry | tee /dev/stderr | grep -q "Data and pipelines are up to date." if dvc data status --not-in-remote | grep -q "Not in remote"; then exit 1; fi test: