|
name: Update mirror list

on:
  schedule:
    # Every Monday at 09:07 UTC.
    #
    # FIX: cron cannot express "first Monday of the month" in one entry.
    # When BOTH day-of-month and day-of-week are restricted, standard cron
    # (and GitHub Actions) fires when EITHER field matches, so the previous
    # "7 9 1-7 * 1" actually ran on days 1-7 of every month AND on every
    # Monday. Weekly is the closest single-expression schedule; the job is
    # idempotent (it only opens a PR when the list changed), so the extra
    # runs are harmless.
    - cron: "7 9 * * 1"
  workflow_dispatch:
    inputs:
      dry_run:
        description: "Dry run (show changes without creating a PR)"
        type: boolean
        default: false

# contents: write  -> push the update branch
# pull-requests: write -> open the PR with `gh pr create`
permissions:
  contents: write
  pull-requests: write
jobs:
  update-mirrors:
    name: Check Wikipedia for mirror changes
    runs-on: ubuntu-latest
    steps:
      # Full checkout of the default branch so the later step can commit
      # mirrors.txt and push a new branch for the pull request.
      - uses: actions/checkout@v4
| 25 | + - name: Fetch and parse mirror list from Wikipedia |
| 26 | + id: check |
| 27 | + run: | |
| 28 | + WIKIPEDIA_API="https://en.wikipedia.org/w/api.php?action=parse&page=Archive.today&prop=text&format=json" |
| 29 | +
|
| 30 | + echo "Fetching Wikipedia article..." |
| 31 | + curl --silent --fail --max-time 30 --location "$WIKIPEDIA_API" -o /tmp/wiki.json |
| 32 | +
|
| 33 | + # Extract <li>archive.TLD</li> entries from the infobox HTML. |
| 34 | + # Writing to a temp file avoids any shell-quoting issues with the JSON. |
| 35 | + python3 - /tmp/wiki.json <<'PYEOF' > /tmp/new_domains.txt |
| 36 | + import sys, json, re |
| 37 | +
|
| 38 | + with open(sys.argv[1]) as f: |
| 39 | + html = json.load(f)['parse']['text']['*'] |
| 40 | +
|
| 41 | + found = re.findall(r'<li>(archive\.[a-z]{2,6})</li>', html) |
| 42 | + seen = set() |
| 43 | + for d in found: |
| 44 | + if d not in seen: |
| 45 | + seen.add(d) |
| 46 | + print(d) |
| 47 | + PYEOF |
| 48 | +
|
| 49 | + if [[ ! -s /tmp/new_domains.txt ]]; then |
| 50 | + echo "ERROR: No domains extracted from Wikipedia article" >&2 |
| 51 | + exit 1 |
| 52 | + fi |
| 53 | +
|
| 54 | + # Ensure archive.today is first; preserve Wikipedia article ordering for the rest. |
| 55 | + { |
| 56 | + echo "archive.today" |
| 57 | + grep -v '^archive\.today$' /tmp/new_domains.txt |
| 58 | + } > /tmp/ordered_domains.txt |
| 59 | +
|
| 60 | + echo "Domains found on Wikipedia:" |
| 61 | + cat /tmp/ordered_domains.txt |
| 62 | +
|
| 63 | + # Compare sorted sets (order differences don't constitute an update). |
| 64 | + CURRENT_SORTED="$(grep -E '^[^#[:space:]]' mirrors.txt | sed 's/[[:space:]]*//' | sort)" |
| 65 | + NEW_SORTED="$(sort /tmp/ordered_domains.txt)" |
| 66 | +
|
| 67 | + if [[ "$CURRENT_SORTED" == "$NEW_SORTED" ]]; then |
| 68 | + echo "up_to_date=true" >> "$GITHUB_OUTPUT" |
| 69 | + echo "No changes detected — mirrors.txt is up to date." |
| 70 | + else |
| 71 | + echo "up_to_date=false" >> "$GITHUB_OUTPUT" |
| 72 | + echo "Changes detected:" |
| 73 | + diff <(echo "$CURRENT_SORTED") <(echo "$NEW_SORTED") \ |
| 74 | + | grep '^[<>]' | sed 's/^< / removed: /; s/^> / added: /' || true |
| 75 | +
|
| 76 | + # Save the diff before overwriting mirrors.txt — the Create pull request |
| 77 | + # step runs in a separate shell and mirrors.txt will already be updated by then. |
| 78 | + DIFF_SUMMARY="$(diff \ |
| 79 | + <(echo "$CURRENT_SORTED") \ |
| 80 | + <(echo "$NEW_SORTED") \ |
| 81 | + | grep '^[<>]' | sed 's/^< /- removed: /; s/^> /+ added: /' || echo 'See diff')" |
| 82 | + echo "diff_summary<<EOF" >> "$GITHUB_OUTPUT" |
| 83 | + echo "$DIFF_SUMMARY" >> "$GITHUB_OUTPUT" |
| 84 | + echo "EOF" >> "$GITHUB_OUTPUT" |
| 85 | +
|
| 86 | + # Write updated mirrors.txt |
| 87 | + { |
| 88 | + echo "# archive-resolver mirror list" |
| 89 | + echo "# Format: one domain per line. Lines starting with # are comments." |
| 90 | + echo "# The first non-comment line is the primary domain (others become symlinks)." |
| 91 | + echo "#" |
| 92 | + echo "# Source: https://en.wikipedia.org/wiki/Archive.today" |
| 93 | + printf "# Updated: %s\n" "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" |
| 94 | + cat /tmp/ordered_domains.txt |
| 95 | + } > mirrors.txt |
| 96 | + fi |
| 97 | +
|
| 98 | + - name: Create pull request |
| 99 | + if: steps.check.outputs.up_to_date == 'false' && inputs.dry_run != 'true' |
| 100 | + env: |
| 101 | + GH_TOKEN: ${{ github.token }} |
| 102 | + run: | |
| 103 | + DATE="$(date -u '+%Y-%m-%d')" |
| 104 | + BRANCH="chore/update-mirrors-${DATE}" |
| 105 | +
|
| 106 | + git config user.name "github-actions[bot]" |
| 107 | + git config user.email "github-actions[bot]@users.noreply.github.com" |
| 108 | + git checkout -b "$BRANCH" |
| 109 | + git add mirrors.txt |
| 110 | + git commit -m "chore: update mirror list from Wikipedia (${DATE})" |
| 111 | + git push origin "$BRANCH" |
| 112 | +
|
| 113 | + DIFF_SUMMARY="${{ steps.check.outputs.diff_summary }}" |
| 114 | + WIKI_URL="https://en.wikipedia.org/wiki/Archive.today" |
| 115 | + gh pr create \ |
| 116 | + --title "chore: update mirror list from Wikipedia (${DATE})" \ |
| 117 | + --body "Automated update of \`mirrors.txt\` based on the |
| 118 | + current [Archive.today Wikipedia article](${WIKI_URL}). |
| 119 | +
|
| 120 | + **Changes:** |
| 121 | + \`\`\` |
| 122 | + ${DIFF_SUMMARY} |
| 123 | + \`\`\` |
| 124 | +
|
| 125 | + **Review checklist:** |
| 126 | + - [ ] Verify removed domains are genuinely discontinued mirrors |
| 127 | + - [ ] Verify added domains are genuine archive.today mirrors |
| 128 | + - [ ] Confirm \`archive.today\` remains the first (primary) entry" \ |
| 129 | + --label "automated" \ |
| 130 | + --head "$BRANCH" \ |
| 131 | + --base main |
| 132 | +
|
| 133 | + - name: Dry run summary |
| 134 | + if: inputs.dry_run == 'true' |
| 135 | + run: | |
| 136 | + if [[ "${{ steps.check.outputs.up_to_date }}" == "true" ]]; then |
| 137 | + echo "Dry run: mirrors.txt is already up to date." |
| 138 | + else |
| 139 | + echo "Dry run: changes detected. A PR would be created if dry_run=false." |
| 140 | + echo "Updated mirror list:" |
| 141 | + cat /tmp/ordered_domains.txt |
| 142 | + fi |
0 commit comments