Skip to content

Cache Spark download in CI to speed up builds #200

Cache Spark download in CI to speed up builds

Cache Spark download in CI to speed up builds #200

Workflow file for this run

name: PyDeequ V2 Tests
on:
  push:
    branches:
      - "**"
  pull_request:
    branches:
      - "master"
jobs:
  # V2 tests with Spark Connect (Python 3.12)
  v2-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Install uv
        uses: astral-sh/setup-uv@v4
      - uses: actions/setup-python@v5
        name: Install Python 3.12
        with:
          python-version: "3.12"
      - uses: actions/setup-java@v4
        name: Setup Java 17
        with:
          distribution: "corretto"
          java-version: "17"
      # Cache both the extracted Spark distribution and the Deequ JAR so the
      # two download steps below can be skipped entirely on a cache hit.
      - name: Cache Spark and Deequ JAR
        id: cache-spark
        uses: actions/cache@v4
        with:
          path: |
            spark-3.5.0-bin-hadoop3
            deequ_2.12-2.1.0b-spark-3.5.jar
          key: spark-3.5.0-deequ-2.1.0b
      # Only download when the cache step above reported a miss.
      - name: Download Spark 3.5
        if: steps.cache-spark.outputs.cache-hit != 'true'
        run: |
          curl -L -o spark-3.5.0-bin-hadoop3.tgz \
            https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
          tar -xzf spark-3.5.0-bin-hadoop3.tgz
          rm spark-3.5.0-bin-hadoop3.tgz
      - name: Download Deequ JAR
        if: steps.cache-spark.outputs.cache-hit != 'true'
        run: |
          curl -L -o deequ_2.12-2.1.0b-spark-3.5.jar \
            https://github.com/awslabs/python-deequ/releases/download/v2.0.0b1/deequ_2.12-2.1.0b-spark-3.5.jar
      # Export SPARK_HOME for later steps via the GITHUB_ENV file.
      - name: Set SPARK_HOME
        run: echo "SPARK_HOME=$PWD/spark-3.5.0-bin-hadoop3" >> $GITHUB_ENV
      - name: Install Python dependencies
        run: |
          uv pip install -e ".[dev]" --system
          uv pip install "pyspark[connect]==3.5.0" --system
      # Unit tests run without a Spark Connect server.
      - name: Run V2 unit tests
        run: |
          pytest tests/v2/test_unit.py -v
      # Start the Connect server with the Deequ JAR and its relation plugin,
      # wait for startup, then verify the server process is running.
      - name: Start Spark Connect Server
        run: |
          $SPARK_HOME/sbin/start-connect-server.sh \
            --packages org.apache.spark:spark-connect_2.12:3.5.0 \
            --jars ${{ github.workspace }}/deequ_2.12-2.1.0b-spark-3.5.jar \
            --conf spark.connect.extensions.relation.classes=com.amazon.deequ.connect.DeequRelationPlugin
          sleep 20
          ps aux | grep SparkConnectServer | grep -v grep
      # Integration tests connect to the server via SPARK_REMOTE;
      # the unit-test module is excluded since it already ran above.
      - name: Run V2 integration tests
        env:
          SPARK_REMOTE: "sc://localhost:15002"
        run: |
          pytest tests/v2/ -v --ignore=tests/v2/test_unit.py
      # Always attempt shutdown, even if earlier steps failed;
      # `|| true` keeps a missing/already-stopped server from failing the job.
      - name: Stop Spark Connect Server
        if: always()
        run: |
          $SPARK_HOME/sbin/stop-connect-server.sh || true