Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 1 addition & 36 deletions .github/workflows/slo-report.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,42 +16,7 @@ jobs:
pull-requests: write
steps:
- name: Publish YDB SLO Report
uses: ydb-platform/ydb-slo-action/report@13c687b7d4b2879da79dd12932dee0ed2b65dd1c
uses: ydb-platform/ydb-slo-action/report@v2
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
github_run_id: ${{ github.event.workflow_run.id }}

remove-slo-label:
if: always() && github.event.workflow_run.event == 'pull_request'
name: Remove SLO Label
needs: ydb-slo-action-report
runs-on: ubuntu-latest
permissions:
pull-requests: write
steps:
- name: Remove SLO label from PR
uses: actions/github-script@v7
with:
script: |
const pullRequests = context.payload.workflow_run.pull_requests;
if (pullRequests && pullRequests.length > 0) {
for (const pr of pullRequests) {
try {
await github.rest.issues.removeLabel({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: pr.number,
name: 'SLO'
});
console.log(`Removed SLO label from PR #${pr.number}`);
} catch (error) {
if (error.status === 404) {
console.log(`SLO label not found on PR #${pr.number}, skipping`);
} else {
throw error;
}
}
}
} else {
console.log('No pull requests associated with this workflow run');
}
165 changes: 32 additions & 133 deletions .github/workflows/slo.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,14 @@ jobs:
strategy:
fail-fast: false
matrix:
include:
- id: sync-table
prefix: table
workload: sync-table
- id: sync-query
prefix: table
workload: sync-query
sdk:
- name: sync-table
command: "--read-rps ${{ inputs.slo_workload_read_max_rps || '1000' }} --write-rps ${{ inputs.slo_workload_write_max_rps || '100' }}"
- name: sync-query
command: "--read-rps ${{ inputs.slo_workload_read_max_rps || '1000' }} --write-rps ${{ inputs.slo_workload_write_max_rps || '100' }}"

concurrency:
group: slo-${{ github.ref }}-${{ matrix.workload }}
group: slo-${{ github.ref }}-${{ matrix.sdk.name }}
cancel-in-progress: true

steps:
Expand Down Expand Up @@ -84,7 +82,7 @@ jobs:
docker compose version

- name: Checkout current version
uses: actions/checkout@v5
uses: actions/checkout@v6
with:
path: current
fetch-depth: 0
Expand Down Expand Up @@ -118,16 +116,10 @@ jobs:
echo "ref=$BASELINE_REF" >> $GITHUB_OUTPUT

- name: Checkout baseline version
uses: actions/checkout@v5
uses: actions/checkout@v6
with:
ref: ${{ steps.baseline.outputs.sha }}
path: baseline
fetch-depth: 1

- name: Show Docker versions
run: |
docker --version
docker compose version

- name: Build workload images (current + baseline)
run: |
Expand All @@ -141,125 +133,32 @@ jobs:
-t "ydb-app-baseline" \
"$GITHUB_WORKSPACE/baseline"

- name: Initialize YDB SLO
id: ydb_slo
uses: ydb-platform/ydb-slo-action/init@13c687b7d4b2879da79dd12932dee0ed2b65dd1c
- name: Run SLO Tests
uses: ydb-platform/ydb-slo-action/init@v2
timeout-minutes: 30
with:
github_issue: ${{ github.event.pull_request.number || inputs.github_issue }}
github_issue: ${{ github.event.inputs.github_issue }}
github_token: ${{ secrets.GITHUB_TOKEN }}
workload_name: ydb-python-${{ matrix.workload }}
workload_name: ${{ matrix.sdk.name }}
workload_duration: ${{ inputs.slo_workload_duration_seconds || '600' }}
workload_current_ref: ${{ github.head_ref || github.ref_name }}
workload_current_image: ydb-app-current
workload_current_command: ${{ matrix.sdk.command }}
workload_baseline_ref: ${{ steps.baseline.outputs.ref }}

- name: Prepare SLO Database
run: |
docker run --rm \
--network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
-e "WORKLOAD=${{ matrix.workload }}" \
-e "REF=${{ github.head_ref || github.ref_name }}" \
ydb-app-current \
${{ matrix.prefix }}-create grpc://ydb:2136 /Root/testdb

- name: Run SLO Tests (current + baseline in parallel)
timeout-minutes: 15
env:
WORKLOAD: ${{ matrix.workload }}
DURATION: ${{ inputs.slo_workload_duration_seconds || 600 }}
READ_RPS: ${{ inputs.slo_workload_read_max_rps || 1000 }}
WRITE_RPS: ${{ inputs.slo_workload_write_max_rps || 100 }}
CURRENT_REF: ${{ github.head_ref || github.ref_name }}
BASELINE_REF: ${{ steps.baseline.outputs.ref }}
run: |
ARGS="${{ matrix.prefix }}-run grpc://ydb:2136 /Root/testdb \
--otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics \
--report-period 250 \
--time ${DURATION} \
--read-rps ${READ_RPS} \
--write-rps ${WRITE_RPS} \
--read-timeout 1000 \
--write-timeout 1000"

echo "Starting current workload (ref=${CURRENT_REF}, workload=${WORKLOAD})..."
docker run -d \
--name ydb-app-current \
--network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
-e "REF=${CURRENT_REF}" \
-e "WORKLOAD=${WORKLOAD}" \
ydb-app-current \
$ARGS

echo "Starting baseline workload (ref=${BASELINE_REF}, workload=${WORKLOAD})..."
docker run -d \
--name ydb-app-baseline \
--network ydb_ydb-net \
--add-host "ydb:172.28.0.11" \
--add-host "ydb:172.28.0.12" \
--add-host "ydb:172.28.0.13" \
--add-host "ydb:172.28.0.99" \
-e "REF=${BASELINE_REF}" \
-e "WORKLOAD=${WORKLOAD}" \
ydb-app-baseline \
$ARGS

echo ""
echo "==================== INITIAL CURRENT LOGS ===================="
docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
echo ""
echo "==================== INITIAL BASELINE LOGS ===================="
docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
echo ""

echo "Waiting for workloads to complete (${DURATION}s)..."
sleep ${DURATION}

echo "Stopping containers after ${DURATION}s..."
docker stop --timeout=30 ydb-app-current ydb-app-baseline 2>&1 || true

# Force kill if still running
docker kill ydb-app-current ydb-app-baseline 2>&1 || true

# Check exit codes
CURRENT_EXIT=$(docker inspect ydb-app-current --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")
BASELINE_EXIT=$(docker inspect ydb-app-baseline --format='{{.State.ExitCode}}' 2>/dev/null || echo "1")

echo "Current exit code: ${CURRENT_EXIT}"
echo "Baseline exit code: ${BASELINE_EXIT}"

echo ""
echo "==================== FINAL CURRENT LOGS ===================="
docker logs -n 15 ydb-app-current 2>&1 || echo "No current container"
echo ""
echo "==================== FINAL BASELINE LOGS ===================="
docker logs -n 15 ydb-app-baseline 2>&1 || echo "No baseline container"
echo ""

if [[ "${CURRENT_EXIT}" != "0" || "${BASELINE_EXIT}" != "0" ]]; then
echo "One or both workloads failed."
exit 0
fi

echo "SUCCESS: Workloads completed successfully"

- if: always()
name: Store logs
run: |
docker logs ydb-app-current > current.log 2>&1 || echo "No current container" > current.log
docker logs ydb-app-baseline > baseline.log 2>&1 || echo "No baseline container" > baseline.log

- if: always()
name: Upload logs
uses: actions/upload-artifact@v4
workload_baseline_image: ydb-app-current
workload_baseline_command: ${{ matrix.sdk.command }}

ydb-slo-action-report:
runs-on: ubuntu-latest
name: Publish YDB SLO Report
needs: ydb-slo-action
permissions:
checks: write
contents: read
pull-requests: write
steps:
- name: Publish YDB SLO Report
uses: ydb-platform/ydb-slo-action/report@v2
with:
name: ydb-python-${{ matrix.workload }}-logs
path: |
./current.log
./baseline.log
retention-days: 1
github_token: ${{ secrets.GITHUB_TOKEN }}
github_run_id: ${{ github.run_id }}
51 changes: 27 additions & 24 deletions AGENTS.md
Original file line number Diff line number Diff line change
Expand Up @@ -90,44 +90,47 @@ To regenerate protobuf stubs: see `Makefile` and `generate-protobuf.Dockerfile`.

---

## Topic Chaos Testing (SLO)
## SLO Testing

Run this only for changes that affect topic reader/writer reconnection logic.
Run this for changes that affect topic/table reader/writer reconnection logic.

**1. Start YDB with chaos** (kills a DB node every ~20 seconds):
```sh
docker compose -f tests/slo/playground/configs/compose.yaml up -d
```
### Docker Compose (full stack)

**2. Wait until YDB is healthy:**
```sh
docker ps --format "table {{.Names}}\t{{.Status}}" | grep ydb
```
Uses [ydb-slo-action](https://github.com/ydb-platform/ydb-slo-action/tree/v2/deploy) infra (YDB cluster + Prometheus + workload in one command).

**3. Create a test topic** (from `tests/slo/` directory):
From `tests/slo/` directory:
```sh
source .venv/bin/activate
python ./src topic-create grpc://localhost:2135 /Root/testdb \
--path /Root/testdb/slo_topic --debug
WORKLOAD_NAME=topic ./slo_runner.sh
WORKLOAD_NAME=sync-query ./slo_runner.sh
```

**4. Test writer** (60 sec):
Override defaults via env vars: `RUN_TIME_SEC`, `WRITE_RPS`, `READ_THREADS`, `WRITE_THREADS`, `MESSAGE_SIZE`, `DEBUG=1`.

### Local run (against your own YDB)

**1. Start playground cluster:**
```sh
python ./src topic-run grpc://localhost:2135 /Root/testdb \
--path /Root/testdb/slo_topic --otlp-endpoint "" \
--read-threads 0 --write-rps 1 --time 60 --debug
docker compose -f tests/slo/playground/configs/compose.yaml up -d
```

**5. Test reader** (60 sec):
**2. Run workload** (from `tests/slo/` directory):
```sh
python ./src topic-run grpc://localhost:2135 /Root/testdb \
--path /Root/testdb/slo_topic --otlp-endpoint "" \
--read-rps 1 --write-threads 0 --time 60 --debug
source ../../.venv/bin/activate

# Topic workload (60 sec)
python ./src grpc://localhost:2136 /Root/testdb \
--workload-name topic --otlp-endpoint "" --time 60 --debug

# Table workload (60 sec)
python ./src grpc://localhost:2136 /Root/testdb \
--workload-name sync-query --otlp-endpoint "" --time 60 --debug
```

**6. Tear down:**
**3. Tear down:**
```sh
docker compose -f tests/slo/playground/configs/compose.yaml down
```

**Success criteria:** writer and reader reconnect automatically during node restarts with no fatal errors.
Full list of CLI arguments and environment variables: see `tests/slo/README.md` or run `python tests/slo/src --help`.

**Success criteria:** workload reconnects automatically during node restarts with no fatal errors.
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
[tool.ty.environment]
extra-paths = ["tests/slo/src"]

[tool.ruff]
line-length = 120

[tool.black]
line-length = 120

Expand Down
1 change: 1 addition & 0 deletions tests/slo/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.infra/
46 changes: 25 additions & 21 deletions tests/slo/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,38 +1,42 @@
# syntax=docker/dockerfile:1

# This image packages the Python SLO workload runner.
# It expects to be run with arguments like:
# docker run --rm <image> table-run <endpoint> <db> --otlp-endpoint http://prometheus:9090/api/v1/otlp/v1/metrics ...
#
# All settings are CLI arguments with env-var fallback (CLI arg > env var > default):
# endpoint $YDB_ENDPOINT grpc://ydb:2136
# db $YDB_DATABASE /Root/testdb
# --workload-name $WORKLOAD_NAME sync-query
# --workload-ref $WORKLOAD_REF / $REF main
# --otlp-endpoint $OTEL_EXPORTER_OTLP_ENDPOINT http://ydb-prometheus:9090/api/v1/otlp
# --time $WORKLOAD_DURATION 600
#
# Example:
# docker run --rm <image> grpc://ydb:2136 /Root/testdb --workload-name topic --time 120
#
# Notes:
# - OpenTelemetry 1.39.x requires Python >= 3.9.
# - The entrypoint is `python ./tests/slo/src`, i.e. it runs the `__main__.py`
# from that directory (same as `python tests/slo/src ...` in CI).

FROM python:3.11-slim AS build
FROM python:3.11-slim

ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1

WORKDIR /src
COPY . /src

# Install runtime deps into an isolated venv so we can copy it into the final stage.
RUN python -m venv /opt/venv \
&& /opt/venv/bin/python -m pip install --no-cache-dir --upgrade pip \
&& /opt/venv/bin/pip install --no-cache-dir . \
&& /opt/venv/bin/pip install --no-cache-dir -r tests/slo/requirements.txt
RUN apt-get update && apt-get install -y --no-install-recommends gcc libc6-dev && rm -rf /var/lib/apt/lists/*

WORKDIR /src

FROM python:3.11-slim
# 1. YDB SDK
COPY setup.py pyproject.toml README.md requirements.txt ./
COPY ydb/ ydb/
RUN pip install --no-cache-dir .

ENV PYTHONDONTWRITEBYTECODE=1 \
PYTHONUNBUFFERED=1 \
PATH="/opt/venv/bin:${PATH}"
# 2. SLO deps
COPY tests/slo/requirements.txt tests/slo/requirements.txt
RUN pip install --no-cache-dir -r tests/slo/requirements.txt

WORKDIR /app

COPY --from=build /opt/venv /opt/venv
COPY --from=build /src/tests/slo/src /app/tests/slo/src
# 3. Workload source
COPY tests/slo/src /src/tests/slo/src

ENTRYPOINT ["python", "./tests/slo/src"]

CMD ["--read-rps", "1000", "--write-rps", "100"]
Loading
Loading