diff --git a/.dockerignore b/.dockerignore index 9596778..6f3b123 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,23 +1,73 @@ # configs and git .git/ +.gitattributes .gitignore +.github/ .idea/ +.cursor/ # python and docker .venv/ .env +.env.* +.env.example Dockerfile docker-compose.yml +dockerhub-publish.sh __pycache__/ +*.py[cod] +*$py.class +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +*.egg-info/ +dist/ +build/ +.ipynb_checkpoints -# data folders +# credentials and keys (never ship in image layers) +*.pem +*.key +*.p12 +*.pfx +**/id_rsa +**/id_rsa.pub +**/.ssh/ +*kubeconfig* +secrets/ +credentials/ + +# ops / deploy (runtime uses env vars, not these files) +k8s/ +notebooks/ +docs/ +tests/ +hooks/ +README.md +LICENSE +run.sh +geocoder_test.py +example.output.txt +requirements.txt +.python-version + +# data folders and local databases staging.db +staging.duckdb* data/ data-raw/ data-clean/ +data-private/ +data-public/ lib/data-private/ lib/data-public/ +# logs and temp +*.log +*.tmp + # macos .DS_Store -.Trash-0/ \ No newline at end of file +.Trash-0/ diff --git a/.env.example b/.env.example index 65d4a7b..df467ee 100644 --- a/.env.example +++ b/.env.example @@ -1,35 +1,73 @@ # Mode (level 1 or 2) +# 2 = full S3 publish (production default) MODE=2 +# PostgreSQL target schema (optional) +# Sets session search_path for the ETL run. Leave empty for public. +DB_SCHEMA= + +# Optional S3 key prefix for private/ and public/ paths (e.g. 
refactor/ for isolated runs) +S3_PREFIX= + +# Reprocess zip files from S3 private/ backups by filename glob +# Example: LandlordTenant.Incr.2024-*.zip +REPROCESS_GLOB= + +# When true, replay REPROCESS_GLOB matches even if already completed in etl_files manifest +FORCE_REPROCESS=false + +# When true, fail parse_xml and abort before export/promote if any case-level parse errors occur +PARSE_FAIL_FAST=false + +# When true, skip post-promote RDS public CSV export and S3 encryption normalization (reprocess throughput only) +SKIP_PUBLIC_PUBLISH=false + +# Geocoding and CSV tuning (optional; safe defaults preserve current behavior) +GEOCODE_WORKERS= +CENSUS_BATCH_CHUNK_SIZE=2500 +CSV_ROW_CHECK_CHUNK_SIZE=1000 + +# Parser → DuckDB write batching (Option A; enabled by default) +PARSE_WRITE_BATCH_ENABLED=1 +PARSE_WRITE_BATCH_SIZE=128 +PARSE_WRITE_FLUSH_EVERY_N_CASES=16 + +# PostgreSQL TCP keepalives (optional; reduce idle disconnects on long ETL runs) +# DB_KEEPALIVES=1 +# DB_KEEPALIVES_IDLE=60 +# DB_KEEPALIVES_INTERVAL=10 +# DB_KEEPALIVES_COUNT=5 + +# Long-running SQL timeout in milliseconds (default 3600000 = 1 hour) +# DB_STATEMENT_TIMEOUT_MS=3600000 + # The database URL # ---------------- # -# This is the postgres instance the parsed cases will -# load data into. -# -# If you use the Dockerfile you don't need to change this. Otherwise make this the remote +# PostgreSQL instance where parsed cases are loaded and promoted. +# With Docker Compose, point this at your RDS instance (or local db service). 
DATABASE_URL= +# Optional clone/sync target (legacy maintenance path) +# CLONED_DATABASE_URL= # Amazon Web Services (AWS) configuration # --------------------------------------- # -# These are used to move data to/from S3 bucket -# If you are using AWS Lambda you do not need to include AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY -# please configure https://docs.aws.amazon.com/lambda/latest/dg/lambda-intro-execution-role.html +# Used to move data to/from the S3 bucket and for RDS aws_s3 import/export. +# On ECS/Lambda you can omit keys and use an IAM role instead. AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_S3_BUCKET_NAME= - # OCA SFTP credentials # --------------------------------------- # -# These are used to download raw XML and CSV files +# Used to download new raw XML zip files from OCA SFTP_HOST= SFTP_USER= SFTP_PSWD= -SFTP_DIR= \ No newline at end of file +SFTP_DIR= diff --git a/.gitignore b/.gitignore index 8b20829..c296593 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,7 @@ data-public/ .DS_Store .Trash-0/ -staging.duckdb* \ No newline at end of file +staging.duckdb* + +k8s/*-kubeconfig.yaml +k8s/oca-etl-secret.yaml diff --git a/Dockerfile b/Dockerfile index 2728242..93ae26d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,11 @@ -FROM --platform=linux/amd64 python:3.12-slim-bookworm +FROM --platform=linux/amd64 python:3.12.13-slim-trixie ENV TZ=America/New_York # Update package lists and setup Python with uv RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y --no-install-recommends \ - openssh-client curl ca-certificates python3 unzip && \ + openssh-client curl ca-certificates unzip && \ rm -rf /var/lib/apt/lists/* ADD https://astral.sh/uv/install.sh /uv-installer.sh RUN sh /uv-installer.sh && rm /uv-installer.sh diff --git a/README.md b/README.md index 8817437..56cb787 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This work is licensed under a [Creative Commons Attribution-NonCommercial-ShareA ## CSV Files -[![Date 
Last Updated](https://oca-2-dev.s3.amazonaws.com/public/last-updated-shield.png)](https://oca-2-dev.s3.amazonaws.com/public/last-updated-date.txt) +[![Date Last Updated](https://oca-2-dev.s3.amazonaws.com/public/last-updated-shield.svg)](https://oca-2-dev.s3.amazonaws.com/public/last-updated-date.txt) * [`oca_index`](https://oca-2-dev.s3.amazonaws.com/public/oca_index.csv) * [`oca_causes`](https://oca-2-dev.s3.amazonaws.com/public/oca_causes.csv) @@ -43,36 +43,98 @@ The data we receive from OCA is an extract of all landlord and tenant cases in N ## About the code -For information about the details of various components, see [`/lib`](/lib) +The ETL pipeline lives under [`lib/`](lib/). See [`lib/README.md`](lib/README.md) for stage-by-stage architecture, module map, and SQL script roles. -### Local Setup +### Local setup -First, you will only be able to run this yourself if you have HDC's credentials to access to the SFTP to get the raw data transfered from OCA and access to the private AWS S3 where those files are stored. +You need credentials for OCA SFTP and the BetaNYC AWS for S3 (file storage) and RDS (PostgreSQL database), plus Docker and Docker Compose. -You will need Docker and Docker Compose. +Copy the example env file and fill in credentials: -First, you'll want to create an `.env` file by copying the example one: - -``` +```bash cp .env.example .env # Or 'copy .env.example .env' on Windows ``` -Take a look at the `.env` file and fill in the AWS S3 credentials. +Required variables: `DATABASE_URL`, `AWS_*`, `SFTP_*`, and `MODE=2` for full publish. Optional runtime controls are documented in [`.env.example`](.env.example). 
+ +**Typical weekly run** (process new SFTP files only; geocodes addresses in the staging CSV before S3 upload, then promotes and publishes): +```bash +docker compose run --rm app python oca_update.py +``` -To run the whole process in the docker container run: +**RDS geocode backfill** (on-demand; rows in `oca_addresses` where `lat IS NULL` only; does not publish public CSVs): +```bash +docker compose run --rm app python oca_geocode_backfill.py ``` -docker-compose up + +Use the same `DATABASE_URL` and `DB_SCHEMA` as weekly ETL. Optional flags: `--geocode-workers`, `--census-batch-chunk-size` (or env `GEOCODE_WORKERS`, `CENSUS_BATCH_CHUNK_SIZE`). After backfill, run view rebuild + publish separately if S3 public files must reflect new coordinates. + +**Refactor / replay run** (isolated schema and S3 prefix, force replay from S3 private backups): + +```bash +docker compose run --rm app env \ + DB_SCHEMA=refactor \ + S3_PREFIX=refactor/ \ + REPROCESS_GLOB='LandlordTenant.Incr.2025-*.zip' \ + FORCE_REPROCESS=true \ + SKIP_PUBLIC_PUBLISH=true \ + GEOCODE_WORKERS=2 \ + python oca_update.py ``` -### Jupyter notebook for maintenance +Bulk reprocess with `SKIP_PUBLIC_PUBLISH=true` updates RDS and private backups only; public S3 CSVs stay stale until you run once with `SKIP_PUBLIC_PUBLISH=false` (or unset). + +Compose reads `.env` from the repo root for `DATABASE_URL`, AWS, and SFTP. Override any variable inline with `env VAR=value ...` as above. 
-Comment out `CMD ["python", "oca_update.py"]` in the Dockerfile +Run the test suite in Docker: +```bash +docker compose run --rm app python -m unittest discover -s tests -p "test_*.py" ``` -docker-compose up -d -docker-compose exec app /bin/bash + +### Weekly scheduling and Kubernetes + +See [`docs/operations/weekly-etl-scheduling.md`](docs/operations/weekly-etl-scheduling.md) for: + +- local Docker + **cron** (weekly example), +- **Kubernetes CronJob** (`k8s/k8s-cron-job.yaml`, 2Gi memory limit, secrets via `oca-etl-secrets`), +- **AWS EventBridge + ECS Fargate** (weekly task schedule). + +Create cluster secrets from [`k8s/oca-etl-secret.example.yaml`](k8s/oca-etl-secret.example.yaml); do not commit real credentials. + +### Runtime controls + +Optional env vars (and matching `oca_update.py` CLI flags) tune isolation, replay, memory, and parse throughput. When unset, defaults preserve standard weekly behavior: new SFTP files only, `public` schema, CPU-count geocode workers. + +| Variable | Purpose | Default | +|----------|---------|---------| +| `DB_SCHEMA` | PostgreSQL `search_path` target | `public` | +| `S3_PREFIX` | Prefix for `private/` and `public/` S3 keys | none | +| `REPROCESS_GLOB` | Filename glob for S3 private zip replay | none | +| `FORCE_REPROCESS` | Replay manifest-completed glob matches | `false` | +| `SKIP_PUBLIC_PUBLISH` | Skip post-promote RDS→S3 public CSV export and SSE normalize | `false` | +| `PARSE_FAIL_FAST` | Abort before export/promote on any case-level parse failure | `false` | +| `GEOCODE_WORKERS` | Geosupport multiprocessing pool size | CPU count | +| `CENSUS_BATCH_CHUNK_SIZE` | Census batch geocoder chunk | `2500` | +| `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess / row-check chunk | `1000` | +| `PARSE_WRITE_BATCH_ENABLED` | Buffer parser DuckDB writes in txn windows | `1` (on) | +| `PARSE_WRITE_BATCH_SIZE` | Max buffered INSERTs before flush | `128` | +| `PARSE_WRITE_FLUSH_EVERY_N_CASES` | Flush cadence per parse worker | `16` 
| +| `DB_KEEPALIVES_*` | PostgreSQL TCP keepalive tuning (see `.env.example`) | RDS-friendly defaults | + +Long runs (multi-hour XML parse, S3 upload, geocoding) may idle the RDS connection; the pipeline uses TCP keepalives and automatic reconnect (`ensure_connection`) before RDS-heavy stages. Optional `DB_KEEPALIVES_IDLE` / `DB_KEEPALIVES_INTERVAL` / `DB_KEEPALIVES_COUNT` override libpq defaults. + +Use an isolated `S3_PREFIX` (e.g. `refactor/`) for refactor and end-to-end test runs so reads and writes stay out of production public paths. Memory target per job is **≤ 2 GiB**; lower `GEOCODE_WORKERS` if geocoding approaches the limit. + +### Jupyter notebook for maintenance + +Comment out `CMD ["python", "oca_update.py"]` in the Dockerfile, then: + +```bash +docker compose up -d +docker compose exec app /bin/bash jupyter notebook --allow-root --ip 0.0.0.0 --no-browser ``` diff --git a/docker-compose.yml b/docker-compose.yml index 1763d5f..2518a63 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,17 @@ services: user: 'root' volumes: - .:/app + # Keep Linux .venv out of the host bind mount (macOS .venv breaks imports in-container). 
+ - app_venv:/app/.venv + environment: + DATABASE_URL: ${DATABASE_URL} + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} + AWS_S3_BUCKET_NAME: ${AWS_S3_BUCKET_NAME} + SFTP_HOST: ${SFTP_HOST} + SFTP_USER: ${SFTP_USER} + SFTP_PSWD: ${SFTP_PSWD} + SFTP_DIR: ${SFTP_DIR} # to debug in the container uncomment below # then use use > docker compose up -d # > docker compose exec app /bin/bash @@ -13,4 +24,7 @@ services: # > jupyter notebook --allow-root --ip 0.0.0.0 --no-browser # ports: # - 8888:8888 - tty: true \ No newline at end of file + tty: true + +volumes: + app_venv: \ No newline at end of file diff --git a/dockerhub-publish.sh b/dockerhub-publish.sh new file mode 100755 index 0000000..e1c6872 --- /dev/null +++ b/dockerhub-publish.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -eo pipefail + +# --- Configuration --- +DOCKER_USER="justfix" +DOCKER_TEAM="justfixnyc" +REPO_NAME="oca" +IMAGE_TAG="latest" # Or use a dynamic tag like $1 or a version number + +FULL_IMAGE_NAME="${DOCKER_TEAM}/${REPO_NAME}:${IMAGE_TAG}" +DOCKERFILE_PATH="./Dockerfile" # Path to your Dockerfile + +# Ensure credentials are set as environment variables for security +if [ -z "$DOCKER_PASSWORD" ]; then + echo "Error: DOCKER_PASSWORD environment variable not set." + exit 1 +fi +# --------------------- + +echo "Starting Docker image build and push process..." + +# 1. Log in to Docker Hub using standard input for the password for security +echo "Logging in to Docker Hub..." +echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin +if [ $? -ne 0 ]; then + echo "Error: Docker login failed." + exit 1 +fi +echo "Successfully logged in." + +# 2. Build the Docker image +echo "Building image: ${FULL_IMAGE_NAME} from ${DOCKERFILE_PATH}..." +docker build -f "${DOCKERFILE_PATH}" -t "${FULL_IMAGE_NAME}" . +if [ $? -ne 0 ]; then + echo "Error: Docker build failed." + exit 1 +fi +echo "Successfully built image." + +# 3. 
Push the image to Docker Hub +echo "Pushing image: ${FULL_IMAGE_NAME} to Docker Hub..." +docker push "${FULL_IMAGE_NAME}" +if [ $? -ne 0 ]; then + echo "Error: Docker push failed." + exit 1 +fi +echo "Successfully pushed image to Docker Hub." + +# Optional: Log out of Docker Hub after pushing +docker logout +echo "Logged out of Docker Hub." diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md new file mode 100644 index 0000000..5c32853 --- /dev/null +++ b/docs/operations/weekly-etl-scheduling.md @@ -0,0 +1,213 @@ +# Weekly OCA ETL scheduling and deployment + +The OCA pipeline ingests new SFTP XML zip files weekly, geocodes addresses in the local staging CSV before S3 upload, promotes staging data in PostgreSQL, and publishes CSVs to S3 via `aws_s3`. All three supported schedulers run the same container entrypoint: + +```bash +python oca_update.py +``` + +Historical RDS rows that still lack coordinates are handled separately by `oca_geocode_backfill.py` (not scheduled with weekly ETL). See [RDS geocode backfill](#rds-geocode-backfill-on-demand) below. + +Cases marked deleted in OCA XML are tombstoned in `oca_metadata.deletedate` and purged from `oca_index` (and child tables) during weekly promotion. Historical orphans (tombstone without purge) are cleaned by `oca_deletion_backfill.py` — see [RDS deletion backfill](#rds-deletion-backfill-on-demand). + +Use Docker (or the published image `justfixnyc/oca:latest`) with credentials supplied via environment variables or a secret store. See [Runtime controls](#runtime-controls) and the root [README](../../README.md). 
+ +## Runtime controls + +| Variable | Purpose | Production default | +|----------|---------|-------------------| +| `MODE` | Publish mode (`2` = full S3 publish) | `2` | +| `DATABASE_URL` | PostgreSQL connection (RDS) | required | +| `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` | S3 + RDS `aws_s3` credentials | required (or IAM role on ECS) | +| `AWS_S3_BUCKET_NAME` | Target bucket | required | +| `SFTP_*` | OCA SFTP download | required | +| `DB_SCHEMA` | `search_path` schema (refactor/E2E) | empty → `public` | +| `S3_PREFIX` | Key prefix for `private/` and `public/` | empty → bucket root | +| `REPROCESS_GLOB` | Replay zip files from S3 `private/` | empty | +| `FORCE_REPROCESS` | Replay manifest-completed files | `false` | +| `PARSE_FAIL_FAST` | Fail `parse_xml` and abort before export/promote when any zip has case-level parse failures | `false` | +| `SKIP_PUBLIC_PUBLISH` | Skip post-promote public CSV export and SSE normalization (reprocess throughput) | `false` | +| `GEOCODE_WORKERS` | Geosupport pool size | CPU count | +| `CENSUS_BATCH_CHUNK_SIZE` | Census batch chunk | `2500` | +| `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess chunk | `1000` | + +Refactor and E2E runs must set `S3_PREFIX=refactor/` (or another isolated prefix) so reads/writes stay out of production public paths. + +Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if geocoding approaches the limit. + +**Do not** set `SKIP_PUBLIC_PUBLISH=true` on production weekly cron, Kubernetes CronJob, or ECS tasks. Use it only for operator-driven bulk reprocess; run a normal publish afterward so public S3 matches RDS. 
+ +**Parse failures (default lenient):** With `PARSE_FAIL_FAST=false`, weekly runs still promote and publish; zips with any `cases_failed` in manifest `etl_files.details` do **not** reach `status = 'completed'` (requires **`cases_failed = 0`**) and are omitted from `completed_reprocess_files` on later `REPROCESS_GLOB` runs (no `FORCE_REPROCESS` needed to retry them). Set `PARSE_FAIL_FAST=true` to stop the run before export/promote. + +## Publish behavior + +- **Geocode timing:** weekly runs geocode all rows in `oca_addresses_staging.csv` locally (`geocode_staging` manifest step) before uploading staging CSVs to S3. Promotion imports coordinates (and sets `geom` on `oca_addresses`); there is no post-promotion RDS geocode in `oca_etl()`. +- **Core tables:** every table in `OCA_TABLES` is exported after promotion. Selective skip per table is unsafe when `oca_index_staging` has rows: promotion deletes child rows for the batch even when a child staging CSV was empty. +- **Address views:** `create_addresses_views.sql` runs on every successful weekly publish (views only; `geom` already on the base table). +- **S3 encryption:** SSE-S3 normalization runs only on objects exported in the current run (not a full public-prefix scan). +- **Skip publish:** `SKIP_PUBLIC_PUBLISH=true` skips view rebuild, RDS exports, date badges, and SSE normalize; manifest records `publish_public` and `normalize_s3_encryption` as completed with `details.skipped=true`. + +## RDS geocode backfill (on-demand) + +Use when `oca_addresses` still has rows with `lat IS NULL` (e.g. pre-CSV-geocode history). **Not** wired into cron, Kubernetes CronJob, or ECS weekly tasks. + +```bash +docker compose run --rm app python oca_geocode_backfill.py +``` + +Same secrets as weekly ETL (`DATABASE_URL`, `DB_SCHEMA`, AWS if needed for manifest only). Tune with `GEOCODE_WORKERS` / `CENSUS_BATCH_CHUNK_SIZE` or CLI flags. + +- Selects only ungeocoded rows (`select_addresses_needing_geocode.sql`). 
+- Records manifest `mode='geocode_backfill'` with step `geocode_refresh` only. +- **Does not** run `create_addresses_views.sql` or publish public CSVs. Re-run publish (or a full `oca_update.py` publish path) if S3 must reflect backfilled coordinates. + +## RDS deletion backfill (on-demand) + +Use after deploying tombstone-aware promotion, or when validation shows case rows still present for deleted metadata. **Not** wired into weekly ETL. + +```bash +docker compose run --rm app python oca_deletion_backfill.py +``` + +Same secrets as weekly ETL (`DATABASE_URL`, `DB_SCHEMA`). Records manifest `mode='deletion_backfill'` with step `deletion_backfill`. + +- Deletes from `oca_index` only (child tables CASCADE); **`oca_metadata` rows are kept** (`deletedate` preserved). +- **Does not** publish public CSVs. Re-run weekly publish if S3 must drop deleted cases from snapshots. + +**Validation** (expect `0` after a successful backfill): + +```sql +SELECT COUNT(*)::bigint +FROM oca_index i +INNER JOIN oca_metadata m ON m.indexnumberid = i.indexnumberid +WHERE m.deletedate IS NOT NULL; +``` + +## 1. Local Docker + cron (weekly) + +Best for a single host with Docker and an `.env` file. + +**Weekly schedule example** (Saturdays 12:00 US/Eastern, same cadence as K8s manifest): + +```cron +# /etc/cron.d/oca-etl — adjust path to your clone +0 12 * * 6 root cd /path/to/oca && /usr/bin/docker compose run --rm \ + -e MODE=2 \ + -e GEOCODE_WORKERS=2 \ + app python oca_update.py >> /var/log/oca-etl.log 2>&1 +``` + +Ensure `.env` in the repo root defines `DATABASE_URL`, AWS, and SFTP variables (see `.env.example`). Do not commit `.env`. + +**Manual run:** + +```bash +docker compose run --rm app python oca_update.py +``` + +**Refactor / replay example:** + +```bash +docker compose run --rm app env \ + DB_SCHEMA=oca_refactor \ + S3_PREFIX=refactor/ \ + REPROCESS_GLOB='LandlordTenant.Incr.2025-*.zip' \ + FORCE_REPROCESS=true \ + GEOCODE_WORKERS=2 \ + python oca_update.py +``` + +## 2. 
Kubernetes CronJob (weekly) + +Manifest: [`k8s/k8s-cron-job.yaml`](../../k8s/k8s-cron-job.yaml). + +**Schedule:** `0 12 * * 6` with `timeZone: America/New_York` (weekly Saturday noon). + +**Memory:** requests `1536Mi`, limit `2Gi` (2 GB class). + +**Secrets:** create `oca-etl-secrets` before applying the CronJob (see [`k8s/oca-etl-secret.example.yaml`](../../k8s/oca-etl-secret.example.yaml)). + +```bash +kubectl apply -f k8s/oca-etl-secret.yaml # git-ignored copy of oca-etl-secret.example.yaml with placeholders replaced +kubectl apply -f k8s/k8s-cron-job.yaml +kubectl get cronjob oca-etl +``` + +Non-secret runtime knobs are set inline in the CronJob (`MODE`, `GEOCODE_WORKERS`, etc.). Override `DB_SCHEMA` / `S3_PREFIX` there for refactor jobs. + +**One-off job from the CronJob template:** + +```bash +kubectl create job --from=cronjob/oca-etl oca-etl-manual-$(date +%s) +kubectl logs -f job/oca-etl-manual-<timestamp> # use the job name printed by the create command +``` + +## 3. AWS EventBridge + ECS Fargate (weekly, non-Kubernetes) + +Use when production runs on AWS without a Kubernetes cluster. EventBridge starts an ECS task on a schedule; the task uses the same image and command as Docker/K8s. + +**High-level steps** + +1. Push `justfixnyc/oca:latest` (or your ECR mirror) and register a Fargate task definition with: + - `command`: `["python", "oca_update.py"]` + - `memory`: `2048` (hard limit, MiB) + - `cpu`: `1024` (1 vCPU; adjust if needed) + - Secrets from AWS Secrets Manager or SSM Parameter Store → container environment (same keys as `.env.example`) + - Task role: S3 access for the bucket; execution role: ECR pull + secrets +2. Create an ECS cluster; a service is optional because scheduled tasks run standalone. +3. EventBridge Scheduler schedule (weekly Saturday 12:00 ET; `scheduleExpressionTimezone` requires EventBridge Scheduler, since classic EventBridge rules evaluate cron in UTC): + +```json +{ + "scheduleExpression": "cron(0 12 ? 
* SAT *)", + "scheduleExpressionTimezone": "America/New_York", + "state": "ENABLED", + "targets": [{ + "Arn": "arn:aws:ecs:us-east-1:ACCOUNT_ID:cluster/oca-etl", + "RoleArn": "arn:aws:iam::ACCOUNT_ID:role/EventBridgeECSRunTask", + "EcsParameters": { + "TaskDefinitionArn": "arn:aws:ecs:us-east-1:ACCOUNT_ID:task-definition/oca-etl:1", + "LaunchType": "FARGATE", + "NetworkConfiguration": { + "awsvpcConfiguration": { + "subnets": ["subnet-xxx"], + "securityGroups": ["sg-xxx"], + "assignPublicIp": "DISABLED" + } + } + } + }] +} +``` + +Replace ARNs, subnets, and security groups. The task needs outbound network access for SFTP (SSH, to OCA) and HTTPS (Census geocoder, S3), plus PostgreSQL connectivity to RDS from the task subnets. + +**Environment example (task definition fragment):** + +```json +"environment": [ + { "name": "MODE", "value": "2" }, + { "name": "GEOCODE_WORKERS", "value": "2" }, + { "name": "CENSUS_BATCH_CHUNK_SIZE", "value": "2500" }, + { "name": "CSV_ROW_CHECK_CHUNK_SIZE", "value": "1000" } +], +"secrets": [ + { "name": "DATABASE_URL", "valueFrom": "arn:aws:secretsmanager:us-east-1:ACCOUNT:secret:oca-etl:DATABASE_URL::" }, + { "name": "AWS_ACCESS_KEY_ID", "valueFrom": "..." } +] +``` + +On ECS, prefer IAM task roles for S3 instead of long-lived access keys when RDS `aws_s3` integration allows it. 
+ +## Validation checklist + +- [ ] `docker compose run --rm app python -m unittest discover -s tests -p "test_*.py"` +- [ ] CronJob or ECS task memory limit ≤ 2 GiB; geocode workers tuned if OOM +- [ ] Secrets not stored in git-tracked manifests (use K8s Secret / Secrets Manager) +- [ ] Refactor runs use `S3_PREFIX=refactor/` (or dedicated prefix) + +## Related files + +- [`k8s/k8s-cron-job.yaml`](../../k8s/k8s-cron-job.yaml) — CronJob, resources, env +- [`k8s/oca-etl-secret.example.yaml`](../../k8s/oca-etl-secret.example.yaml) — secret template +- [`README.md`](../../README.md) — local setup and runtime controls diff --git a/k8s/k8s-cron-job.yaml b/k8s/k8s-cron-job.yaml new file mode 100644 index 0000000..43eb52d --- /dev/null +++ b/k8s/k8s-cron-job.yaml @@ -0,0 +1,52 @@ +# Weekly OCA ETL (Saturday 12:00 US/Eastern). See docs/operations/weekly-etl-scheduling.md +apiVersion: batch/v1 +kind: CronJob +metadata: + name: oca-etl +spec: + schedule: "0 12 * * 6" + timeZone: America/New_York + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: oca-etl + image: justfixnyc/oca:latest + imagePullPolicy: Always + command: + - python + - oca_update.py + # ~2GB class: Geosupport + DuckDB staging + geocode pools + resources: + requests: + memory: "1536Mi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "2" + envFrom: + - secretRef: + name: oca-etl-secrets + env: + - name: MODE + value: "2" + # Tune down if geocode OOMs near the 2Gi limit + - name: GEOCODE_WORKERS + value: "2" + - name: CENSUS_BATCH_CHUNK_SIZE + value: "2500" + - name: CSV_ROW_CHECK_CHUNK_SIZE + value: "1000" + # Production: leave empty. Refactor/E2E: set e.g. 
refactor/ + - name: S3_PREFIX + value: "" + - name: DB_SCHEMA + value: "" + - name: REPROCESS_GLOB + value: "" + - name: FORCE_REPROCESS + value: "false" diff --git a/k8s/oca-etl-secret.example.yaml b/k8s/oca-etl-secret.example.yaml new file mode 100644 index 0000000..b2d2da7 --- /dev/null +++ b/k8s/oca-etl-secret.example.yaml @@ -0,0 +1,27 @@ +# Example Secret for oca-etl CronJob. Do not commit real credentials. +# +# kubectl create secret generic oca-etl-secrets \ +# --from-literal=DATABASE_URL='postgresql://...' \ +# --from-literal=AWS_ACCESS_KEY_ID='...' \ +# --from-literal=AWS_SECRET_ACCESS_KEY='...' \ +# --from-literal=AWS_S3_BUCKET_NAME='oca-2-dev' \ +# --from-literal=SFTP_HOST='sftp.nycourts.gov' \ +# --from-literal=SFTP_USER='...' \ +# --from-literal=SFTP_PSWD='...' \ +# --from-literal=SFTP_DIR='139' +# +# Or apply this template after replacing placeholders: +apiVersion: v1 +kind: Secret +metadata: + name: oca-etl-secrets +type: Opaque +stringData: + DATABASE_URL: "postgresql://USER:PASSWORD@HOST:5432/DATABASE" + AWS_ACCESS_KEY_ID: "REPLACE_ME" + AWS_SECRET_ACCESS_KEY: "REPLACE_ME" + AWS_S3_BUCKET_NAME: "oca-2-dev" + SFTP_HOST: "sftp.nycourts.gov" + SFTP_USER: "REPLACE_ME" + SFTP_PSWD: "REPLACE_ME" + SFTP_DIR: "139" diff --git a/lib/README.md b/lib/README.md index 8e1643f..1ef217b 100644 --- a/lib/README.md +++ b/lib/README.md @@ -1,49 +1,121 @@ -# Code - -### `sftp.py` - -This class provides a connection to the SFTP maintained by OCA and allows us to list the available files and download selected files. - -### `s3.py` - -This class provides a connection to our Amazon S3 account where both the private raw files and public csv files are stored, and allows us to list the available files and upload new files. - -### `database.py` - -This class is adapted from [NYCDB](https://github.com/nycdb/nycdb/blob/master/src/nycdb/database.py), and provides a connection to the PostgreSQL database where the parsed files are stored. 
It includes methods to insert new rows, execute SQL files, export tables to CSV, and to create and restore from [pg_dump](https://www.postgresql.org/docs/12/app-pgdump.html) files. - -### `parsers.py` - -The final function `parse_file` takes an XML file and database connection from `database.py` and iterates over each case, parsing all the data into the various tables. - -### `utils.py` - -A few basic helper functions: - -* `make_dir` - * Create new local directories - -* `list_new_data_files` - * List files that are in the SFTP but not yet in S3 - -* `prep_db` - * Prepare the Postgres database (either from scratch with SQL scripts or from a `pg_dump` file) - -* `insert_staging_to_main` - * Move newly parsed records in the database over from staging tables to the main ones - -* `create_date_files` - * Create plain text and image files for the most recent date of the data extracts for display in this repo - -### `etl.py` - -This is the main script that does the full process. - - -### `oca_update.py` - -Finally, this file (in the top level of this repo) simply pulls environment variables from the `.env` file and runs `etl.py` to process an update to the data. - -### `geocode_record.py` - -Uses usaddress to normalize addresses before sending it off to NYC's Geosupport to get bin, bbl, community districts, census tracts, council districts, and status messages. \ No newline at end of file +# OCA ETL pipeline + +This directory contains the Extract–Transform–Load pipeline that ingests NY State housing court XML from OCA, parses it into relational tables, loads PostgreSQL on RDS, geocodes addresses, and publishes CSVs to S3. This process works with the protected address-level data ("level 2") but maintains public exports of the deidentified (zip code only, "level 1") version with the full address data kept only in secure S3 and RDS for organization under the legal agreement with OCA. 
+ +Entry points: + +- [`oca_update.py`](../oca_update.py) — weekly ETL → `oca_etl()` in [`etl.py`](etl.py) +- [`oca_geocode_backfill.py`](../oca_geocode_backfill.py) — on-demand RDS backfill (`lat IS NULL` only); not part of weekly ETL + +## Pipeline flow (weekly) + +```mermaid +flowchart TD + sftp[SFTP new zip files] --> select[Select files to process] + s3backup[S3 private backups] --> select + select --> download[Download selected zips] + download --> parse[Stream parse XML → DuckDB staging] + parse --> export[Export staging CSVs + preprocess] + export --> geo[Geocode oca_addresses_staging.csv locally] + geo --> s3upload[Upload staging CSVs to S3] + s3upload --> import[RDS import staging tables] + import --> normalize[SQL normalize + appearance outcomes] + normalize --> promote[Atomic promote staging → main] + promote --> views[create_addresses_views.sql] + views --> publish[Publish all public CSVs + date files] + publish --> enc[SSE normalize except private address CSV] + enc --> priv[Upload private XML zips] +``` + +Each weekly run is orchestrated sequentially in `oca_etl()`. There is **no** post-promotion `geocode_addresses()` on the weekly path. See [`etl_stages.py`](etl_stages.py) for stage implementations. + +## Stages + +| Stage | Module | What it does | +|-------|--------|--------------| +| Select files | `etl_file_selection.py`, `etl_stages.select_input_files` | Picks new SFTP zips and/or S3 private replays (`REPROCESS_GLOB`); skips manifest-completed files unless `FORCE_REPROCESS=true`. | +| Download | `etl_stages.download_selected_files` | New files from SFTP; replay files from S3 private backup. | +| Parse | `parsers.py`, `duckdb_database.py`, `parse_manifest.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py` with per-case windows (`begin_case` / `discard_case` on error—no partial case rows). 
Per-zip `cases_seen` / `cases_parsed_ok` / `cases_failed` (+ capped `error_samples`) on `etl_files.details`. Default lenient: promote/publish still run; `etl_files.status = 'completed'` only when **`cases_failed = 0`** after promote. `PARSE_FAIL_FAST` aborts before export/promote. Address rows have no lat/lon until CSV geocode. | +| Export staging | `etl_stages.export_staging_csvs` | DuckDB `COPY` with Postgres-compatible transforms; manifest step `export_staging`. No S3 upload yet. | +| Geocode staging | `etl_geocode.geocode_staging_addresses_csv`, `etl_stages.geocode_staging_csvs` | Geocode **every** row in `oca_addresses_staging.csv`; write `oca_addresses_staging_geocoded.csv`, copy over staging CSV; manifest step `geocode_staging`. | +| Upload staging | `etl_stages.upload_staging_csvs`, `etl_publish.list_staging_csvs_in_dir` | Upload only whitelisted `{table}_staging.csv` files (from `OCA_TABLES`); ignores geocoder temps and other junk; manifest step `upload_staging`. | +| Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, promote; batch `geom` UPDATE from lat/lon. | +| Publish public | `etl_stages.publish_public_artifacts` | `create_addresses_views.sql` (views only); export all `OCA_TABLES` and address views; upload date badge files. Skipped when `SKIP_PUBLIC_PUBLISH=true` (manifest steps recorded with `skipped` in details). | +| Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. Skipped with `SKIP_PUBLIC_PUBLISH=true`. | +| Upload private | `etl_stages.upload_private_source_files` | Back up raw XML zips to S3 `private/`. 
| + ### RDS backfill (not weekly) + +| Stage | Module | What it does | +|-------|--------|--------------| +| Geocode refresh | `etl_stages.geocode_addresses`, `oca_geocode_backfill.py` | Fetch `oca_addresses` where `lat IS NULL`; Geosupport + Census; upsert by natural address key + `geom`. Manifest step `geocode_refresh`, run `mode='geocode_backfill'`. Does not publish. | + +## Key modules + +### Connectivity + +- [`sftp.py`](sftp.py) — list and download raw XML zip files from OCA SFTP. +- [`s3.py`](s3.py) — S3 upload/download, encryption normalization (`update_encryption`). +- [`database.py`](database.py) — PostgreSQL connection (NYCDB-derived), schema `search_path`, TCP keepalives, automatic reconnect after idle drops, transactions, `aws_s3` import/export helpers. + +### Parse and local staging + +- [`parsers.py`](parsers.py) — stream case nodes from XML; per-case child table replace semantics; delete-event handling. +- [`duckdb_database.py`](duckdb_database.py) — local DuckDB staging DB; export to CSV with contract transforms. +- [`parse_write_buffer.py`](parse_write_buffer.py) — buffer INSERTs and flush in transaction windows (`PARSE_WRITE_*` env knobs). +- [`staging_csv_export.py`](staging_csv_export.py) — per-table export specs (Postgres array literals, nullable ints, appearances column rules). + +### ETL orchestration + +- [`etl.py`](etl.py) — run orchestrator, manifest lifecycle, stage sequencing. +- [`etl_constants.py`](etl_constants.py) — table list, zip patterns, S3 folder constants. +- [`etl_run_manifest.py`](etl_run_manifest.py) — `etl_runs` / `etl_files` / `etl_steps` bookkeeping. +- [`etl_helpers.py`](etl_helpers.py) — paths, CSV row checks, PLUTO download, local SVG date badge files. +- [`etl_csv.py`](etl_csv.py) — streaming CSV normalization for tables not handled at export time. +- [`etl_promotion.py`](etl_promotion.py) — atomic `promote_staging_to_main()`; count/checksum hooks for validation.
+- [`etl_publish.py`](etl_publish.py) — S3 export helpers and encryption key filtering. +- [`etl_geocode.py`](etl_geocode.py) — staging CSV geocode (`read_staging_addresses_csv`, `geocode_staging_addresses_csv`); RDS fetch/upsert for backfill. + +### Geocoding + +- [`geocode_record.py`](geocode_record.py) — address normalization (usaddress), NYC Geosupport, Census batch geocoder. + +## SQL scripts (`lib/sql/`) + +Scripts run against the active session schema (`DB_SCHEMA` / `search_path`). + +| Script | Role | +|--------|------| +| `create_tables.sql` | Non-destructive bootstrap; `oca_addresses.geom` + GIST index | +| `create_tables_staging.sql` | Per-run RDS staging tables (no `geom` on address staging) | +| `create_tables_staging_duckdb.sql` | Local DuckDB staging DDL | +| `normalize_staging_after_import.sql` | Nullable int coercion after S3 import | +| `update_appearance_outcomes.sql` | Assign `appearanceid`, expand outcomes JSON | +| `promote_staging_to_main.sql` | Staging → main promotion; batch `geom` from lat/lon | +| `ensure_promotion_indexes.sql` | Indexes for promotion and address natural keys | +| `select_addresses_needing_geocode.sql` | Backfill delta (`lat IS NULL`) | +| `create_geocode_staging_table.sql`, `upsert_geocoded_addresses.sql` | Backfill staging merge; sets `geom` | +| `create_addresses_views.sql` | Public address views (no geom DDL) | +| `create_etl_manifest_tables.sql` | Run manifest DDL | + +Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. + +## Idempotency and run control + +- **Manifest** — weekly runs record `export_staging`, `geocode_staging`, `upload_staging`, `promote_staging`, `publish_public`, `normalize_s3_encryption`, `upload_private`. Backfill runs record only `geocode_refresh`. +- **Connection resilience** — TCP keepalives and `ensure_connection()` before promote and before publish (publish connection refresh skipped when `SKIP_PUBLIC_PUBLISH=true`). 
+- **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips files in `completed_reprocess_files` (promoted with `cases_failed = 0`) unless `FORCE_REPROCESS=true`. Zips with prior case-level parse failures stay eligible for reprocess without force. +- **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. +- **Weekly geocode** — all staging CSV address rows (re-geocodes rows that already have lat/lon in the file). +- **Backfill geocode** — only `lat IS NULL` with a house number; upsert matches on address line columns, not `indexnumberid` alone. +- **Publish** — every successful weekly run exports the full public snapshot (all core tables and address views). `SKIP_PUBLIC_PUBLISH=true` is for bulk reprocess only; run a normal publish afterward so public S3 matches RDS. + +## Output tables + +Core tables (also published as public CSVs): `oca_index`, `oca_causes`, `oca_addresses`, `oca_parties`, `oca_events`, `oca_appearances`, `oca_appearance_outcomes`, `oca_motions`, `oca_decisions`, `oca_judgments`, `oca_warrants`. Defined in [`etl_constants.py`](etl_constants.py). 
+ +## Further reading + +- Root [README](../README.md) — setup, env vars, Docker invocation, backfill CLI +- [`docs/operations/weekly-etl-scheduling.md`](../docs/operations/weekly-etl-scheduling.md) — cron, Kubernetes, EventBridge/ECS +- [`docs/`](../docs/) — data dictionary links and raw XML notes diff --git a/lib/database.py b/lib/database.py index 77212cc..90243b9 100644 --- a/lib/database.py +++ b/lib/database.py @@ -1,8 +1,32 @@ -import urllib.parse +import os +from contextlib import contextmanager + import psycopg2 import psycopg2.extras -from psycopg2 import sql -import os +from psycopg2 import Error, InterfaceError, OperationalError, sql + + +def _env_int(name, default): + raw = os.environ.get(name) + if raw is None or str(raw).strip() == '': + return default + return int(raw) + + +def default_statement_timeout_ms(): + """RDS statement_timeout for long-running ETL SQL (override via DB_STATEMENT_TIMEOUT_MS).""" + return _env_int('DB_STATEMENT_TIMEOUT_MS', 3_600_000) + + +def _connect_params(): + """libpq TCP keepalive settings (override via DB_KEEPALIVES_* env).""" + return { + 'keepalives': _env_int('DB_KEEPALIVES', 1), + 'keepalives_idle': _env_int('DB_KEEPALIVES_IDLE', 60), + 'keepalives_interval': _env_int('DB_KEEPALIVES_INTERVAL', 10), + 'keepalives_count': _env_int('DB_KEEPALIVES_COUNT', 5), + } + # https://github.com/nycdb/nycdb/blob/master/src/nycdb/sql.py def insert_many(table_name, rows): @@ -22,63 +46,170 @@ def insert_many(table_name, rows): fields = ', '.join(field_names) placeholders = ', '.join(["%({})s".format(k) for k in field_names]) template = f"({placeholders})" - sql = f"INSERT INTO {table_name} ({fields}) VALUES %s" + sql_str = f"INSERT INTO {table_name} ({fields}) VALUES %s" - return sql, template + return sql_str, template # https://github.com/nycdb/nycdb/blob/master/src/nycdb/database.py class Database: """Database connection to OCA database""" - def __init__(self, db_url, autocommit = False): + def __init__(self, db_url, schema='', 
autocommit=False): self.db_url = db_url - self.conn = psycopg2.connect(db_url) + self.schema = schema + self.conn = None + self._connect() + + def _connect(self): + self.conn = psycopg2.connect(self.db_url, **_connect_params()) + if self.schema: + self.set_search_path(self.schema) + + def _close_connection(self): + if self.conn is not None: + try: + self.conn.close() + except Exception: + pass + self.conn = None + + def _connection_is_closed(self): + if self.conn is None: + return True + closed = getattr(self.conn, 'closed', None) + if isinstance(closed, bool): + return closed + if isinstance(closed, int): + return closed != 0 + return False + + def _safe_rollback(self): + if self._connection_is_closed(): + self._close_connection() + return + try: + self.conn.rollback() + except (InterfaceError, OperationalError, Error, AttributeError): + self._close_connection() + + def set_statement_timeout(self, timeout_ms=None): + timeout = timeout_ms if timeout_ms is not None else default_statement_timeout_ms() + self.sql(f"SET statement_timeout = '{timeout}'") + + def ensure_connection(self): + """Ping the connection; reconnect if the server closed an idle session. + + Returns True if a new connection was opened, False if the existing one is healthy. 
+ """ + if self._connection_is_closed(): + self._connect() + return True + try: + with self.conn.cursor() as curs: + curs.execute('SELECT 1') + return False + except Error: + try: + self.conn.rollback() + with self.conn.cursor() as curs: + curs.execute('SELECT 1') + return False + except (OperationalError, InterfaceError, Error, AttributeError): + self._close_connection() + self._connect() + return True def __exit__(self, exc_type, exc_value, traceback): - self.conn.close() + self._close_connection() - def sql(self, SQL, autocommit = False): - """ Executes single sql statement + def set_search_path(self, schema): + with self.conn.cursor() as curs: + curs.execute(sql.SQL("SET search_path TO {}, public").format(sql.Identifier(schema))) + self.conn.commit() + + def execute(self, SQL, autocommit=False): + """Execute SQL without committing (for use inside transaction blocks).""" + self.ensure_connection() + if autocommit: + self.conn.set_session(autocommit=True) - Set auto commit to run queries like VACUUM FULL [1] - [1]: https://til.codeinthehole.com/posts/about-a-gotcha-with-psycopg2s-autocommit-handling/ - """ - if autocommit: self.conn.set_session(autocommit=True) - with self.conn.cursor() as curs: curs.execute(SQL) - if autocommit: self.conn.set_session(autocommit=False) # unset - self.conn.commit() + if autocommit: + self.conn.set_session(autocommit=False) + + def sql(self, SQL, autocommit=False): + """Execute a single SQL statement and commit. 
+ + Set autocommit to run queries like VACUUM FULL [1] + [1]: https://til.codeinthehole.com/posts/about-a-gotcha-with-psycopg2s-autocommit-handling/ + """ + self.ensure_connection() + try: + self.execute(SQL, autocommit=autocommit) + self.conn.commit() + except Exception: + self._safe_rollback() + raise + + @contextmanager + def transaction(self): + """Run a block in one DB transaction; rollback on any exception.""" + self.ensure_connection() + try: + yield self + self.conn.commit() + except Exception: + self._safe_rollback() + raise def sql_fetch_one(self, SQL): + self.ensure_connection() with self.conn.cursor() as curs: curs.execute(SQL) return curs.fetchone() - def insert_rows(self, rows, table_name): - """ - Inserts many rows, all in the same transaction. - """ - + def sql_fetch_all(self, SQL): + self.ensure_connection() with self.conn.cursor() as curs: - sql_str, template = insert_many(table_name, rows) - try: - psycopg2.extras.execute_values( - curs, - sql_str, - rows, - template=template, - page_size=len(rows) - ) - except psycopg2.DataError: - print(rows) # useful for debugging - raise - self.conn.commit() + curs.execute(SQL) + return curs.fetchall() + def sql_fetch_all_from_file(self, sql_file): + file_path = os.path.join(os.path.dirname(__file__), 'sql', sql_file) + with open(file_path, 'r', encoding='utf-8') as f: + return self.sql_fetch_all(f.read()) - def execute_sql_file(self, sql_file): + def insert_rows(self, rows, table_name, page_size=1000): + """ + Inserts many rows, all in the same transaction. 
+ """ + if not rows: + return + + self.ensure_connection() + try: + with self.conn.cursor() as curs: + sql_str, template = insert_many(table_name, rows) + try: + psycopg2.extras.execute_values( + curs, + sql_str, + rows, + template=template, + page_size=min(page_size, len(rows)), + ) + except psycopg2.DataError: + print(rows) # useful for debugging + raise + self.conn.commit() + except Exception: + self._safe_rollback() + raise + + def execute_sql_file(self, sql_file, commit=True): """ Executes the provided sql file. Assumes the path is relative to ./sql @@ -86,50 +217,41 @@ def execute_sql_file(self, sql_file): file_path = os.path.join(os.path.dirname(__file__), 'sql', sql_file) with open(file_path, 'r', encoding='utf-8') as f: - self.sql(f.read()) - + sql_text = f.read() + if commit: + self.sql(sql_text) + else: + self.execute(sql_text) def export_csv(self, table_name, file_path): """ Exports tables to CSV files """ - - f = open(file_path, 'w') - - with self.conn.cursor() as curs: - curs.copy_expert(f"COPY {table_name} TO STDOUT WITH CSV HEADER", f) - - f.close() + self.ensure_connection() + with open(file_path, 'w', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f"COPY {table_name} TO STDOUT WITH CSV HEADER", f) def import_csv(self, table_name, file_path): """ Imports a CSV file to existing table """ - - f = open(file_path, 'r') - - with self.conn.cursor() as curs: - curs.copy_expert(f'COPY {table_name} FROM STDIN WITH CSV HEADER', f) + self.ensure_connection() + with open(file_path, 'r', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f'COPY {table_name} FROM STDIN WITH CSV HEADER', f) self.conn.commit() - f.close() - - def export_view_as_csv(self, table_name, file_path): """ Exports tables to CSV files """ - - f = open(file_path, 'w') - - with self.conn.cursor() as curs: - curs.copy_expert(f"COPY (SELECT * FROM {table_name}) TO STDOUT WITH CSV HEADER", f) - - f.close() + self.ensure_connection() + 
with open(file_path, 'w', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f"COPY (SELECT * FROM {table_name}) TO STDOUT WITH CSV HEADER", f) def dump_to(self, file_path): """ pg_dump the database to file """ cmd = f"pg_dump {self.db_url} -Fc > {file_path}" os.system(cmd) - def restore_from(self, file_path): """ pg_restore the database from file """ cmd = f"pg_restore -d {self.db_url} -c {file_path}" os.system(cmd) - diff --git a/lib/duckdb_database.py b/lib/duckdb_database.py index fdc5cbc..350d218 100644 --- a/lib/duckdb_database.py +++ b/lib/duckdb_database.py @@ -1,24 +1,54 @@ import duckdb import os import threading +from contextlib import contextmanager + +from .staging_csv_export import build_staging_copy_sql + +STAGING_TABLE_FAMILIES = ( + 'oca_index_staging', + 'oca_causes_staging', + 'oca_addresses_staging', + 'oca_parties_staging', + 'oca_events_staging', + 'oca_appearances_staging', + 'oca_motions_staging', + 'oca_decisions_staging', + 'oca_judgments_staging', + 'oca_warrants_staging', + 'oca_metadata_staging', +) + + +def fetch_staging_row_counts(db) -> dict[str, int]: + """Return row counts for known staging tables (missing tables -> 0).""" + counts = {} + with db._lock: + for table_name in STAGING_TABLE_FAMILIES: + try: + row = db.conn.execute(f'SELECT COUNT(*) FROM {table_name}').fetchone() + counts[table_name] = int(row[0]) if row else 0 + except Exception: + counts[table_name] = 0 + return counts + class DuckDB: - """DuckDB database helper with methods for + """DuckDB database helper with methods for exporting to csv, and running sql files and commands with thread safety""" - + def __init__(self, dbname): self.dbname = dbname self.conn = duckdb.connect(dbname) self._lock = threading.Lock() - + def execute_sql_file(self, sql_file_path): """Execute SQL commands from a file""" with open(sql_file_path, 'r') as f: sql_content = f.read() - - # Split by semicolon and execute each statement + statements = [stmt.strip() for 
stmt in sql_content.split(';') if stmt.strip()] - + with self._lock: for statement in statements: try: @@ -27,38 +57,56 @@ def execute_sql_file(self, sql_file_path): print(f"Error executing statement: {statement[:100]}...") print(f"Error: {e}") raise - + def execute(self, sql, params=None): """Execute a single SQL statement""" with self._lock: - if params: - return self.conn.execute(sql, params) - return self.conn.execute(sql) - + return self._execute_unlocked(sql, params) + + def _execute_unlocked(self, sql, params=None): + if params: + return self.conn.execute(sql, params) + return self.conn.execute(sql) + def executemany(self, sql, params_list): """Execute SQL with multiple parameter sets""" with self._lock: - return self.conn.executemany(sql, params_list) - + return self._executemany_unlocked(sql, params_list) + + def _executemany_unlocked(self, sql, params_list): + return self.conn.executemany(sql, params_list) + + @contextmanager + def transaction(self): + """Run a block in one DuckDB transaction (caller should not nest locks).""" + with self._lock: + self.conn.execute('BEGIN TRANSACTION') + try: + yield self + self.conn.execute('COMMIT') + except Exception: + self.conn.execute('ROLLBACK') + raise + def close(self): - if self.conn: self.conn.close() - + if self.conn: + self.conn.close() + def export_tables_to_csv(self, output_dir): """Export all tables to CSV files""" os.makedirs(output_dir, exist_ok=True) - + with self._lock: - # Get list of all tables tables = self.conn.execute("SHOW TABLES").fetchall() - + for table_row in tables: table_name = table_row[0] csv_path = os.path.join(output_dir, f"{table_name}.csv") - - # Export to CSV - self.conn.execute(f"COPY {table_name} TO '{csv_path}' (HEADER, DELIMITER ',')") - print(f"Exported {table_name} to {csv_path}") - # TODO: before exporting covert arrays to the postgres format, but ignore json objects - # Transform arrays: [1,2,3] -> {1,2,3} - # Ignore JSON objects: {[key: value]} -> [{key: value}] \ No newline 
at end of file + describe_rows = self.conn.execute( + f'DESCRIBE {table_name}' + ).fetchall() + columns = [(row[0], row[1]) for row in describe_rows] + copy_sql = build_staging_copy_sql(table_name, csv_path, columns) + self.conn.execute(copy_sql) + print(f"Exported {table_name} to {csv_path}") diff --git a/lib/etl.py b/lib/etl.py index aa17698..0766f83 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -1,521 +1,167 @@ -import os -import io -import shutil -import zipfile -import requests -import re -import json -from datetime import datetime -# TODO - replace os.path with Pathlib and its '/' operator -from pathlib import Path - -import numpy as np -import pandas as pd import multiprocessing -import functools -from itertools import repeat -from lxml import etree -import sys +from pathlib import Path from .database import Database from .duckdb_database import DuckDB +from .etl_constants import ( + DATA_FILENAME, + DATA_ZIPFILE_PAT, + OCA_TABLES, + S3_PRIVATE_FOLDER, + S3_PUBLIC_FOLDER, +) +from .etl_file_selection import ( + list_new_data_files, + list_reprocess_data_files, + select_data_files_to_process, +) +from .etl_helpers import ( + create_date_files, + csv_has_rows, + download_pluto, + make_dir, + s3_key, + upload_public_file, +) +from .etl_promotion import ( + promote_staging_to_main, + promotion_counts_checksum, + promotion_table_counts, +) +from .etl_run_manifest import EtlRunManifest, completed_reprocess_files, manifest_step +from .parse_manifest import ParseFailFastError, file_names_needing_reprocess +from .etl_stages import ( + FileSelection, + download_selected_files, + export_staging_csvs, + geocode_staging_csvs, + import_and_promote_staging, + normalize_public_s3_encryption, + parse_xml_to_staging, + publish_public_artifacts, + select_input_files, + upload_staging_csvs, + upload_private_source_files, +) from .s3 import S3 from .sftp import Sftp -from .parsers import oca_tag, parse_file - -from .geocode_record import geocode_record, geocode_using_census_batch 
-OCA_TABLES = [ - 'oca_index', - 'oca_causes', - 'oca_addresses', - 'oca_parties', - 'oca_events', - 'oca_appearances', - 'oca_appearance_outcomes', - 'oca_motions', - 'oca_decisions', - 'oca_judgments', - 'oca_warrants', - 'oca_metadata' +__all__ = [ + 'OCA_TABLES', + 'DATA_ZIPFILE_PAT', + 'DATA_FILENAME', + 'S3_PRIVATE_FOLDER', + 'S3_PUBLIC_FOLDER', + 's3_key', + 'make_dir', + 'list_new_data_files', + 'list_reprocess_data_files', + 'select_data_files_to_process', + 'completed_reprocess_files', + 'EtlRunManifest', + 'manifest_step', + 'csv_has_rows', + 'promote_staging_to_main', + 'promotion_table_counts', + 'promotion_counts_checksum', + 'create_date_files', + 'download_pluto', + 'upload_public_file', + 'FileSelection', + 'oca_etl', ] -DATA_ZIPFILE_PAT = r'LandlordTenant\.(Initial\.FiledIn\d{4}|Incr)\.\d{4}-\d{2}-\d{2}\.zip' - -DATA_FILENAME = 'LandlordTenantExtract.xml' - -S3_PRIVATE_FOLDER = 'private' - -S3_PUBLIC_FOLDER = 'public' - - -def make_dir(dir_name): - """ - Create a new directory in the same folder as this file, - deleting everything in the folder if it already exists - - :param dir_name: The name of the directory to be created as a string - """ - dir_path = os.path.abspath(os.path.join(os.path.dirname(__file__), dir_name)) - shutil.rmtree(dir_path, ignore_errors=True) - os.mkdir(dir_path) - return dir_path - - -def list_new_data_files(sftp, s3): - """ - Get a list of filenames for all the data files available in the SFTP - that are not already in the private S3 folder. These are the new ones - that still need to be processed. They are returned in the proper order - in which they need to be processed. 
- - :param sftp: SFTP object - :param s3: S3 object - """ - - sftp_zip_files = sftp.list_files(DATA_ZIPFILE_PAT) - s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, S3_PRIVATE_FOLDER) - new_sftp_zip_files = list(set(sftp_zip_files) - set(s3_zip_files)) - - # It's important that everything is processed in order because files - # can contain modify/delete cases included in past files - init_files = [f for f in new_sftp_zip_files if 'Initial' in f] - incr_files = [f for f in new_sftp_zip_files if 'Incr' in f] - - files = [] - files += sorted(init_files) if init_files else [] - files += sorted(incr_files) if incr_files else [] - - return files - -def prep_db(s3, db, local_dir): - """ - Create a new directory in the same folder as this file, - deleting everything in the folder if it already exists - - :param s3: S3 object - :param db: Database object - :param local_dir: Path for local directory to save database dump file - """ - if s3.list_files('oca.dump', S3_PRIVATE_FOLDER): - print('Rebuilding tables from SQL dump') - s3.download_file(f"{S3_PRIVATE_FOLDER}/oca.dump", os.path.join(local_dir, 'oca.dump')) - db.execute_sql_file('create_tables.sql') - db.restore_from(os.path.join(local_dir, 'oca.dump')) - else: - print('Creating tables from scratch') - db.execute_sql_file('create_tables.sql') - - -def insert_staging_to_main(db, tables): - """ - Delete all cases from main tables if they exist in the staging table, - then insert all records from the staging tables to the main tables - - issue: SET session_replication_role = replica - https://stackoverflow.com/questions/3942258/how-do-i-temporarily-disable-triggers-in-postgresql/18709987#18709987 - to a work around to avoid DELETE FROM command stalling. - A VACUUM FULL on all the tables were tried, it does not seem to help - Might be an issue with the staging table schema? 
- - :param db: Database object - """ - - db.sql("SET session_replication_role = replica;") - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Deleting older entries from {table}") - db.sql(f"DELETE FROM {table} WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging)") - db.sql("SET session_replication_role = default;") - - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Inserting to {table}") - db.sql(f"INSERT INTO {table} SELECT * FROM {table}_staging") - db.sql(f"DROP TABLE {table}_staging") - - -def create_date_files(s3, data_file, local_dir): - """ - Create a text file and a custom shield image with date the data was - last updated and add them to the public S3 folder. - - :param s3: S3 object - :param data_file: file path for data being processed - """ - date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) - - txt_file = os.path.join(local_dir, 'last-updated-date.txt') - open(txt_file, 'w').write(date) - - url = f"https://raster.shields.io/badge/Last%20Updated-{date.replace('-', '--')}-yellow" - r = requests.get(url) - img_file = os.path.join(local_dir, 'last-updated-shield.png') - open(img_file, 'wb').write(r.content) - - -def download_pluto(output_dir): - """ - Download and unzip PLUTO into the directory. 
- - :param output_dir: string or Path - """ - print('downloading pluto') - - # Check https://www.nyc.gov/content/planning/pages/resources/datasets/mappluto-pluto-change for updates - PLUTO_CSV_URL = 'https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/pluto/nyc_pluto_25v1_1_csv.zip' - - #download and unzip - response = requests.get(PLUTO_CSV_URL) - content = response.content - z = zipfile.ZipFile(io.BytesIO(content)) - - pluto_csv = [name for name in z.namelist() if '.csv' in name][0] - z.extract(pluto_csv, output_dir) - - # rename - pluto_file = os.path.join(output_dir, "pluto.csv") - os.rename(os.path.join(output_dir, pluto_csv), pluto_file) - - return pluto_file - - -def upload_public_file(f, pub_dir, mode, s3_args): - """ - Uploads a local file from the pub_dir folder to the S3_PUBLIC_FOLDER. - - :param f: filename - :paramp ub_dir: local path folder - :param mode: string - :param s3_args: dict/ kwargs with aws_id, aws_key aws_bucket_name - """ - s3 = S3(**s3_args) - print('-', f) - s3_filename = f - # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version - if mode == "2" and f == "oca_addresses.csv": - s3_filename = "oca_addresses_private.csv" - s3.upload_file(f"{S3_PUBLIC_FOLDER}/{s3_filename}", os.path.join(pub_dir, f)) - del s3 - -def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args): +def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None): """ Extract files from SFTP, parse cases, upload to S3 bucket """ + runtime_args = runtime_args or {} + s3_prefix = (runtime_args.get('s3_prefix') or '').strip('/') + reprocess_glob = runtime_args.get('reprocess_glob') or '' + force_reprocess = bool(runtime_args.get('force_reprocess')) + geocode_workers = runtime_args.get('geocode_workers') or multiprocessing.cpu_count() + census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 + csv_row_check_chunk_size = runtime_args.get('csv_row_check_chunk_size') or 1000 
+ parse_fail_fast = bool(runtime_args.get('parse_fail_fast')) + skip_public_publish = bool(runtime_args.get('skip_public_publish')) + db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=(runtime_args.get('db_schema') or db_args.get('schema') or 'public'), + s3_prefix=s3_prefix, + mode=mode, + reprocess_glob=reprocess_glob, + force_reprocess=force_reprocess + ) + manifest.setup_tables() + manifest.create_run() + Path('staging.duckdb').unlink(missing_ok=True) staging_db = DuckDB(dbname='staging.duckdb') sftp = Sftp(**sftp_args) s3 = S3(**s3_args) - - - - # Create local versions of folder in the S3 bucket "oca-data" - # # For debugging only -- replace with the var declarations below - # priv_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data-private')) - # pub_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data-public')) - priv_dir = make_dir('data-private') # "private/" - pub_dir = make_dir('data-public') # "public/" - - # Get list of new files to download from SFTP - new_sftp_zip_files = list_new_data_files(sftp, s3) - - # If there are no new files we can stop everything here. - if not new_sftp_zip_files: - print('No new files to download from SFTP. Stopping process.') + priv_dir = make_dir('data-private') + pub_dir = make_dir('data-public') + selection = None + + try: + selection = select_input_files( + manifest, db, sftp, s3, s3_prefix, reprocess_glob, force_reprocess + ) + if not selection.selected_zip_files: + print('No files selected for processing. 
Stopping process.') + manifest.mark_run_completed(0, 0, len(selection.skipped_reprocess_files)) + return True + + download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection) + parse_xml_to_staging(manifest, staging_db, priv_dir, parse_fail_fast=parse_fail_fast) + export_staging_csvs( + manifest, staging_db, pub_dir, + csv_preprocess_chunk_size=csv_row_check_chunk_size, + ) + geocode_staging_csvs( + manifest, pub_dir, geocode_workers, census_batch_chunk_size, + ) + upload_staging_csvs(manifest, pub_dir, mode, s3_args, s3_prefix) + db.ensure_connection() + import_and_promote_staging( + manifest, + db, + pub_dir, + s3_args, + s3_prefix, + selection, + runtime_args.get('db_schema') or db_args.get('schema') or 'public', + ) + if skip_public_publish: + print('Skipping public publish (SKIP_PUBLIC_PUBLISH)') + skip_details = {'skipped': True, 'reason': 'SKIP_PUBLIC_PUBLISH'} + manifest.upsert_step('publish_public', 'completed', details=skip_details) + manifest.upsert_step('normalize_s3_encryption', 'completed', details=skip_details) + else: + db.ensure_connection() + published_keys = publish_public_artifacts( + manifest, db, s3_args, s3_prefix, mode, selection, pub_dir, + ) + normalize_public_s3_encryption(manifest, s3, published_keys) + upload_private_source_files(manifest, s3, priv_dir, s3_prefix) + + files_needing_reprocess = file_names_needing_reprocess(manifest.file_details_by_name) + processed_count = len(selection.selected_zip_files) - len(files_needing_reprocess) + manifest.mark_run_completed( + len(selection.selected_zip_files), + processed_count, + len(selection.skipped_reprocess_files), + files_needing_reprocess=files_needing_reprocess or None, + ) return True - - # If there are new files, download them. 
- print('Downloading new files from SFTP:') - for f in new_sftp_zip_files: - print('-', f) - sftp.download_files(f, priv_dir) - - # Sort zipfiles by date - def sort_by_date(file): - r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.',' ') - return r - local_zip_files = sorted([os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')], key = sort_by_date) - - # Rebuild the staging tables - # Then for each zipfile, unzip the XML file and - # parse it into the staging tables - print(' - Creating staging tables...') - staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') - print('Processing files:') - for zip_file in local_zip_files: - print('-', os.path.basename(zip_file)) - print(' - Parsing XML file...') - # takes about 4-5 minutes per xml - extract_date = None - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - for _, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')): - # Grab the first date and break - if not extract_date: - extract_date = elem.text - break - - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - parse_file(xml_file, staging_db, extract_date) - - # export staging tables to the pub_dir, upload to s3, and then rds - staging_db.export_tables_to_csv(output_dir=pub_dir) - - def preprocess_csvs(pub_dir): - """Convert all CSV files from DuckDB to PostgreSQL array format; and make small corrections (todo fix this in the parser/ duckdb export)""" - for filename in os.listdir(pub_dir): - if filename.endswith('.csv'): - file_path = os.path.join(pub_dir, filename) - df = pd.read_csv(file_path) - for col in df.columns: - if df[col].dtype == 'object': - # Convert arrays: [anything] -> {anything} - # But only if the content doesn't contain JSON objects - def replace_brackets(text): - if pd.isna(text) or not isinstance(text, str): - return text - - if text.startswith('[') and text.endswith(']'): - inner_content = text[1:-1].strip() - # Don't replace if the inner content is 
wrapped in {} - # Todo: fix appearanceoutcomes that are blank [] ... they are still converted to {} - if inner_content.startswith('{') and inner_content.endswith('}'): - return text # Keep original - it's [{}] format - else: - return '{' + text[1:-1] + '}' # Convert [] to {} - - return text - - df[col] = df[col].apply(replace_brackets) - - if filename.startswith('oca_appearances'): - # remove the appearanceid column, BIGSERIAL is assigned in postgres - if 'appearanceid' in df.columns: del df['appearanceid'] - # change motionsequence to a int instead of a float - df['motionsequence'] = df['motionsequence'].astype('Int64') - - if filename.startswith('oca_judgments'): - df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') - - if filename.startswith('oca_warrants'): - df['executionstayeddays'] = df['executionstayeddays'].astype('Int64') - df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') - - df.to_csv(file_path, index=False) - - print('Convert csvs:') - preprocess_csvs(pub_dir) - staging_tables = [t + '_staging' for t in OCA_TABLES] - public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args)) - pool.starmap(upload_public_file, files_zip) - - # reset staging tables then import from s3 to rds - db.execute_sql_file('create_tables_staging.sql') - for t in staging_tables: - print('-', f"{t} table to db") - # only import to the rds, if the local csv has rows - csv_filepath = os.path.join(pub_dir, f"{t}.csv") - if len(pd.read_csv(csv_filepath)): - columns = '' - # ignore the appearanceid column - if t == 'oca_appearances_staging': - columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' - db.sql(f""" - SELECT aws_s3.table_import_from_s3( - '{t}', '{columns}', '(FORMAT CSV, 
HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/{t}.csv', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) - - # expand appearance_outcomes from json - print('\n - Updating appearance outcomes...') - db.execute_sql_file('update_appearance_outcomes.sql') - - print('\n - Inserting from staging to main ...') - # moves records from staging tables to the main tables, skips oca_metadata - insert_staging_to_main(db, OCA_TABLES) - - # Merging in oca_metadata using case if logic - print('\n - Update metadata in main ...') - db.execute_sql_file('update_metadata.sql') - - - # Export the rds tables to csv files directly into the s3 bucket - for t in OCA_TABLES: - print('-', f'{t} table from db to s3') - # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version - s3_filename = t + '.csv' - if t == "oca_addresses": - s3_filename = "oca_addresses_private.csv" - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from {t}', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/{s3_filename}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - - # Export oca_addresses_private.csv to pub_dir to geocode - csv_filepath = os.path.join(pub_dir, f"oca_addresses_private.csv") - db.export_csv('oca_addresses', csv_filepath) - - input_csv = Path(pub_dir) / 'oca_addresses_private.csv' - output_csv = Path(pub_dir) /'oca_addresses_private.csv' - addr_cols = ['street1', 'city', 'postalcode'] - - #keep all cols - keep_cols = lambda x: x - - df = pd.read_csv( - input_csv, - dtype = str, - index_col = False, - usecols=keep_cols, - keep_default_na=False - ) - - #filter for only records that need to be geocoded - df_1 = df[ - ((pd.isna(df['lat'])) | (df['lat'] == '')) & - ((df['house_number'] != '') | (pd.notna(df['house_number']))) - ].copy().reset_index() - - # # DEBUG: geocode all records - # df_1 = df - - print(f'Geocoding 
{len(df_1)} entries in {output_csv}.') - - records = df_1.to_dict('records') - - # Geocode records using NYC GeoSupport - # TODO - check if pluto in the database matches the pluto version of the geosupport - # TODO - adjust geocode to put lat/lng on the lot centroid? instead of the centerline/sidewalk - with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool: - it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=addr_cols), records, 10000)) - - del df_1 # delete unused objects to avoid docker's memory error / 137 - del records - - # Geocode other records using the US Batch Census Geocoder - # Sub-select for all addresses that are missing latitude; also needs to have a house number - df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() - print(f'Geocoding {len(df_2)} entries in {output_csv} using another geocoder. {datetime.now()}') - - # For debugging only - # --- - # data_split = np.split(df_2, range(chunk_size, df_2.shape[0], 10000)) - # geocode_using_census_batch(data_split[2], pub_dir) - - with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: - chunk_size = 2500 # census batch limit is 10,000. Smaller batches tend to work better - data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) - it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) - del df_2 - del data_split - - print(f'Done geocoding. 
{datetime.now()}') - # Concat and drop duplicates by keeping the last changes from US Batch Census Geocoder (overwrites the GeoSupport returns - export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', - 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', - 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', - 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] - concat = pd.concat([df, it, it_2], ignore_index = True).drop_duplicates(subset=['indexnumberid'], ignore_index = True, keep = 'last')[export_cols] - del df - del it - del it_2 - pd.DataFrame(concat).to_csv(output_csv, index=False) - del concat - - # # reset connection to s3 - # s3 = S3(**s3_args) - - # Update "last updated date" files on S3 for the latest file processed - create_date_files(s3, new_sftp_zip_files[-1], pub_dir) - - print('Uploading public files to S3:') - public_files = [i for i in os.listdir(pub_dir) - if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args)) - pool.starmap(upload_public_file, files_zip) - - # # Create/upload a dump of the database as a backup - # print('Creating database dump and uploading to s3') - # db.dump_to(os.path.join(priv_dir, 'oca.dump')) - - # Upload raw data files and database dump to private folder in S3 bucket - print('Uploading private files to S3:') - for f in os.listdir(priv_dir): - if f != '.DS_Store': - print('-', f) - s3.upload_file(f"{S3_PRIVATE_FOLDER}/{f}", os.path.join(priv_dir, f)) - - # reset oca_addresses (removes geom), and uses the geocoded s3 import to overwrite oca_addresses table - print('-', f'overwrite oca_addresses with geocoded version') - db.execute_sql_file('reset_addresses_table.sql') - db.sql(f""" - SELECT aws_s3.table_import_from_s3( - 'oca_addresses', '', '(FORMAT CSV, 
HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_private.csv', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) # TODO: replace with similar sql query as update_metadata.sql to reduce the time this takes (10 mins) - - # # setup pluto if it does not exist - # # # TODO: setup census tracts if it does not exist - # if not db.sql_fetch_one( - # "SELECT * FROM information_schema.tables WHERE table_name = 'pluto'"): - # pluto_file = download_pluto(pub_dir) - - - # print('uploading pluto to s3') - # s3.upload_file(f"{S3_PUBLIC_FOLDER}/pluto.csv", pluto_file) - - # print('importing pluto to db') - # db.execute_sql_file('create_pluto_table.sql') - - # db.sql(f""" - # SELECT aws_s3.table_import_from_s3( - # 'pluto', '', '(FORMAT CSV, HEADER)', - # aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/pluto_24v2.csv', 'us-east-1'), - # aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - # ); - # """) - - # db.execute_sql_file('alter_pluto_table.sql') - - - # create views and grant access to folks - db.execute_sql_file('create_addresses_views.sql') - - # export views directly to s3, each takes 1-2 minutes - print(f"Creating oca_addresses_with_bbl and exporting to S3") - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_bbl', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_with_bbl.csv', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - - print(f"Creating oca_addresses_with_ct and exporting to S3") - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_ct', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_with_ct.csv', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - - # add level-1 version of address table from level-2 data and maintain consistent name - 
print(f"Creating oca_addresses_public and exporting to S3") - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_public', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses.csv', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) \ No newline at end of file + except Exception as exc: + if selection and selection.selected_zip_files and not isinstance(exc, ParseFailFastError): + for selected_name in selection.selected_zip_files: + source = 'sftp' if selected_name in selection.new_file_set else 's3_private' + manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) + manifest.mark_run_failed(exc) + raise diff --git a/lib/etl_constants.py b/lib/etl_constants.py new file mode 100644 index 0000000..df5d17f --- /dev/null +++ b/lib/etl_constants.py @@ -0,0 +1,22 @@ +OCA_TABLES = [ + 'oca_index', + 'oca_causes', + 'oca_addresses', + 'oca_parties', + 'oca_events', + 'oca_appearances', + 'oca_appearance_outcomes', + 'oca_motions', + 'oca_decisions', + 'oca_judgments', + 'oca_warrants', + 'oca_metadata' +] + +DATA_ZIPFILE_PAT = r'LandlordTenant\.(Initial\.FiledIn\d{4}|Incr)\.\d{4}-\d{2}-\d{2}\.zip' + +DATA_FILENAME = 'LandlordTenantExtract.xml' + +S3_PRIVATE_FOLDER = 'private' + +S3_PUBLIC_FOLDER = 'public' diff --git a/lib/etl_csv.py b/lib/etl_csv.py new file mode 100644 index 0000000..2dea8c1 --- /dev/null +++ b/lib/etl_csv.py @@ -0,0 +1,116 @@ +"""Constant-memory CSV preprocessing for DuckDB staging exports.""" + +import csv +import os + +from .staging_csv_export import staging_csv_needs_preprocess + +_APPEARANCES_PREFIX = 'oca_appearances_staging' +_JUDGMENTS_PREFIX = 'oca_judgments_staging' +_WARRANTS_PREFIX = 'oca_warrants_staging' + +_EMPTY_INT_MARKERS = frozenset({'', 'nan', 'NaN', 'None', ''}) + + +def replace_postgres_array_brackets(text): + """ + Convert DuckDB-style array literals ``[a,b]`` to PostgreSQL ``{a,b}``. 
+ + JSON object arrays (inner ``{...}``) are left unchanged. + """ + if not text or not isinstance(text, str): + return text + stripped = text.strip() + if not (stripped.startswith('[') and stripped.endswith(']')): + return text + inner = stripped[1:-1].strip() + if inner.startswith('{') and inner.endswith('}'): + return text + return '{' + stripped[1:-1] + '}' + + +def _normalize_int_cell(value): + if value is None: + return '' + if isinstance(value, str) and value.strip() in _EMPTY_INT_MARKERS: + return '' + return value + + +def _preprocess_row(filename, fieldnames, int_columns, row): + out = {} + for col in fieldnames: + value = row.get(col, '') + if col in int_columns: + out[col] = _normalize_int_cell(value) + elif isinstance(value, str): + out[col] = replace_postgres_array_brackets(value) + else: + out[col] = value + return out + + +def _file_preprocess_rules(filename): + drop_columns = set() + int_columns = set() + if filename.startswith(_APPEARANCES_PREFIX): + drop_columns.add('appearanceid') + int_columns.add('motionsequence') + elif filename.startswith(_JUDGMENTS_PREFIX): + int_columns.add('amendedfromjudgmentsequence') + elif filename.startswith(_WARRANTS_PREFIX): + int_columns.update(('executionstayeddays', 'issuancestayeddays')) + return drop_columns, int_columns + + +def preprocess_csv_file(file_path, chunk_size=1000): + """ + Rewrite one staging CSV in place using bounded memory. + + ``chunk_size`` controls how many rows are buffered before writing; it does + not load the full file into a DataFrame. 
+ """ + filename = os.path.basename(file_path) + if not filename.endswith('.csv'): + return 0 + + drop_columns, int_columns = _file_preprocess_rules(filename) + tmp_path = f'{file_path}.tmp' + rows_touched = 0 + + with open(file_path, newline='', encoding='utf-8') as infile, open( + tmp_path, 'w', newline='', encoding='utf-8' + ) as outfile: + reader = csv.DictReader(infile) + if not reader.fieldnames: + os.remove(tmp_path) + return 0 + + fieldnames = [name for name in reader.fieldnames if name not in drop_columns] + writer = csv.DictWriter(outfile, fieldnames=fieldnames, lineterminator='\n') + writer.writeheader() + + batch = [] + for row in reader: + batch.append(_preprocess_row(filename, fieldnames, int_columns, row)) + rows_touched += 1 + if len(batch) >= chunk_size: + writer.writerows(batch) + batch.clear() + if batch: + writer.writerows(batch) + + os.replace(tmp_path, file_path) + return rows_touched + + +def preprocess_staging_csv_dir(target_dir, chunk_size=1000): + for filename in sorted(os.listdir(target_dir)): + if not filename.endswith('.csv'): + continue + if not staging_csv_needs_preprocess(filename): + continue + preprocess_csv_file( + os.path.join(target_dir, filename), + chunk_size=chunk_size, + ) diff --git a/lib/etl_file_selection.py b/lib/etl_file_selection.py new file mode 100644 index 0000000..4031df9 --- /dev/null +++ b/lib/etl_file_selection.py @@ -0,0 +1,55 @@ +import fnmatch + +from .etl_constants import DATA_ZIPFILE_PAT, S3_PRIVATE_FOLDER +from .etl_helpers import s3_key + + +def list_new_data_files(sftp, s3, s3_prefix=''): + """ + Get a list of filenames for all the data files available in the SFTP + that are not already in the private S3 folder. These are the new ones + that still need to be processed. They are returned in the proper order + in which they need to be processed. 
+ + :param sftp: SFTP object + :param s3: S3 object + """ + + sftp_zip_files = sftp.list_files(DATA_ZIPFILE_PAT) + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) + new_sftp_zip_files = list(set(sftp_zip_files) - set(s3_zip_files)) + + # It's important that everything is processed in order because files + # can contain modify/delete cases included in past files + init_files = [f for f in new_sftp_zip_files if 'Initial' in f] + incr_files = [f for f in new_sftp_zip_files if 'Incr' in f] + + files = [] + files += sorted(init_files) if init_files else [] + files += sorted(incr_files) if incr_files else [] + + return files + + +def list_reprocess_data_files(s3, reprocess_glob, s3_prefix=''): + if not reprocess_glob: + return [] + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) + return sorted([f for f in s3_zip_files if fnmatch.fnmatch(f, reprocess_glob)]) + + +def select_data_files_to_process(new_files, reprocess_files, force_reprocess=False): + def ordered(files): + init_files = sorted([f for f in files if 'Initial' in f]) + incr_files = sorted([f for f in files if 'Incr' in f]) + return init_files + incr_files + + if not reprocess_files: + return ordered(new_files) + + if not force_reprocess: + # Keep backward-compatible default behavior unless force mode is explicitly set. 
+ return ordered(new_files) + + merged = set(new_files) | set(reprocess_files) + return ordered(merged) diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py new file mode 100644 index 0000000..12317b2 --- /dev/null +++ b/lib/etl_geocode.py @@ -0,0 +1,251 @@ +import csv +import functools +import multiprocessing +import os +import shutil +from itertools import repeat + +import numpy as np +import pandas as pd + +from .geocode_record import geocode_record, geocode_using_census_batch, suppress_geosupport_logging + +ADDRESS_ROW_KEY_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', +] + +GEOCODE_ADDRESS_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', + 'status', 'house_number', 'street_name', 'borough_code', 'place_name', + 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', + 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code', +] + +GEOCODE_EXPORT_COLUMNS = GEOCODE_ADDRESS_COLUMNS + +STAGING_ADDRESSES_CSV = 'oca_addresses_staging.csv' +GEOCODED_STAGING_ADDRESSES_CSV = 'oca_addresses_staging_geocoded.csv' + + +def _stringify_row_values(row): + normalized = {} + for key, value in row.items(): + if value is None: + normalized[key] = '' + elif isinstance(value, float) and np.isnan(value): + normalized[key] = '' + else: + normalized[key] = str(value) + return normalized + + +def _has_lat(value): + if value is None: + return False + text = str(value).strip() + return text != '' and text.lower() != 'nan' + + +def address_row_key(row): + """Stable per-address identity for merge/upsert (ingest columns only).""" + parts = [] + for col in ADDRESS_ROW_KEY_COLUMNS: + value = row.get(col) + if value is None: + parts.append('') + elif isinstance(value, float) and np.isnan(value): + parts.append('') + else: + parts.append(str(value)) + return tuple(parts) + + +def row_needs_geocode(row): + """Mirror select_addresses_needing_geocode.sql for unit tests.""" + return not _has_lat(row.get('lat')) + + +def 
_rows_from_fetchall(rows): + return [ + _stringify_row_values(dict(zip(GEOCODE_ADDRESS_COLUMNS, row))) + for row in rows + ] + + +def fetch_addresses_needing_geocode(db): + rows = db.sql_fetch_all_from_file('select_addresses_needing_geocode.sql') + return _rows_from_fetchall(rows) + + +def _prepare_rows_for_db(rows): + prepared = [] + for row in rows: + db_row = {} + for col in GEOCODE_EXPORT_COLUMNS: + value = row.get(col, '') + if col in ('lat', 'lon') and not _has_lat(value): + db_row[col] = None + else: + db_row[col] = value if value != '' else None + prepared.append(db_row) + return prepared + + +def _init_geosupport_worker(): + suppress_geosupport_logging() + + +def _geosupport_worker(record): + return geocode_record(record, addr_cols=['street1', 'city', 'postalcode']) + + +def _run_geosupport(records, geocode_workers, geocode_record_fn): + geocode_one = functools.partial( + geocode_record_fn, + addr_cols=['street1', 'city', 'postalcode'], + ) + use_pool = geocode_record_fn is geocode_record + if not use_pool: + return [geocode_one(record) for record in records] + + suppress_geosupport_logging() + worker_count = min(geocode_workers, multiprocessing.cpu_count()) + with multiprocessing.Pool( + processes=worker_count, + initializer=_init_geosupport_worker, + ) as pool: + return pool.map(_geosupport_worker, records, 10000) + + +def _run_census_batch(still_missing, census_batch_chunk_size, pub_dir, geocode_using_census_batch_fn): + if not still_missing: + return [] + + use_pool = geocode_using_census_batch_fn is geocode_using_census_batch + chunk_size = census_batch_chunk_size + df_missing = pd.DataFrame(still_missing) + splits = list(np.split(df_missing, range(chunk_size, df_missing.shape[0], chunk_size))) + + if not use_pool: + return [geocode_using_census_batch_fn(chunk, pub_dir) for chunk in splits] + + census_pool_workers = min(5, multiprocessing.cpu_count()) + data_split = zip(splits, repeat(pub_dir)) + with 
multiprocessing.Pool(processes=census_pool_workers) as pool: + return pool.starmap(geocode_using_census_batch_fn, data_split) + + +def geocode_candidate_records( + records, + geocode_workers, + census_batch_chunk_size, + pub_dir, + geocode_record_fn=geocode_record, + geocode_using_census_batch_fn=geocode_using_census_batch, +): + if not records: + return [] + + print(f'Geocoding {len(records)} addresses using Geosupport') + geosupport_results = _run_geosupport(records, geocode_workers, geocode_record_fn) + + still_missing = [row for row in geosupport_results if not _has_lat(row.get('lat'))] + if not still_missing: + return geosupport_results + + print(f'Geocoding {len(still_missing)} addresses using Census batch') + census_chunks = _run_census_batch( + still_missing, + census_batch_chunk_size, + pub_dir, + geocode_using_census_batch_fn, + ) + if census_chunks: + census_results = pd.concat(census_chunks, ignore_index=True).to_dict('records') + else: + census_results = [] + + by_key = {address_row_key(row): row for row in geosupport_results} + for row in census_results: + by_key[address_row_key(row)] = row + return [by_key[address_row_key(row)] for row in records] + + +def upsert_geocoded_addresses(db, rows): + if not rows: + return 0 + + # Large backfills can exceed default RDS statement_timeout on staging insert + merge. 
+ db.set_statement_timeout() + db.execute_sql_file('create_geocode_staging_table.sql') + db.insert_rows(_prepare_rows_for_db(rows), 'oca_addresses_geocode_staging') + db.execute_sql_file('upsert_geocoded_addresses.sql') + return len(rows) + + +def read_staging_addresses_csv(pub_dir): + """Read ``oca_addresses_staging.csv`` rows as string-normalized dicts.""" + path = os.path.join(pub_dir, STAGING_ADDRESSES_CSV) + if not os.path.exists(path): + return [], [] + + with open(path, 'r', encoding='utf-8', newline='') as handle: + reader = csv.DictReader(handle) + fieldnames = list(reader.fieldnames or []) + rows = [_stringify_row_values(row) for row in reader] + return rows, fieldnames + + +def _merge_geocoded_row(original, geocoded): + merged = dict(original) + for col in GEOCODE_ADDRESS_COLUMNS: + if col in geocoded: + merged[col] = geocoded[col] + return merged + + +def write_geocoded_staging_csv(pub_dir, rows, fieldnames, dest_filename=GEOCODED_STAGING_ADDRESSES_CSV): + """Write geocoded address rows to a staging CSV (default: intermediate geocoded file).""" + path = os.path.join(pub_dir, dest_filename) + if not fieldnames: + fieldnames = list(GEOCODE_ADDRESS_COLUMNS) + + with open(path, 'w', encoding='utf-8', newline='') as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction='ignore') + writer.writeheader() + for row in rows: + writer.writerow({col: row.get(col, '') for col in fieldnames}) + + +def geocode_staging_addresses_csv( + pub_dir, + geocode_workers, + census_batch_chunk_size, + geocode_record_fn=geocode_record, + geocode_using_census_batch_fn=geocode_using_census_batch, +): + """ + Geocode every row in ``oca_addresses_staging.csv``, write + ``oca_addresses_staging_geocoded.csv``, then overwrite the staging file. 
+ """ + rows, fieldnames = read_staging_addresses_csv(pub_dir) + if not rows: + return 0 + + geocoded_rows = geocode_candidate_records( + rows, + geocode_workers, + census_batch_chunk_size, + pub_dir, + geocode_record_fn=geocode_record_fn, + geocode_using_census_batch_fn=geocode_using_census_batch_fn, + ) + merged_rows = [ + _merge_geocoded_row(original, geocoded) + for original, geocoded in zip(rows, geocoded_rows) + ] + write_geocoded_staging_csv(pub_dir, merged_rows, fieldnames) + staging_path = os.path.join(pub_dir, STAGING_ADDRESSES_CSV) + geocoded_path = os.path.join(pub_dir, GEOCODED_STAGING_ADDRESSES_CSV) + shutil.copy2(geocoded_path, staging_path) + return len(merged_rows) diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py new file mode 100644 index 0000000..61fc485 --- /dev/null +++ b/lib/etl_helpers.py @@ -0,0 +1,119 @@ +import csv +import io +import os +import re +import shutil +import zipfile + +import requests + +from .etl_constants import S3_PRIVATE_FOLDER, S3_PUBLIC_FOLDER +from .s3 import S3 + + +def s3_key(path, s3_prefix=''): + normalized_path = path.lstrip('/') + if not s3_prefix: + return normalized_path + normalized_prefix = s3_prefix.strip('/') + return f"{normalized_prefix}/{normalized_path}" + + +def make_dir(dir_name): + """ + Create a new directory in the same folder as this file, + deleting everything in the folder if it already exists + + :param dir_name: The name of the directory to be created as a string + """ + dir_path = os.path.abspath(os.path.join(os.path.dirname(__file__), dir_name)) + shutil.rmtree(dir_path, ignore_errors=True) + os.mkdir(dir_path) + return dir_path + + +def csv_has_rows(csv_filepath): + """Return True when the CSV has at least one data row (header excluded).""" + with open(csv_filepath, newline='', encoding='utf-8') as csv_file: + reader = csv.reader(csv_file) + try: + next(reader) + except StopIteration: + return False + for _ in reader: + return True + return False + + +def _last_updated_badge_svg(date): + 
label = f'Last Updated: {date}' + return f''' + + + {label} + +''' + + +def create_date_files(data_file, local_dir): + """ + Create a text file and a local SVG badge with the data last-updated date. + + :param data_file: file path for data being processed + :param local_dir: path for local directory to save date files + """ + date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) + + txt_file = os.path.join(local_dir, 'last-updated-date.txt') + with open(txt_file, 'w', encoding='utf-8') as handle: + handle.write(date) + + svg_file = os.path.join(local_dir, 'last-updated-shield.svg') + with open(svg_file, 'w', encoding='utf-8') as handle: + handle.write(_last_updated_badge_svg(date)) + + +def download_pluto(output_dir): + """ + Download and unzip PLUTO into the directory. + + :param output_dir: string or Path + """ + print('downloading pluto') + + # Check https://www.nyc.gov/content/planning/pages/resources/datasets/mappluto-pluto-change for updates + PLUTO_CSV_URL = 'https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/pluto/nyc_pluto_25v1_1_csv.zip' + + #download and unzip + response = requests.get(PLUTO_CSV_URL) + content = response.content + z = zipfile.ZipFile(io.BytesIO(content)) + + pluto_csv = [name for name in z.namelist() if '.csv' in name][0] + z.extract(pluto_csv, output_dir) + + # rename + pluto_file = os.path.join(output_dir, "pluto.csv") + os.rename(os.path.join(output_dir, pluto_csv), pluto_file) + + return pluto_file + + +def upload_public_file(f, pub_dir, mode, s3_args, s3_prefix=''): + """ + Uploads a local file from the pub_dir folder to the S3_PUBLIC_FOLDER. 
+ + :param f: filename + :paramp ub_dir: local path folder + :param mode: string + :param s3_args: dict/ kwargs with aws_id, aws_key aws_bucket_name + """ + s3 = S3(**s3_args) + print('-', f) + s3_filename = f + # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version + if mode == "2" and f == "oca_addresses.csv": + s3_filename = "oca_addresses_private.csv" + s3.upload_file(s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix), os.path.join(pub_dir, f)) + del s3 diff --git a/lib/etl_promotion.py b/lib/etl_promotion.py new file mode 100644 index 0000000..257fc4a --- /dev/null +++ b/lib/etl_promotion.py @@ -0,0 +1,49 @@ +import hashlib +import json + +from .etl_constants import OCA_TABLES + +PROMOTION_SQL_FILE = 'promote_staging_to_main.sql' +PROMOTION_INDEX_SQL_FILE = 'ensure_promotion_indexes.sql' +PURGE_TOMBSTONED_CASES_SQL_FILE = 'purge_tombstoned_cases.sql' + +# Tables promoted via promote_staging_to_main.sql (oca_metadata merged in-SQL). +PROMOTED_TABLES = [t for t in OCA_TABLES if t != 'oca_metadata'] + +ADDRESS_NATURAL_KEY_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', +] + + +def ensure_promotion_indexes(db): + """Create indexes used by scoped promotion deletes when staging tables exist.""" + db.execute_sql_file(PROMOTION_INDEX_SQL_FILE, commit=False) + + +def promotion_table_counts(db, tables=None): + """Return row counts per main table (checksum hook for validation).""" + tables = tables or OCA_TABLES + counts = {} + for table in tables: + row = db.sql_fetch_one(f'SELECT COUNT(*)::bigint FROM {table}') + counts[table] = int(row[0]) if row else 0 + return counts + + +def promotion_counts_checksum(counts): + """Stable checksum string for comparing promotion snapshots.""" + payload = json.dumps(counts, sort_keys=True) + return hashlib.sha256(payload.encode('utf-8')).hexdigest() + + +def promote_staging_to_main(db): + """ + Promote all populated staging tables to main in one 
transaction. + + On failure, PostgreSQL rolls back deletes/inserts/metadata merge and staging + drops so a retry can re-import or re-run promotion safely. + """ + with db.transaction(): + ensure_promotion_indexes(db) + db.execute_sql_file(PROMOTION_SQL_FILE, commit=False) + diff --git a/lib/etl_publish.py b/lib/etl_publish.py new file mode 100644 index 0000000..bd5a77a --- /dev/null +++ b/lib/etl_publish.py @@ -0,0 +1,78 @@ +"""S3 publish helpers: exports and targeted post-publish encryption.""" + +import os + +from .etl_constants import OCA_TABLES, S3_PUBLIC_FOLDER +from .etl_helpers import csv_has_rows, s3_key + +PRIVATE_ADDRESS_CSV = 'oca_addresses_private.csv' + + +def staging_csv_filenames(): + """Expected local/S3 filenames for RDS staging import (one per OCA table).""" + return [f"{table}_staging.csv" for table in OCA_TABLES] + + +def list_staging_csvs_in_dir(pub_dir): + """Staging CSVs present in ``pub_dir``; whitelist only (ignores geocoder temps, etc.).""" + return sorted( + name for name in staging_csv_filenames() + if os.path.isfile(os.path.join(pub_dir, name)) + ) + + +def staging_tables_with_rows(pub_dir): + """Main table names whose staging CSV had at least one data row this run.""" + tables = [] + for table in OCA_TABLES: + csv_path = os.path.join(pub_dir, f"{table}_staging.csv") + if os.path.isfile(csv_path) and csv_has_rows(csv_path): + tables.append(table) + return set(tables) + + +def export_table_to_s3(db, table, s3_filename, s3_args, s3_prefix): + """Export one main table via aws_s3.query_export_to_s3; return the object key.""" + object_key = s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix) + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from {table}', + aws_commons.create_s3_uri( + '{s3_args["aws_bucket_name"]}', + '{object_key}', + 'us-east-1' + ), + options :='FORMAT CSV, HEADER'); + """) + return object_key + + +ADDRESS_VIEW_EXPORTS = ( + ('oca_addresses_with_bbl', 'oca_addresses_with_bbl.csv'), + 
('oca_addresses_with_ct', 'oca_addresses_with_ct.csv'), + ('oca_addresses_public', 'oca_addresses.csv'), +) + + +def published_keys_for_encryption(object_keys): + """Object keys to re-encrypt with SSE-S3; excludes the private address CSV.""" + keys = [] + for key in object_keys: + if not key: + continue + normalized = key.rstrip('/') + if normalized.endswith(PRIVATE_ADDRESS_CSV): + continue + keys.append(key) + return sorted(set(keys)) + + +def normalize_published_s3_encryption(s3, object_keys): + """Re-encrypt only objects written during this publish pass (SSE-S3).""" + keys = published_keys_for_encryption(object_keys) + if not keys: + return + print(f'Updating server-side encryption for {len(keys)} published S3 object(s)') + for object_key in keys: + print('-', object_key) + s3.update_encryption(object_key) diff --git a/lib/etl_run_manifest.py b/lib/etl_run_manifest.py new file mode 100644 index 0000000..31e2052 --- /dev/null +++ b/lib/etl_run_manifest.py @@ -0,0 +1,163 @@ +import json +import traceback +import uuid +from contextlib import contextmanager + + +class EtlRunManifest: + @staticmethod + def _escape(value): + return str(value).replace("'", "''") + + def _literal(self, value): + return f"'{self._escape(value)}'" + + def _json_literal(self, value): + return f"'{self._escape(json.dumps(value))}'::jsonb" + + def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess): + self.db = db + self.schema_name = schema_name or 'public' + self.s3_prefix = s3_prefix or '' + self.mode = mode + self.reprocess_glob = reprocess_glob or '' + self.force_reprocess = force_reprocess + self.run_id = str(uuid.uuid4()) + self.file_details_by_name = {} + + def setup_tables(self): + self.db.execute_sql_file('create_etl_manifest_tables.sql') + + def create_run(self): + payload = { + "mode": self.mode, + "schema_name": self.schema_name, + "s3_prefix": self.s3_prefix, + "reprocess_glob": self.reprocess_glob, + "force_reprocess": self.force_reprocess, + } + 
self.db.sql(f""" + INSERT INTO etl_runs ( + run_id, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess, status, metadata, started_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(self.schema_name)}, {self._literal(self.s3_prefix)}, + {self._literal(self.mode)}, {self._literal(self.reprocess_glob)}, + {str(self.force_reprocess).upper()}, 'running', {self._json_literal(payload)}, NOW() + ) + """) + + def mark_run_completed( + self, + selected_count, + processed_count, + skipped_count, + files_needing_reprocess=None, + ): + metadata_patch = {} + if files_needing_reprocess: + metadata_patch['files_needing_reprocess'] = list(files_needing_reprocess) + metadata_sql = ( + f", metadata = metadata || {self._json_literal(metadata_patch)}" + if metadata_patch + else "" + ) + self.db.sql(f""" + UPDATE etl_runs + SET status = 'completed', + completed_at = NOW(), + selected_file_count = {selected_count}, + processed_file_count = {processed_count}, + skipped_file_count = {skipped_count} + {metadata_sql} + WHERE run_id = {self._literal(self.run_id)} + """) + + def mark_run_failed(self, exc): + message = str(exc) + details = {"traceback": traceback.format_exc()} + self.db.sql(f""" + UPDATE etl_runs + SET status = 'failed', + completed_at = NOW(), + error_message = {self._literal(message)}, + error_details = {self._json_literal(details)} + WHERE run_id = {self._literal(self.run_id)} + """) + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + if details is not None: + self.file_details_by_name[file_name] = dict(details) + stage_value = "NULL" if stage is None else self._literal(stage) + details_value = self._json_literal(details or {}) + error_message = "NULL" if error is None else self._literal(str(error)) + error_details = "NULL" if error is None else self._json_literal({'traceback': traceback.format_exc()}) + completed_at = "NOW()" if status in ("completed", "failed", "skipped") else "NULL" + started_at = "NOW()" if 
def upsert_step(self, step_name, status, details=None, error=None):
    """Insert or update this run's ``etl_steps`` row for ``step_name``.

    On conflict with an existing ``(run_id, step_name)`` row every column is
    overwritten with the new values, except ``started_at``, which keeps its
    first non-NULL value.

    Args:
        step_name: Name of the pipeline step.
        status: Step status. ``'running'`` stamps ``started_at``;
            ``'completed'`` and ``'failed'`` stamp ``completed_at``.
        details: Optional mapping stored in the ``details`` column
            (an empty mapping is stored when omitted).
        error: Optional exception or message. ``str(error)`` goes to
            ``error_message`` and a traceback document to ``error_details``.
    """
    details_value = self._json_literal(details or {})
    started_at = "NOW()" if status == "running" else "NULL"
    completed_at = "NOW()" if status in ("completed", "failed") else "NULL"
    if error is None:
        error_message = "NULL"
        error_details = "NULL"
    else:
        error_message = self._literal(str(error))
        if isinstance(error, BaseException):
            # Use the exception's own traceback: ``format_exc()`` only sees
            # the exception currently being handled, so an error object that
            # was built but never raised was recorded as "NoneType: None".
            trace_text = "".join(
                traceback.format_exception(type(error), error, error.__traceback__)
            )
        else:
            # Plain messages have no traceback of their own.
            trace_text = traceback.format_exc()
        error_details = self._json_literal({'traceback': trace_text})
    self.db.sql(f"""
        INSERT INTO etl_steps (
            run_id, step_name, status, started_at, completed_at, error_message, error_details, details, updated_at
        ) VALUES (
            {self._literal(self.run_id)}, {self._literal(step_name)}, {self._literal(status)}, {started_at}, {completed_at},
            {error_message}, {error_details}, {details_value}, NOW()
        )
        ON CONFLICT (run_id, step_name) DO UPDATE
        SET status = EXCLUDED.status,
            started_at = COALESCE(etl_steps.started_at, EXCLUDED.started_at),
            completed_at = EXCLUDED.completed_at,
            error_message = EXCLUDED.error_message,
            error_details = EXCLUDED.error_details,
            details = EXCLUDED.details,
            updated_at = NOW()
    """)
def completed_reprocess_files(db, reprocess_files):
    """Return the subset of ``reprocess_files`` already fully processed.

    A file counts as processed when it has an ``etl_files`` row with status
    ``completed`` in an ``etl_runs`` row that is itself ``completed``, and
    the file's ``details->>'cases_failed'`` is zero or absent.

    Args:
        db: Database handle exposing ``sql_fetch_all(sql)``.
        reprocess_files: Iterable of zip file names (may be empty or None).

    Returns:
        Set of file names from ``reprocess_files`` that match.
    """
    # Materialize once: an empty iterator is truthy, so testing the argument
    # directly would let it through and build an invalid ``IN ()`` clause.
    # De-duplicating and sorting also keeps the query small and stable.
    file_names = sorted(set(reprocess_files or ()))
    if not file_names:
        return set()
    # Double any single quote so each name is a valid SQL string literal.
    quoted_files = ",".join(
        "'" + name.replace("'", "''") + "'" for name in file_names
    )
    rows = db.sql_fetch_all(f"""
        SELECT DISTINCT ef.file_name
        FROM etl_files ef
        JOIN etl_runs er ON er.run_id = ef.run_id
        WHERE ef.status = 'completed'
          AND er.status = 'completed'
          AND COALESCE((ef.details->>'cases_failed')::int, 0) = 0
          AND ef.file_name IN ({quoted_files})
    """)
    return {row[0] for row in rows}
class FileSelection:
    """Bundle of the files chosen for one ETL run and where to fetch them.

    A plain value holder: each constructor argument is stored unchanged on
    the attribute of the same name.
    """

    def __init__(
        self,
        selected_zip_files,
        skipped_reprocess_files,
        new_file_set,
        reprocess_file_set,
        sftp_download_files,
        s3_download_files,
    ):
        # Outcome of selection: what will run and what was skipped.
        self.selected_zip_files, self.skipped_reprocess_files = (
            selected_zip_files,
            skipped_reprocess_files,
        )
        # Candidate pools the selection was drawn from.
        self.new_file_set, self.reprocess_file_set = (
            new_file_set,
            reprocess_file_set,
        )
        # Download routing: which source each selected file comes from.
        self.sftp_download_files, self.s3_download_files = (
            sftp_download_files,
            s3_download_files,
        )
def parse_xml_to_staging(manifest, staging_db, priv_dir, parse_num_threads=8, parse_fail_fast=False):
    """Parse every ``.zip`` in ``priv_dir`` into the DuckDB staging tables.

    For each archive the ``RunDate`` element of ``DATA_FILENAME`` is read
    first, then the same member is parsed with ``parse_file``. Per-file
    results are recorded on the manifest, and the ``parse_xml`` step is
    finalized via ``finalize_parse_xml_step`` once all files are done.

    Args:
        manifest: Run manifest receiving file and step updates.
        staging_db: DuckDB staging database handle.
        priv_dir: Directory holding the downloaded zip files.
        parse_num_threads: Forwarded to ``parse_file`` as ``num_threads``.
        parse_fail_fast: Forwarded to ``finalize_parse_xml_step``.
    """
    def sort_by_date(file):
        match = re.search(r'(\d+.+)\.zip', file)
        if match is None:
            # No digits ahead of ``.zip``: sort by the raw path instead of
            # failing the whole run with AttributeError on ``None.group``.
            return file
        return match.group(0).replace('.', ' ')

    local_zip_files = sorted(
        (os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')),
        key=sort_by_date,
    )

    manifest.upsert_step('parse_xml', 'running')
    staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql')
    print('Processing files:')
    total_cases_failed = 0
    files_with_failures = 0
    for zip_file in local_zip_files:
        file_name = os.path.basename(zip_file)
        manifest.upsert_file(file_name, source='local', status='processing', stage='parse')
        extract_date = None
        # Open the archive once and close it deterministically; calling
        # ``ZipFile(...).open(...)`` directly leaves the archive handle open.
        with zipfile.ZipFile(zip_file, 'r') as archive:
            # First pass: read only the run date from the XML member.
            with archive.open(DATA_FILENAME) as xml_file:
                for _, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')):
                    if not extract_date:
                        extract_date = elem.text
                        break
            # Second pass: re-open the member from the start for the parse.
            with archive.open(DATA_FILENAME) as xml_file:
                parse_result = parse_file(
                    xml_file,
                    staging_db,
                    extract_date,
                    num_threads=parse_num_threads,
                    file_name=file_name,
                )
        failed = upsert_parsed_etl_file(manifest, file_name, parse_result, extract_date)
        total_cases_failed += failed
        if failed > 0:
            files_with_failures += 1

    finalize_parse_xml_step(
        manifest,
        total_cases_failed,
        files_with_failures,
        parse_fail_fast=parse_fail_fast,
    )
geocode_workers, census_batch_chunk_size): + """Geocode all rows in ``oca_addresses_staging.csv`` before S3 upload.""" + manifest.upsert_step('geocode_staging', 'running') + geocoded_row_count = geocode_staging_addresses_csv( + pub_dir, + geocode_workers, + census_batch_chunk_size, + ) + manifest.upsert_step( + 'geocode_staging', + 'completed', + details={'geocoded_row_count': geocoded_row_count}, + ) + return geocoded_row_count + + +def upload_staging_csvs(manifest, pub_dir, mode, s3_args, s3_prefix): + """Upload preprocessed staging CSVs to S3 ``public/``.""" + manifest.upsert_step('upload_staging', 'running') + public_files = list_staging_csvs_in_dir(pub_dir) + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + manifest.upsert_step( + 'upload_staging', + 'completed', + details={'uploaded_file_count': len(public_files)}, + ) + + +def _assert_schema_bootstrap_context(db, expected_schema): + schema_name = (expected_schema or '').strip() + if not schema_name: + raise RuntimeError('DB schema must be set before running core table bootstrap.') + + schema_row = db.sql_fetch_one( + "SELECT current_schema(), current_setting('search_path')" + ) + current_schema, search_path = schema_row if schema_row else (None, '') + if not current_schema: + raise RuntimeError('Unable to resolve active schema before core table bootstrap.') + + if current_schema != schema_name: + raise RuntimeError( + f"Schema bootstrap guard failed: expected current_schema '{schema_name}', got '{current_schema}'." + ) + + if schema_name not in (search_path or ''): + raise RuntimeError( + f"Schema bootstrap guard failed: search_path '{search_path}' does not include '{schema_name}'." 
+ ) + + +def ensure_core_tables_exist(db, expected_schema): + _assert_schema_bootstrap_context(db, expected_schema) + db.execute_sql_file('create_tables.sql') + + +def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection, expected_schema): + imported_staging_tables = staging_tables_with_rows(pub_dir) + staging_tables = [t + '_staging' for t in OCA_TABLES] + manifest.upsert_step('promote_staging', 'running') + db.set_statement_timeout() + ensure_core_tables_exist(db, expected_schema) + db.execute_sql_file('create_tables_staging.sql') + for t in staging_tables: + csv_filepath = os.path.join(pub_dir, f"{t}.csv") + if csv_has_rows(csv_filepath): + columns = '' + if t == 'oca_appearances_staging': + columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' + db.sql(f""" + SELECT aws_s3.table_import_from_s3( + '{t}', '{columns}', '(FORMAT CSV, HEADER)', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 'us-east-1'), + aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') + ); + """) + + db.execute_sql_file('normalize_staging_after_import.sql') + db.execute_sql_file('update_appearance_outcomes.sql') + counts_before = promotion_table_counts(db) + checksum_before = promotion_counts_checksum(counts_before) + print('\t...Promoting staging tables to main (single transaction)') + promote_staging_to_main(db) + counts_after = promotion_table_counts(db) + checksum_after = promotion_counts_checksum(counts_after) + for selected_name in selection.selected_zip_files: + source = 'sftp' if selected_name in selection.new_file_set else 's3_private' + parse_details = manifest.file_details_by_name.get(selected_name, {}) + upsert_promoted_etl_file(manifest, selected_name, source, parse_details) + manifest.upsert_step( + 'promote_staging', + 'completed', + details={ + 'counts_before': 
def count_tombstone_orphans(db):
    """Count ``oca_index`` cases whose ``oca_metadata`` row has a ``deletedate``.

    Returns the count as an ``int``, or 0 when the query yields no row.
    """
    orphan_query = """
        SELECT COUNT(*)::bigint
        FROM oca_index i
        INNER JOIN oca_metadata m ON m.indexnumberid = i.indexnumberid
        WHERE m.deletedate IS NOT NULL
    """
    result = db.sql_fetch_one(orphan_query)
    if not result:
        return 0
    return int(result[0])
def publish_public_artifacts(manifest, db, s3_args, s3_prefix, mode, selection, pub_dir):
    """Rebuild the address views, export the public CSVs and upload the date files.

    Runs under the ``publish_public`` manifest step and returns the list of
    published S3 keys: the core table exports, the address view exports and
    the two date artifacts.
    """
    step = 'publish_public'
    manifest.upsert_step(step, 'running')
    print('Publishing address views and all public CSVs')
    db.execute_sql_file('create_addresses_views.sql')

    # Core tables first, then each address view in ADDRESS_VIEW_EXPORTS order.
    published_keys = publish_core_tables(db, s3_args, s3_prefix)
    published_keys.extend(
        export_table_to_s3(db, view_name, s3_filename, s3_args, s3_prefix)
        for view_name, s3_filename in ADDRESS_VIEW_EXPORTS
    )

    # The date artifacts are built from the last entry of the selection.
    last_selected_zip = selection.selected_zip_files[-1]
    create_date_files(last_selected_zip, pub_dir)

    date_files = ['last-updated-shield.svg', 'last-updated-date.txt']
    upload_jobs = [(name, pub_dir, mode, s3_args, s3_prefix) for name in date_files]
    worker_count = min(2, multiprocessing.cpu_count())
    with multiprocessing.Pool(processes=worker_count) as pool:
        pool.starmap(upload_public_file, upload_jobs)
    published_keys += [
        s3_key(f"{S3_PUBLIC_FOLDER}/{name}", s3_prefix) for name in date_files
    ]

    manifest.upsert_step(
        step,
        'completed',
        details={'published_object_count': len(published_keys)},
    )
    return published_keys
def cases_failed_from_details(details):
    """Return the ``cases_failed`` counter from a details mapping as an int.

    A missing or empty mapping, or a missing or falsy counter, yields 0.
    """
    raw_count = details.get('cases_failed') if details else None
    return int(raw_count) if raw_count else 0
def upsert_promoted_etl_file(manifest, file_name, source, parse_details):
    """Record the post-promotion state of one zip file on the manifest.

    A file whose parse details report zero failed cases is marked
    ``completed`` at stage ``promote``. A file with failed cases stays at
    status ``parsed`` / stage ``parse`` with ``parse_complete`` set to False
    in its details.

    Args:
        manifest: Run manifest exposing ``upsert_file``.
        file_name: Zip file name.
        source: Origin label stored on the file row.
        parse_details: Details mapping recorded when the file was parsed.

    Returns:
        True when the file was marked completed, False otherwise.
    """
    cases_failed = cases_failed_from_details(parse_details)
    if cases_failed > 0:
        details = dict(parse_details)
        details['parse_complete'] = False
        # Each upsert replaces the stored error columns with this call's
        # values, so omitting ``error`` here would blank the failure summary
        # written at parse time. Repeat it to keep the row self-explaining.
        cases_seen = details.get('cases_seen')
        if cases_seen is not None:
            failure_summary = f"{cases_failed} of {cases_seen} cases failed to parse"
        else:
            failure_summary = f"{cases_failed} cases failed to parse"
        manifest.upsert_file(
            file_name,
            source=source,
            status='parsed',
            stage='parse',
            details=details,
            error=failure_summary,
        )
        return False
    manifest.upsert_file(
        file_name,
        source=source,
        status='completed',
        stage='promote',
        details=parse_details,
    )
    return True
class StagingWriteBuffer:
    """
    Buffers DELETE + INSERT statements and flushes in transaction windows.

    Flush order preserves per-case child replacement: all queued DELETEs run
    before any queued INSERTs in the same transaction.

    A "case window" (``begin_case`` ... ``commit_case`` / ``discard_case``)
    groups the writes of one case so they can be dropped together if the
    case fails, without touching writes queued by earlier cases.
    """

    def __init__(self, db: DuckDB, config: ParseWriteConfig):
        self.db = db
        self.config = config
        # Queued DELETE statements as (sql, params), in arrival order.
        self._deletes: list[tuple[str, tuple | None]] = []
        # Queued INSERT parameter rows, grouped by their SQL text.
        self._inserts: dict[str, list[tuple | None]] = {}
        self._cases_in_window = 0
        self._flush_count = 0
        self._in_case_window = False
        # Queue lengths captured at begin_case: (delete count, rows per SQL).
        self._case_mark: tuple[int, dict[str, int]] | None = None

    def _pending_insert_count(self) -> int:
        """Return the number of queued INSERT rows across all statements."""
        return sum(len(rows) for rows in self._inserts.values())

    def queue_delete(self, sql: str, params: tuple | None) -> None:
        """Queue a DELETE statement for the next flush."""
        self._deletes.append((sql, params))

    def queue_insert(self, sql: str, params: tuple | None) -> None:
        """Queue an INSERT row; flush at ``batch_size`` outside a case window."""
        self._inserts.setdefault(sql, []).append(params)
        if (
            not self._in_case_window
            and self._pending_insert_count() >= self.config.batch_size
        ):
            self.flush(reason='batch_size')

    def begin_case(self) -> None:
        """Start a whole-case write window (metadata + children)."""
        self._case_mark = (
            len(self._deletes),
            {sql: len(rows) for sql, rows in self._inserts.items()},
        )
        self._in_case_window = True

    def discard_case(self) -> None:
        """Drop queued writes for the current case without executing them."""
        if not self._in_case_window:
            return
        mark = self._case_mark
        if mark is not None:
            del_len, insert_lens = mark
            del self._deletes[del_len:]
            for sql in list(self._inserts):
                rows = self._inserts[sql]
                # Truncate back to the length recorded at begin_case. Rows
                # below the mark were queued by earlier cases and must
                # survive, even when this case added nothing for ``sql``.
                del rows[insert_lens.get(sql, 0):]
                if not rows:
                    del self._inserts[sql]
        self._case_mark = None
        self._in_case_window = False

    def commit_case(self) -> None:
        """End a successful case window and apply cross-case cadence flush."""
        if self._in_case_window:
            self._case_mark = None
            self._in_case_window = False
        self.on_case_complete()

    def on_case_complete(self) -> None:
        """Count one finished case; flush every ``flush_every_n_cases`` cases."""
        self._cases_in_window += 1
        if self._cases_in_window >= self.config.flush_every_n_cases:
            self.flush(reason='case_cadence')

    def flush(self, reason: str = 'explicit') -> None:
        """Execute all queued writes in one transaction and empty the queues.

        Queued DELETEs run first, then the INSERTs grouped per statement
        (``executemany`` when a statement has more than one row). ``reason``
        is accepted for callers' bookkeeping and is not used here.
        """
        if not self._deletes and not self._inserts:
            self._cases_in_window = 0
            return

        with self.db.transaction():
            for sql, params in self._deletes:
                self.db._execute_unlocked(sql, params)
            for sql, params_list in self._inserts.items():
                if not params_list:
                    continue
                if len(params_list) == 1:
                    self.db._execute_unlocked(sql, params_list[0])
                else:
                    self.db._executemany_unlocked(sql, params_list)

        self._flush_count += 1
        self._deletes.clear()
        self._inserts.clear()
        self._cases_in_window = 0
        if self._in_case_window:
            # The queues are empty now, so anything queued from here on
            # belongs to the open case; reset the mark to match.
            self._case_mark = (0, {})
def _truncate_parse_error(message: str) -> str:
    """Cap ``message`` at ``MAX_PARSE_ERROR_SAMPLE_LEN`` characters.

    Messages within the limit are returned unchanged; longer ones are cut
    so that the result, including a trailing ``...``, is exactly the limit.
    """
    limit = MAX_PARSE_ERROR_SAMPLE_LEN
    if len(message) > limit:
        kept = message[: limit - 3]
        return kept + '...'
    return message
for _ in columns]) insert_sql = f"INSERT OR REPLACE INTO oca_index_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_causes(case, db): @@ -120,7 +168,7 @@ def parse_causes(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # Eelete existing records for this case to handle multiple causes - db.execute("DELETE FROM oca_causes_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_causes_staging WHERE indexnumberid = ?", (IndexNumberId,)) causes = case.find(oca_tag('PrimaryClaimCauseOfActions')) @@ -138,7 +186,7 @@ def parse_causes(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_causes_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_addresses(case, db): @@ -151,7 +199,7 @@ def parse_addresses(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple addresses - db.execute("DELETE FROM oca_addresses_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_addresses_staging WHERE indexnumberid = ?", (IndexNumberId,)) addresses = case.find(oca_tag('PropertyAddresses')) @@ -171,7 +219,7 @@ def parse_addresses(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_addresses_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_parties(case, db): @@ -184,7 +232,7 @@ def parse_parties(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple parties - db.execute("DELETE FROM oca_parties_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_parties_staging WHERE indexnumberid = ?", (IndexNumberId,)) parties = case.find(oca_tag('Parties')) @@ -203,7 +251,7 @@ def parse_parties(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_parties_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_events(case, db): @@ -216,7 +264,7 @@ def parse_events(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple events - db.execute("DELETE FROM oca_events_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_events_staging WHERE indexnumberid = ?", (IndexNumberId,)) events = case.find(oca_tag('Events')) @@ -236,7 +284,7 @@ def parse_events(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_events_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def appearance_outcome_to_dict(elem): @@ -267,7 +315,7 @@ def parse_appearances(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple appearances - db.execute("DELETE FROM oca_appearances_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_appearances_staging WHERE indexnumberid = ?", (IndexNumberId,)) appearances = case.find(oca_tag('Appearances')) @@ -297,7 +345,7 @@ def parse_appearances(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_appearances_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_motions(case, db): @@ -310,7 +358,7 @@ def parse_motions(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple motions - db.execute("DELETE FROM oca_motions_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_motions_staging WHERE indexnumberid = ?", (IndexNumberId,)) motions = case.find(oca_tag('Motions')) @@ -332,7 +380,7 @@ def parse_motions(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_motions_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_decisions(case, db): @@ -349,7 +397,7 @@ def parse_decisions(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple decisions - db.execute("DELETE FROM oca_decisions_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_decisions_staging WHERE indexnumberid = ?", (IndexNumberId,)) decisions = case.find(oca_tag('Decisions')) @@ -367,7 +415,7 @@ def parse_decisions(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_decisions_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_judgments(case, db): @@ -381,7 +429,7 @@ def parse_judgments(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple judgments - db.execute("DELETE FROM oca_judgments_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_judgments_staging WHERE indexnumberid = ?", (IndexNumberId,)) judgments = case.find(oca_tag('Judgments')) @@ -407,7 +455,7 @@ def parse_judgments(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_judgments_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_warrants(case, db): @@ -421,7 +469,7 @@ def parse_warrants(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple warrants - db.execute("DELETE FROM oca_warrants_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_warrants_staging WHERE indexnumberid = ?", (IndexNumberId,)) judgments = case.find(oca_tag('Judgments')) @@ -469,7 +517,7 @@ def parse_warrants(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_warrants_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def update_metadata(case, db, extract_date): @@ -496,7 +544,7 @@ def update_metadata(case, db, extract_date): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT OR REPLACE INTO oca_metadata_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_case(case, db, extract_date): @@ -506,11 +554,16 @@ def parse_case(case, db, extract_date): :param db: a DuckDB object :param extract_date: date of extract """ - + buffer = getattr(db, 'write_buffer', None) + if buffer is not None: + buffer.begin_case() + update_metadata(case, db, extract_date) # If this case is flagged for removal, skip the parsing steps if is_case_to_delete(case): + if buffer is not None: + buffer.commit_case() return parse_index(case, db) @@ -524,58 +577,84 @@ def parse_case(case, db, extract_date): parse_judgments(case, db) parse_warrants(case, db) + if buffer is not None: + buffer.commit_case() + -def _worker_thread(case_queue, db_queue, extract_date, thread_id): +def _worker_thread(case_queue, db_queue, extract_date, thread_id, stats: ParseFileResult): """Worker thread that processes cases from the queue""" while True: try: case = case_queue.get(timeout=1) + except queue.Empty: + continue + + try: if case is None: # Sentinel value to stop thread break - + # Each thread needs its own database connection thread_db = db_queue.get() try: parse_case(case, thread_db, extract_date) + stats.record_ok() except Exception as e: - print(f"Thread {thread_id}: Error parsing case: {e}") + index_id = _index_number_id_from_case(case) + if index_id: + logger.warning( + "Parse case failed thread=%s indexnumberid=%s: %s", + thread_id, + index_id, + e, + exc_info=True, + ) + else: + logger.warning( + "Parse case failed thread=%s: %s", + thread_id, + e, + exc_info=True, + ) + buffer = getattr(thread_db, 'write_buffer', None) + if buffer is not None: + buffer.discard_case() + stats.record_failed(str(e)) finally: # Clear the case copy from memory case.clear() db_queue.put(thread_db) # Return db connection to pool - - except queue.Empty: - continue finally: 
case_queue.task_done() -def parse_file(xml_file, staging_db, extract_date, num_threads=8): +def parse_file(xml_file, staging_db, extract_date, num_threads=8, file_name=None): """ Parse XML file with multiple threads - + :param xml_file: file-like object or path to XML file :param staging_db: DuckDB database object :param extract_date: date of extract :param num_threads: number of worker threads (increasing this doesn't speed up much, bottleneck is the database writes) + :param file_name: basename for summary logging (optional) + :return: ParseFileResult with per-zip counters """ from .duckdb_database import DuckDB - - # Create queues + + stats = ParseFileResult() case_queue = queue.Queue(maxsize=num_threads * 10) db_queue = queue.Queue() - - # Create database connections for each thread + for _ in range(num_threads): thread_db = DuckDB(staging_db.dbname) + attach_write_buffer(thread_db) db_queue.put(thread_db) # Start worker threads threads = [] for i in range(num_threads): t = threading.Thread( - target=_worker_thread, - args=(case_queue, db_queue, extract_date, i) + target=_worker_thread, + args=(case_queue, db_queue, extract_date, i, stats), ) t.start() threads.append(t) @@ -583,32 +662,39 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): # Parse XML and feed cases to queue context = etree.iterparse(xml_file, tag=oca_tag('Index')) - - total_cases = 0 - for _, case in frogress.bar(context): - # Make a deep copy since we'll be clearing the original + for _, case in context: case_copy = etree.fromstring(etree.tostring(case)) - case_queue.put(case_copy) - total_cases += 1 - + stats.record_seen() + if stats.cases_seen % PARSE_PROGRESS_INTERVAL == 0: + logger.info("Parsed %s cases", stats.cases_seen) + # Clear the case element to free memory case.clear() while case.getprevious() is not None: del case.getparent()[0] - # Signal threads to stop + # Signal threads to stop (workers flush remaining buffer on sentinel) for _ in range(num_threads): 
case_queue.put(None) - + # Wait for all threads to complete for t in threads: t.join() - - # Close thread database connections + + # Close thread database connections (final flush for any stragglers) while not db_queue.empty(): thread_db = db_queue.get() + flush_write_buffer(thread_db) thread_db.close() - - print(f"Processed {total_cases} cases with {num_threads} threads") + + label = file_name or getattr(xml_file, 'name', None) or 'unknown' + logger.info( + "Parse zip summary file=%s seen=%d ok=%d failed=%d", + label, + stats.cases_seen, + stats.cases_parsed_ok, + stats.cases_failed, + ) + return stats diff --git a/lib/s3.py b/lib/s3.py index 7bb7f3d..9a22b01 100644 --- a/lib/s3.py +++ b/lib/s3.py @@ -12,7 +12,7 @@ def s3_client(aws_id, aws_key): 's3', aws_access_key_id=aws_id, aws_secret_access_key=aws_key, - config=Config(connect_timeout=10, read_timeout=100, retries={'max_attempts': 10}) + config=Config(connect_timeout=10, read_timeout=100, retries={'max_attempts': 10}, signature_version='s3v4') ) return s3 @@ -139,7 +139,7 @@ def upload_file(self, object_name, file_path): }[ext] # date-updated image needs to have no-cache to be used in github readme - cache_control = 'no-cache' if content_type == 'image/png' else '' + cache_control = 'no-cache' if content_type in ('image/png', 'image/svg+xml') else '' # Put the object into the bucket put_object(self.s3, self.bucket_name, object_name, file_path, content_type, cache_control) @@ -155,3 +155,22 @@ def list_files(self, pattern, folder=''): files = [os.path.basename(x) for x in all_files if x != folder] return files + + def update_encryption(self, object_key): + """ + Update an S3 object's server-side encryption to SSE-S3 (AES256). 
+ + :param object_key: Object key in the bucket + """ + try: + self.s3.copy_object( + Bucket=self.bucket_name, + CopySource={'Bucket': self.bucket_name, 'Key': object_key}, + Key=object_key, + ServerSideEncryption='AES256', + MetadataDirective='COPY' + ) + return True + except ClientError as e: + logging.error(e) + return False diff --git a/lib/sql/create_addresses_views.sql b/lib/sql/create_addresses_views.sql index 4fb6070..ee42849 100644 --- a/lib/sql/create_addresses_views.sql +++ b/lib/sql/create_addresses_views.sql @@ -6,12 +6,6 @@ DROP VIEW IF EXISTS public.oca_addresses_with_bbl; DROP VIEW IF EXISTS public.oca_addresses_with_ct; DROP VIEW IF EXISTS public.oca_addresses_public; --- Drop geom column if it exists -ALTER TABLE oca_addresses -DROP COLUMN IF EXISTS geom; -DROP INDEX IF EXISTS oca_addresses_geom_idx; - --- Recreate views CREATE OR REPLACE VIEW public.oca_addresses_with_bbl AS SELECT indexnumberid, @@ -40,17 +34,6 @@ CREATE OR REPLACE VIEW public.oca_addresses_with_bbl AS FROM oca_addresses o LEFT JOIN pluto p ON LEFT(p.bbl, 10) = o.bbl; --- update oca_addresses with geom field -ALTER TABLE oca_addresses - ADD COLUMN geom Geometry(Point, 4326); - -UPDATE oca_addresses - SET geom = ST_SetSRID(ST_Point(lon, lat),4326); - -CREATE INDEX oca_addresses_geom_idx - ON oca_addresses - USING GIST (geom); - CREATE OR REPLACE VIEW public.oca_addresses_with_ct AS SELECT o.indexnumberid, t.geoid, diff --git a/lib/sql/create_etl_manifest_tables.sql b/lib/sql/create_etl_manifest_tables.sql new file mode 100644 index 0000000..9198a8c --- /dev/null +++ b/lib/sql/create_etl_manifest_tables.sql @@ -0,0 +1,56 @@ +CREATE TABLE IF NOT EXISTS etl_runs ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL UNIQUE, + schema_name TEXT NOT NULL, + s3_prefix TEXT NOT NULL DEFAULT '', + mode TEXT, + reprocess_glob TEXT, + force_reprocess BOOLEAN NOT NULL DEFAULT FALSE, + status TEXT NOT NULL, + selected_file_count INTEGER NOT NULL DEFAULT 0, + processed_file_count INTEGER NOT 
NULL DEFAULT 0, + skipped_file_count INTEGER NOT NULL DEFAULT 0, + started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + metadata JSONB NOT NULL DEFAULT '{}'::jsonb +); + +CREATE INDEX IF NOT EXISTS idx_etl_runs_schema_started + ON etl_runs (schema_name, started_at DESC); + +CREATE TABLE IF NOT EXISTS etl_files ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL REFERENCES etl_runs (run_id) ON DELETE CASCADE, + file_name TEXT NOT NULL, + source TEXT NOT NULL, + status TEXT NOT NULL, + stage TEXT, + details JSONB NOT NULL DEFAULT '{}'::jsonb, + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (run_id, file_name) +); + +CREATE INDEX IF NOT EXISTS idx_etl_files_name_status + ON etl_files (file_name, status); + +CREATE TABLE IF NOT EXISTS etl_steps ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL REFERENCES etl_runs (run_id) ON DELETE CASCADE, + step_name TEXT NOT NULL, + status TEXT NOT NULL, + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + details JSONB NOT NULL DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (run_id, step_name) +); diff --git a/lib/sql/create_geocode_staging_table.sql b/lib/sql/create_geocode_staging_table.sql new file mode 100644 index 0000000..aa9ae92 --- /dev/null +++ b/lib/sql/create_geocode_staging_table.sql @@ -0,0 +1,30 @@ +CREATE TABLE IF NOT EXISTS oca_addresses_geocode_staging ( + indexnumberid text, + street1 text, + street2 text, + city text, + state text, + postalcode text, + status text, + house_number text, + street_name text, + borough_code text, + place_name text, + sname text, + hnum text, + boro text, + lat real, + bin text, + bbl text, + cd text, + ct text, + council text, + 
grc text, + grc2 text, + msg text, + msg2 text, + lon real, + zip_code text +); + +TRUNCATE oca_addresses_geocode_staging; diff --git a/lib/sql/create_tables.sql b/lib/sql/create_tables.sql index 3bd8ebe..6e81305 100644 --- a/lib/sql/create_tables.sql +++ b/lib/sql/create_tables.sql @@ -1,8 +1,4 @@ -DROP VIEW IF EXISTS oca_addresses_with_bbl CASCADE; -DROP VIEW IF EXISTS oca_addresses_with_ct CASCADE; - -DROP TABLE IF EXISTS oca_index CASCADE; -CREATE TABLE oca_index ( +CREATE TABLE IF NOT EXISTS oca_index ( indexnumberid text PRIMARY KEY, court text, fileddate date, @@ -17,16 +13,14 @@ CREATE TABLE oca_index ( dateofjurydemand date ); -DROP TABLE IF EXISTS oca_causes CASCADE; -CREATE TABLE oca_causes ( +CREATE TABLE IF NOT EXISTS oca_causes ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, causeofactiontype text, interestfromdate date, amount numeric ); -DROP TABLE IF EXISTS oca_addresses CASCADE; -CREATE TABLE oca_addresses ( +CREATE TABLE IF NOT EXISTS oca_addresses ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, street1 text, street2 text, @@ -52,11 +46,11 @@ CREATE TABLE oca_addresses ( msg text, msg2 text, lon real, - zip_code text + zip_code text, + geom Geometry(Point, 4326) ); -DROP TABLE IF EXISTS oca_parties CASCADE; -CREATE TABLE oca_parties ( +CREATE TABLE IF NOT EXISTS oca_parties ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, role text, partytype text, @@ -64,8 +58,7 @@ CREATE TABLE oca_parties ( undertenant text ); -DROP TABLE IF EXISTS oca_events CASCADE; -CREATE TABLE oca_events ( +CREATE TABLE IF NOT EXISTS oca_events ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, eventname text, fileddate date, @@ -74,8 +67,7 @@ CREATE TABLE oca_events ( answertype text ); -DROP TABLE IF EXISTS oca_appearances CASCADE; -CREATE TABLE oca_appearances ( +CREATE TABLE IF NOT EXISTS oca_appearances ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, appearanceid bigserial, appearancedatetime timestamp, @@ 
-85,16 +77,14 @@ CREATE TABLE oca_appearances ( motionsequence int ); -DROP TABLE IF EXISTS oca_appearance_outcomes CASCADE; -CREATE TABLE oca_appearance_outcomes ( +CREATE TABLE IF NOT EXISTS oca_appearance_outcomes ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, appearanceid bigint, appearanceoutcometype text, outcomebasedontype text ); -DROP TABLE IF EXISTS oca_motions CASCADE; -CREATE TABLE oca_motions ( +CREATE TABLE IF NOT EXISTS oca_motions ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, motiontype text, @@ -106,8 +96,7 @@ CREATE TABLE oca_motions ( ); -DROP TABLE IF EXISTS oca_decisions CASCADE; -CREATE TABLE oca_decisions ( +CREATE TABLE IF NOT EXISTS oca_decisions ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, resultof text, @@ -115,8 +104,7 @@ CREATE TABLE oca_decisions ( ); -DROP TABLE IF EXISTS oca_judgments CASCADE; -CREATE TABLE oca_judgments ( +CREATE TABLE IF NOT EXISTS oca_judgments ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, amendedfromjudgmentsequence int, @@ -131,8 +119,7 @@ CREATE TABLE oca_judgments ( debtorsroles text[] ); -DROP TABLE IF EXISTS oca_warrants CASCADE; -CREATE TABLE oca_warrants ( +CREATE TABLE IF NOT EXISTS oca_warrants ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, judgmentsequence int, sequence text, @@ -160,8 +147,7 @@ CREATE TABLE oca_warrants ( executiondate date ); -DROP TABLE IF EXISTS oca_metadata CASCADE; -CREATE TABLE oca_metadata ( +CREATE TABLE IF NOT EXISTS oca_metadata ( -- we don't want to delete records here when deleted from others indexnumberid text PRIMARY KEY, initialdate date, @@ -170,15 +156,26 @@ CREATE TABLE oca_metadata ( ); -CREATE INDEX ON oca_causes (indexnumberid); -CREATE INDEX ON oca_addresses (indexnumberid); -CREATE INDEX ON oca_addresses (bbl); -CREATE INDEX ON oca_parties (indexnumberid); -CREATE INDEX ON oca_events (indexnumberid); -CREATE INDEX ON oca_appearances (indexnumberid); 
-CREATE INDEX ON oca_appearance_outcomes (indexnumberid); -CREATE INDEX ON oca_motions (indexnumberid); -CREATE INDEX ON oca_decisions (indexnumberid); -CREATE INDEX ON oca_judgments (indexnumberid); -CREATE INDEX ON oca_warrants (indexnumberid); -CREATE INDEX ON oca_metadata (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_causes_indexnumberid_idx ON oca_causes (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_addresses_indexnumberid_idx ON oca_addresses (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_addresses_bbl_idx ON oca_addresses (bbl); +CREATE INDEX IF NOT EXISTS oca_addresses_geom_idx ON oca_addresses USING GIST (geom); + +-- Existing deployments: add geom column and index if missing. +ALTER TABLE oca_addresses ADD COLUMN IF NOT EXISTS geom Geometry(Point, 4326); +CREATE INDEX IF NOT EXISTS oca_addresses_geom_idx ON oca_addresses USING GIST (geom); + +UPDATE oca_addresses +SET geom = ST_SetSRID(ST_Point(lon, lat), 4326) +WHERE geom IS NULL + AND lat IS NOT NULL + AND lon IS NOT NULL; +CREATE INDEX IF NOT EXISTS oca_parties_indexnumberid_idx ON oca_parties (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_events_indexnumberid_idx ON oca_events (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_appearances_indexnumberid_idx ON oca_appearances (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_appearance_outcomes_indexnumberid_idx ON oca_appearance_outcomes (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_motions_indexnumberid_idx ON oca_motions (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_decisions_indexnumberid_idx ON oca_decisions (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_judgments_indexnumberid_idx ON oca_judgments (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_warrants_indexnumberid_idx ON oca_warrants (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_metadata_indexnumberid_idx ON oca_metadata (indexnumberid); diff --git a/lib/sql/ensure_promotion_indexes.sql b/lib/sql/ensure_promotion_indexes.sql new file mode 100644 index 0000000..bb1093c --- 
/dev/null +++ b/lib/sql/ensure_promotion_indexes.sql @@ -0,0 +1,6 @@ +-- Indexes supporting deterministic, scoped promotion deletes (idempotent reruns). +CREATE INDEX IF NOT EXISTS oca_addresses_promotion_natural_key_idx + ON oca_addresses (indexnumberid, street1, street2, city, state, postalcode); + +CREATE INDEX IF NOT EXISTS oca_index_staging_indexnumberid_idx + ON oca_index_staging (indexnumberid); diff --git a/lib/sql/normalize_staging_after_import.sql b/lib/sql/normalize_staging_after_import.sql new file mode 100644 index 0000000..4ad35bb --- /dev/null +++ b/lib/sql/normalize_staging_after_import.sql @@ -0,0 +1,18 @@ +-- Post-import staging normalization (deterministic casts / null coercion). +-- Array bracket formatting is handled in lib/etl_csv.py before S3 import. + +UPDATE oca_appearances_staging +SET motionsequence = NULL +WHERE motionsequence IS NOT NULL AND motionsequence::text = ''; + +UPDATE oca_judgments_staging +SET amendedfromjudgmentsequence = NULL +WHERE amendedfromjudgmentsequence IS NOT NULL AND amendedfromjudgmentsequence::text = ''; + +UPDATE oca_warrants_staging +SET executionstayeddays = NULL +WHERE executionstayeddays IS NOT NULL AND executionstayeddays::text = ''; + +UPDATE oca_warrants_staging +SET issuancestayeddays = NULL +WHERE issuancestayeddays IS NOT NULL AND issuancestayeddays::text = ''; diff --git a/lib/sql/promote_staging_to_main.sql b/lib/sql/promote_staging_to_main.sql new file mode 100644 index 0000000..12aee16 --- /dev/null +++ b/lib/sql/promote_staging_to_main.sql @@ -0,0 +1,185 @@ +-- Atomic staging -> main promotion for one import batch. +-- Tombstoned cases (oca_metadata / oca_metadata_staging deletedate) are purged from +-- oca_index (children CASCADE) and excluded from staging upserts. +-- oca_index uses UPSERT for active staging cases; metadata merged before staging drops. 
+ +SET session_replication_role = replica; + +CREATE TEMP TABLE tombstoned_ids ON COMMIT DROP AS +SELECT indexnumberid FROM oca_metadata WHERE deletedate IS NOT NULL +UNION +SELECT indexnumberid FROM oca_metadata_staging WHERE deletedate IS NOT NULL; + +DELETE FROM oca_index +WHERE indexnumberid IN (SELECT indexnumberid FROM tombstoned_ids); + +CREATE TEMP TABLE promotion_active_staging_ids ON COMMIT DROP AS +SELECT s.indexnumberid +FROM oca_index_staging s +WHERE NOT EXISTS ( + SELECT 1 FROM tombstoned_ids t WHERE t.indexnumberid = s.indexnumberid +); + +-- Child tables keyed by indexnumberid (full per-case row replace for active staging cases). +DELETE FROM oca_appearance_outcomes +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_appearances +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_warrants +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_judgments +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_decisions +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_motions +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_events +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_parties +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +DELETE FROM oca_causes +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +-- Addresses: natural-line key aligned with incremental geocode. 
+DELETE FROM oca_addresses m +WHERE m.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) +AND NOT EXISTS ( + SELECT 1 + FROM oca_addresses_staging s + WHERE s.indexnumberid = m.indexnumberid + AND m.street1 IS NOT DISTINCT FROM s.street1 + AND m.street2 IS NOT DISTINCT FROM s.street2 + AND m.city IS NOT DISTINCT FROM s.city + AND m.state IS NOT DISTINCT FROM s.state + AND m.postalcode IS NOT DISTINCT FROM s.postalcode +); + +DELETE FROM oca_addresses m +USING oca_addresses_staging s +WHERE m.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) + AND m.indexnumberid = s.indexnumberid + AND m.street1 IS NOT DISTINCT FROM s.street1 + AND m.street2 IS NOT DISTINCT FROM s.street2 + AND m.city IS NOT DISTINCT FROM s.city + AND m.state IS NOT DISTINCT FROM s.state + AND m.postalcode IS NOT DISTINCT FROM s.postalcode; + +INSERT INTO oca_index ( + indexnumberid, court, fileddate, propertytype, classification, + specialtydesignationtypes, status, disposeddate, disposedreason, + firstpaper, primaryclaimtotal, dateofjurydemand +) +SELECT + indexnumberid, court, fileddate, propertytype, classification, + specialtydesignationtypes, status, disposeddate, disposedreason, + firstpaper, primaryclaimtotal, dateofjurydemand +FROM oca_index_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) +ON CONFLICT (indexnumberid) DO UPDATE SET + court = EXCLUDED.court, + fileddate = EXCLUDED.fileddate, + propertytype = EXCLUDED.propertytype, + classification = EXCLUDED.classification, + specialtydesignationtypes = EXCLUDED.specialtydesignationtypes, + status = EXCLUDED.status, + disposeddate = EXCLUDED.disposeddate, + disposedreason = EXCLUDED.disposedreason, + firstpaper = EXCLUDED.firstpaper, + primaryclaimtotal = EXCLUDED.primaryclaimtotal, + dateofjurydemand = EXCLUDED.dateofjurydemand; + +INSERT INTO oca_causes +SELECT * FROM oca_causes_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM 
promotion_active_staging_ids); + +INSERT INTO oca_addresses ( + indexnumberid, street1, street2, city, state, postalcode, status, + house_number, street_name, borough_code, place_name, sname, hnum, boro, + lat, bin, bbl, cd, ct, council, grc, grc2, msg, msg2, lon, zip_code +) +SELECT + indexnumberid, street1, street2, city, state, postalcode, status, + house_number, street_name, borough_code, place_name, sname, hnum, boro, + lat, bin, bbl, cd, ct, council, grc, grc2, msg, msg2, lon, zip_code +FROM oca_addresses_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +UPDATE oca_addresses AS o +SET geom = ST_SetSRID(ST_Point(o.lon, o.lat), 4326) +WHERE o.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) + AND o.lat IS NOT NULL + AND o.lon IS NOT NULL; + +INSERT INTO oca_parties +SELECT * FROM oca_parties_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_events +SELECT * FROM oca_events_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_appearances +SELECT * FROM oca_appearances_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_appearance_outcomes +SELECT * FROM oca_appearance_outcomes_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_motions +SELECT * FROM oca_motions_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_decisions +SELECT * FROM oca_decisions_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_judgments +SELECT * FROM oca_judgments_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_warrants +SELECT * FROM oca_warrants_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM 
promotion_active_staging_ids); + +-- Metadata merge (no nested transaction; must run before staging drops). +CREATE TABLE oca_metadata_temp AS +SELECT + COALESCE(om.indexnumberid, oms.indexnumberid) AS indexnumberid, + COALESCE(om.initialdate, oms.initialdate) AS initialdate, + COALESCE(oms.updatedate, om.updatedate) AS updatedate, + COALESCE(oms.deletedate, om.deletedate) AS deletedate +FROM oca_metadata om +FULL OUTER JOIN oca_metadata_staging oms ON om.indexnumberid = oms.indexnumberid; + +DROP TABLE oca_metadata; +ALTER TABLE oca_metadata_temp RENAME TO oca_metadata; + +-- Apply tombstones merged this batch (delete-only incr may add new deletedate rows). +DELETE FROM oca_index +WHERE indexnumberid IN ( + SELECT indexnumberid FROM oca_metadata WHERE deletedate IS NOT NULL +); + +DROP TABLE IF EXISTS oca_index_staging CASCADE; +DROP TABLE IF EXISTS oca_causes_staging CASCADE; +DROP TABLE IF EXISTS oca_addresses_staging CASCADE; +DROP TABLE IF EXISTS oca_parties_staging CASCADE; +DROP TABLE IF EXISTS oca_events_staging CASCADE; +DROP TABLE IF EXISTS oca_appearances_staging CASCADE; +DROP TABLE IF EXISTS oca_appearance_outcomes_staging CASCADE; +DROP TABLE IF EXISTS oca_motions_staging CASCADE; +DROP TABLE IF EXISTS oca_decisions_staging CASCADE; +DROP TABLE IF EXISTS oca_judgments_staging CASCADE; +DROP TABLE IF EXISTS oca_warrants_staging CASCADE; +DROP TABLE IF EXISTS oca_metadata_staging CASCADE; + +SET session_replication_role = default; diff --git a/lib/sql/purge_tombstoned_cases.sql b/lib/sql/purge_tombstoned_cases.sql new file mode 100644 index 0000000..1b74515 --- /dev/null +++ b/lib/sql/purge_tombstoned_cases.sql @@ -0,0 +1,9 @@ +-- Remove production case data for tombstoned indexnumberids (oca_metadata.deletedate). +-- Child rows cascade from oca_index; oca_metadata rows are preserved. 
+ +DELETE FROM oca_index +WHERE indexnumberid IN ( + SELECT m.indexnumberid + FROM oca_metadata m + WHERE m.deletedate IS NOT NULL +); diff --git a/lib/sql/select_addresses_needing_geocode.sql b/lib/sql/select_addresses_needing_geocode.sql new file mode 100644 index 0000000..67cc97b --- /dev/null +++ b/lib/sql/select_addresses_needing_geocode.sql @@ -0,0 +1,30 @@ +-- Rows in oca_addresses that still need geocoding (lat IS NULL; no house-number filter is applied in this query). +SELECT + indexnumberid, + street1, + street2, + city, + state, + postalcode, + status, + house_number, + street_name, + borough_code, + place_name, + sname, + hnum, + boro, + lat, + bin, + bbl, + cd, + ct, + council, + grc, + grc2, + msg, + msg2, + lon, + zip_code +FROM oca_addresses +WHERE lat IS NULL; diff --git a/lib/sql/update_appearance_outcomes.sql b/lib/sql/update_appearance_outcomes.sql index fcce4f0..73d2e66 100644 --- a/lib/sql/update_appearance_outcomes.sql +++ b/lib/sql/update_appearance_outcomes.sql @@ -1,3 +1,33 @@ +-- update appearanceid so that the serial resumes from the latest number in the main table +DO $$ +DECLARE + max_id bigint; + staging_count bigint; +BEGIN + -- Get max ID from main table + SELECT COALESCE(MAX(appearanceid), 0) INTO max_id FROM oca_appearances; + + -- Get count of staging records + SELECT COUNT(*) INTO staging_count FROM oca_appearances_staging; + + -- Update NULL appearanceid values with sequential numbers starting from max_id + 1 + WITH numbered_rows AS ( + SELECT ctid, ROW_NUMBER() OVER (ORDER BY ctid) as rn + FROM oca_appearances_staging + WHERE appearanceid IS NULL + ) + UPDATE oca_appearances_staging + SET appearanceid = max_id + nr.rn + FROM numbered_rows nr + WHERE oca_appearances_staging.ctid = nr.ctid; + + -- Set sequence for future inserts + PERFORM setval( + pg_get_serial_sequence('oca_appearances_staging', 'appearanceid'), + max_id + staging_count + ); +END $$; + -- In the "appearances" nodes they have further nested info about the outcomes -- of those
appearances. There are no unique identifers to be able to link -- these elements in the original data, so we parse the outcomes as a json diff --git a/lib/sql/update_metadata.sql b/lib/sql/update_metadata.sql deleted file mode 100644 index 6acdcb3..0000000 --- a/lib/sql/update_metadata.sql +++ /dev/null @@ -1,23 +0,0 @@ -BEGIN TRANSACTION; - --- Create temporary table with new data -CREATE TABLE oca_metadata_temp AS -SELECT - COALESCE(om.indexnumberid, oms.indexnumberid) AS indexnumberid, - COALESCE(om.initialdate, oms.initialdate) AS initialdate, - CASE - WHEN om.indexnumberid IS NULL THEN oms.updatedate - ELSE oms.updatedate - END AS updatedate, - CASE - WHEN om.indexnumberid IS NULL THEN oms.deletedate - ELSE oms.deletedate - END AS deletedate -FROM oca_metadata om -FULL OUTER JOIN oca_metadata_staging oms ON om.indexnumberid = oms.indexnumberid; - --- Replace the original table -DROP TABLE oca_metadata; -ALTER TABLE oca_metadata_temp RENAME TO oca_metadata; - -COMMIT; \ No newline at end of file diff --git a/lib/sql/upsert_geocoded_addresses.sql b/lib/sql/upsert_geocoded_addresses.sql new file mode 100644 index 0000000..63bf010 --- /dev/null +++ b/lib/sql/upsert_geocoded_addresses.sql @@ -0,0 +1,35 @@ +UPDATE oca_addresses AS o +SET + street1 = s.street1, + street2 = s.street2, + city = s.city, + state = s.state, + postalcode = s.postalcode, + status = s.status, + house_number = s.house_number, + street_name = s.street_name, + borough_code = s.borough_code, + place_name = s.place_name, + sname = s.sname, + hnum = s.hnum, + boro = s.boro, + lat = s.lat, + bin = s.bin, + bbl = s.bbl, + cd = s.cd, + ct = s.ct, + council = s.council, + grc = s.grc, + grc2 = s.grc2, + msg = s.msg, + msg2 = s.msg2, + lon = s.lon, + zip_code = s.zip_code, + geom = ST_SetSRID(ST_Point(s.lon, s.lat), 4326) +FROM oca_addresses_geocode_staging AS s +WHERE o.indexnumberid IS NOT DISTINCT FROM s.indexnumberid + AND o.street1 IS NOT DISTINCT FROM s.street1 + AND o.street2 IS NOT DISTINCT 
FROM s.street2 + AND o.city IS NOT DISTINCT FROM s.city + AND o.state IS NOT DISTINCT FROM s.state + AND o.postalcode IS NOT DISTINCT FROM s.postalcode; diff --git a/lib/staging_csv_export.py b/lib/staging_csv_export.py new file mode 100644 index 0000000..bc1fe68 --- /dev/null +++ b/lib/staging_csv_export.py @@ -0,0 +1,136 @@ +"""DuckDB COPY export shaping for RDS-compatible staging CSVs.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +# Match Task 1 raw DuckDB COPY + Python preprocess CSV shape (unquoted empty fields). +DUCKDB_CSV_COPY_OPTIONS = "HEADER, DELIMITER ','" + +_EMPTY_INT_MARKERS_SQL = "('', 'nan', 'NaN', 'None', '')" + + +@dataclass(frozen=True) +class StagingExportSpec: + """Per-table transforms applied during DuckDB export (replaces CSV preprocess).""" + + drop_columns: frozenset[str] = field(default_factory=frozenset) + array_columns: frozenset[str] = field(default_factory=frozenset) + int_columns: frozenset[str] = field(default_factory=frozenset) + + +def postgres_array_brackets_sql(column_expr: str) -> str: + """ + SQL expression matching ``replace_postgres_array_brackets`` in etl_csv.py. + + Converts DuckDB list literals ``[a,b]`` to PostgreSQL ``{a,b}`` while leaving + JSON object arrays (inner ``{...}``) unchanged. + """ + text = f"trim(cast({column_expr} AS VARCHAR))" + inner = f"trim(substr({text}, 2, length({text}) - 2))" + return ( + f"CASE WHEN {column_expr} IS NULL THEN NULL " + f"WHEN NOT (starts_with({text}, '[') AND ends_with({text}, ']')) " + f"THEN cast({column_expr} AS VARCHAR) " + f"WHEN starts_with({inner}, '{{') AND ends_with({inner}, '}}') " + f"THEN cast({column_expr} AS VARCHAR) " + f"ELSE '{{' || substr({text}, 2, length({text}) - 2) || '}}' END" + ) + + +def nullable_int_csv_sql(column_expr: str) -> str: + """ + SQL expression matching nullable integer CSV normalization in etl_csv.py. 
+ + Python writes an empty CSV field (not ``""``); DuckDB COPY does the same when + the cell is NULL rather than an empty string literal. + """ + as_text = f"cast({column_expr} AS VARCHAR)" + return ( + f"CASE WHEN {column_expr} IS NULL THEN NULL " + f"WHEN {as_text} IN {_EMPTY_INT_MARKERS_SQL} THEN NULL " + f"ELSE {as_text} END" + ) + + +def _export_column_sql(column_name: str, column_type: str, spec: StagingExportSpec) -> str | None: + if column_name in spec.drop_columns: + return None + if column_name in spec.array_columns: + return f"{postgres_array_brackets_sql(column_name)} AS {column_name}" + if column_name in spec.int_columns: + return f"{nullable_int_csv_sql(column_name)} AS {column_name}" + if column_type.upper() in ('JSON',): + return f"cast({column_name} AS VARCHAR) AS {column_name}" + return column_name + + +STAGING_TABLE_EXPORT_SPECS: dict[str, StagingExportSpec] = { + 'oca_index_staging': StagingExportSpec( + array_columns=frozenset({'specialtydesignationtypes'}), + ), + 'oca_events_staging': StagingExportSpec( + array_columns=frozenset({'filingpartiesroles'}), + ), + 'oca_motions_staging': StagingExportSpec( + array_columns=frozenset({'filingpartiesroles'}), + ), + 'oca_judgments_staging': StagingExportSpec( + array_columns=frozenset({'creditorsroles', 'debtorsroles'}), + int_columns=frozenset({'amendedfromjudgmentsequence'}), + ), + 'oca_warrants_staging': StagingExportSpec( + array_columns=frozenset({ + 'propertiesonwarrantcities', + 'propertiesonwarrantstates', + 'propertiesonwarrantpostalcodes', + }), + int_columns=frozenset({'executionstayeddays', 'issuancestayeddays'}), + ), + 'oca_appearances_staging': StagingExportSpec( + drop_columns=frozenset({'appearanceid'}), + int_columns=frozenset({'motionsequence'}), + ), +} + +# Staging tables with no export-time transforms (no second-pass CSV rewrite). 
+STAGING_TABLES_PASSTHROUGH_EXPORT = frozenset({ + 'oca_addresses_staging', + 'oca_causes_staging', + 'oca_decisions_staging', + 'oca_metadata_staging', + 'oca_parties_staging', + 'oca_appearance_outcomes_staging', +}) + + +def staging_csv_needs_preprocess(filename: str) -> bool: + """Return True when a staging CSV still requires the Python preprocess pass.""" + if not filename.endswith('.csv'): + return False + table_name = filename[:-4] + if table_name in STAGING_TABLE_EXPORT_SPECS: + return False + if table_name in STAGING_TABLES_PASSTHROUGH_EXPORT: + return False + return True + + +def build_staging_copy_sql(table_name: str, csv_path: str, columns: list[tuple[str, str]]) -> str: + """ + Build COPY SQL for a staging table. + + ``columns`` is a list of (name, type) from DESCRIBE. + """ + spec = STAGING_TABLE_EXPORT_SPECS.get(table_name) + options = DUCKDB_CSV_COPY_OPTIONS + if spec is None: + return f"COPY {table_name} TO '{csv_path}' ({options})" + + select_cols = [] + for name, col_type in columns: + expr = _export_column_sql(name, col_type, spec) + if expr is not None: + select_cols.append(expr) + select_sql = ', '.join(select_cols) + return f"COPY (SELECT {select_sql} FROM {table_name}) TO '{csv_path}' ({options})" diff --git a/oca_deletion_backfill.py b/oca_deletion_backfill.py new file mode 100644 index 0000000..0bd8af4 --- /dev/null +++ b/oca_deletion_backfill.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import argparse +import os + +import dotenv + +from lib.database import Database +from lib.etl_run_manifest import EtlRunManifest +from lib.etl_stages import purge_tombstoned_cases + +dotenv.load_dotenv() + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + 'Remove oca_index rows (and child tables via CASCADE) for cases ' + 'with oca_metadata.deletedate set' + ), + ) + parser.add_argument( + '--db-schema', + default=os.environ.get('DB_SCHEMA', ''), + help='Database schema search_path target', + ) + return parser.parse_args() + + 
+def run_deletion_backfill(db_args, runtime_args=None): + """Purge production case data for metadata tombstones.""" + runtime_args = runtime_args or {} + db_schema = runtime_args.get('db_schema') or db_args.get('schema') or 'public' + + db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=db_schema, + s3_prefix='', + mode='deletion_backfill', + reprocess_glob='', + force_reprocess=False, + ) + manifest.setup_tables() + manifest.create_run() + + try: + db.ensure_connection() + orphan_before, orphan_after = purge_tombstoned_cases(manifest, db) + manifest.mark_run_completed(0, 0, 0) + print( + f'Deletion backfill complete; ' + f'orphans before={orphan_before}, after={orphan_after}' + ) + return orphan_before, orphan_after + except Exception as exc: + manifest.mark_run_failed(exc) + raise + + +def main(): + args = parse_args() + db_args = { + 'db_url': os.environ.get('DATABASE_URL', ''), + 'schema': args.db_schema, + } + runtime_args = {'db_schema': args.db_schema} + run_deletion_backfill(db_args, runtime_args) + + +if __name__ == '__main__': + main() diff --git a/oca_geocode_backfill.py b/oca_geocode_backfill.py new file mode 100644 index 0000000..c01ed0b --- /dev/null +++ b/oca_geocode_backfill.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +import argparse +import multiprocessing +import os + +import dotenv + +from lib.database import Database +from lib.etl_helpers import make_dir +from lib.etl_run_manifest import EtlRunManifest +from lib.etl_stages import geocode_addresses + +dotenv.load_dotenv() + + +def parse_optional_int(raw_value): + if raw_value in (None, ''): + return None + return int(raw_value) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Geocode oca_addresses rows in RDS where lat IS NULL', + ) + parser.add_argument( + '--db-schema', + default=os.environ.get('DB_SCHEMA', ''), + help='Database schema search_path target', + ) + parser.add_argument( + '--geocode-workers', + type=int, + 
default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), + help='Worker process count for geocode pool', + ) + parser.add_argument( + '--census-batch-chunk-size', + type=int, + default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), + help='Chunk size for census batch geocoder input', + ) + return parser.parse_args() + + +def run_geocode_backfill(db_args, runtime_args=None): + """Fetch ungeocoded addresses from RDS, geocode, and upsert (+ geom).""" + runtime_args = runtime_args or {} + geocode_workers = runtime_args.get('geocode_workers') or multiprocessing.cpu_count() + census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 + db_schema = runtime_args.get('db_schema') or db_args.get('schema') or 'public' + + db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=db_schema, + s3_prefix='', + mode='geocode_backfill', + reprocess_glob='', + force_reprocess=False, + ) + manifest.setup_tables() + manifest.create_run() + + pub_dir = make_dir('data-public') + try: + db.ensure_connection() + candidate_count = geocode_addresses( + manifest, + db, + pub_dir, + geocode_workers, + census_batch_chunk_size, + ) + manifest.mark_run_completed(0, 0, 0) + print(f'Backfill complete; {candidate_count} candidate addresses processed') + return candidate_count + except Exception as exc: + manifest.mark_run_failed(exc) + raise + + +def main(): + args = parse_args() + db_args = { + 'db_url': os.environ.get('DATABASE_URL', ''), + 'schema': args.db_schema, + } + runtime_args = { + 'db_schema': args.db_schema, + 'geocode_workers': args.geocode_workers, + 'census_batch_chunk_size': args.census_batch_chunk_size, + } + run_geocode_backfill(db_args, runtime_args) + + +if __name__ == '__main__': + main() diff --git a/oca_update.py b/oca_update.py index f2c403b..254107a 100644 --- a/oca_update.py +++ b/oca_update.py @@ -1,17 +1,49 @@ #!/usr/bin/env python -import dotenv +import argparse +import logging import os from pathlib import Path 
def parse_bool(raw_value):
    """
    Interpret an environment-style flag value as a boolean.

    ``None`` is False; otherwise the value is stringified, trimmed and
    lower-cased, and is True only for '1', 'true', 'yes', 'y' or 'on'.
    """
    if raw_value is None:
        return False
    return str(raw_value).strip().lower() in ('1', 'true', 'yes', 'y', 'on')


def parse_optional_int(raw_value):
    """
    Convert *raw_value* to int, treating ``None`` and blank strings as unset.

    Returns ``None`` when unset. Raises ``ValueError`` for non-numeric text so
    a mistyped setting is reported instead of silently ignored.
    """
    if raw_value is None:
        return None
    if isinstance(raw_value, str):
        raw_value = raw_value.strip()
        if not raw_value:
            return None
    return int(raw_value)


def _env_int(name, default):
    """Read integer env var *name*; fall back to *default* when unset or blank."""
    value = parse_optional_int(os.environ.get(name))
    return default if value is None else value


def parse_args(argv=None):
    """
    Parse command-line options for the OCA ETL run.

    Every option defaults to its environment variable. Boolean options use
    ``argparse.BooleanOptionalAction`` so that each ``--flag`` also has a
    ``--no-flag`` form; without it a flag whose environment default is true
    (notably ``--parse-write-batch-enabled``) could not be disabled from the
    command line. Integer options fall back to their built-in default when the
    environment variable is present but blank.

    ``argv`` is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    env = os.environ.get
    parser = argparse.ArgumentParser(description='Run OCA ETL pipeline')
    parser.add_argument(
        '--db-schema',
        default=env('DB_SCHEMA', ''),
        help='Database schema search_path target',
    )
    parser.add_argument(
        '--s3-prefix',
        default=env('S3_PREFIX', ''),
        help='Optional S3 prefix namespace for private/public files',
    )
    parser.add_argument(
        '--reprocess-glob',
        default=env('REPROCESS_GLOB', ''),
        help='Filename glob for S3 private zip reprocessing',
    )
    parser.add_argument(
        '--force-reprocess',
        action=argparse.BooleanOptionalAction,
        default=parse_bool(env('FORCE_REPROCESS')),
        help='Reprocess matched files even if already in S3 private backup',
    )
    parser.add_argument(
        '--skip-public-publish',
        action=argparse.BooleanOptionalAction,
        default=parse_bool(env('SKIP_PUBLIC_PUBLISH')),
        help='Skip post-promote RDS public CSV export and S3 encryption normalization',
    )
    parser.add_argument(
        '--parse-fail-fast',
        action=argparse.BooleanOptionalAction,
        default=parse_bool(env('PARSE_FAIL_FAST')),
        help='Abort run before export/promote when any case-level parse failures occur',
    )
    parser.add_argument(
        '--geocode-workers',
        type=int,
        default=parse_optional_int(env('GEOCODE_WORKERS')),
        help='Worker process count for geocode pool',
    )
    parser.add_argument(
        '--census-batch-chunk-size',
        type=int,
        default=_env_int('CENSUS_BATCH_CHUNK_SIZE', 2500),
        help='Chunk size for census batch geocoder input',
    )
    parser.add_argument(
        '--csv-row-check-chunk-size',
        type=int,
        default=_env_int('CSV_ROW_CHECK_CHUNK_SIZE', 1000),
        help='Chunk size used for constant-memory CSV non-empty checks',
    )
    parser.add_argument(
        '--parse-write-batch-enabled',
        action=argparse.BooleanOptionalAction,
        # Batching is on unless the environment explicitly turns it off.
        default=parse_bool(env('PARSE_WRITE_BATCH_ENABLED', '1')),
        help='Buffer parser DuckDB writes and flush in transaction windows',
    )
    parser.add_argument(
        '--parse-write-batch-size',
        type=int,
        default=_env_int('PARSE_WRITE_BATCH_SIZE', 128),
        help='Max buffered INSERT statements before flush',
    )
    parser.add_argument(
        '--parse-write-flush-every-n-cases',
        type=int,
        default=_env_int('PARSE_WRITE_FLUSH_EVERY_N_CASES', 16),
        help='Flush buffered writes after this many cases per worker',
    )
    return parser.parse_args(argv)
"notebook>=7.4.4", "pandas>=2.3.0", @@ -17,8 +16,8 @@ dependencies = [ "python-dotenv>=1.1.0", "python-geosupport>=1.1.0", "requests>=2.32.4", - "requests-toolbelt==0.10.1", + "requests-toolbelt>=1.0.0", "sqlalchemy>=2.0.41", - "urllib3==1.26.15", + "urllib3>=2.6.0", "usaddress>=0.5.14", ] diff --git a/requirements.txt b/requirements.txt index 0f00eaa..d532571 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ lxml -frogress boto3 python-dotenv urllib3==1.26.15 diff --git a/tests/csv_checksums.py b/tests/csv_checksums.py new file mode 100644 index 0000000..debbc80 --- /dev/null +++ b/tests/csv_checksums.py @@ -0,0 +1,21 @@ +"""CSV checksum helpers for parse/export regression tests.""" + +from __future__ import annotations + +import hashlib +import os + + +def md5_dir_csvs(pub_dir: str) -> dict[str, str]: + """MD5 hex digest per CSV in pub_dir (stable parity fingerprint).""" + digests = {} + for name in sorted(os.listdir(pub_dir)): + if not name.endswith('.csv'): + continue + path = os.path.join(pub_dir, name) + h = hashlib.md5() + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1 << 20), b''): + h.update(chunk) + digests[name] = h.hexdigest() + return digests diff --git a/tests/parse_pipeline_helpers.py b/tests/parse_pipeline_helpers.py new file mode 100644 index 0000000..1a463ec --- /dev/null +++ b/tests/parse_pipeline_helpers.py @@ -0,0 +1,50 @@ +"""Test helpers for parse -> DuckDB -> export without the evaluation harness.""" + +from __future__ import annotations + +import os + +from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.etl_stages import export_staging_to_csv, parse_xml_to_staging + +from csv_checksums import md5_dir_csvs + + +class _NoopManifest: + def upsert_step(self, *args, **kwargs): + pass + + def upsert_file(self, *args, **kwargs): + pass + + +def run_parse_export_in_dir( + priv_dir: str, + *, + parse_num_threads: int = 1, +) -> tuple[dict[str, int], dict[str, str]]: + """ + Run parse -> 
export -> preprocess on zips in priv_dir (no upload). + + Returns (staging_row_counts, csv_checksums). + """ + staging_path = os.path.join(priv_dir, 'staging.duckdb') + pub_dir = os.path.join(priv_dir, 'public') + os.makedirs(pub_dir, exist_ok=True) + + if os.path.exists(staging_path): + os.remove(staging_path) + + staging_db = DuckDB(staging_path) + try: + parse_xml_to_staging( + _NoopManifest(), + staging_db, + priv_dir, + parse_num_threads=parse_num_threads, + ) + row_counts = fetch_staging_row_counts(staging_db) + export_staging_to_csv(staging_db, pub_dir, upload=False) + return row_counts, md5_dir_csvs(pub_dir) + finally: + staging_db.close() diff --git a/tests/parser_xml_fixtures.py b/tests/parser_xml_fixtures.py new file mode 100644 index 0000000..686011d --- /dev/null +++ b/tests/parser_xml_fixtures.py @@ -0,0 +1,157 @@ +"""Synthetic OCA XML fixtures for parser and export tests.""" + +from __future__ import annotations + +import zipfile +from typing import Literal + +from lib.etl_constants import DATA_FILENAME + +NS = 'http://www.example.org/LandlordTenantExtractSchema' + + +def _tag(local: str) -> str: + return f'{{{NS}}}{local}' + + +def _el(parent, local: str, text: str | None = None): + elem = parent.makeelement(_tag(local)) + if text is not None: + elem.text = text + parent.append(elem) + return elem + + +def build_case_xml( + index_id: str, + *, + with_delete: bool = False, + num_parties: int = 2, + num_events: int = 2, + num_appearances: int = 1, + num_judgments: int = 1, + num_warrants_per_judgment: int = 1, +) -> str: + """Return one Index element as an XML string.""" + from lxml import etree + + case = etree.Element(_tag('Index')) + _el(case, 'IndexNumberId', index_id) + _el(case, 'Court', 'Housing Part') + _el(case, 'FiledDate', '2024-01-15') + _el(case, 'PropertyType', 'Residential') + _el(case, 'Classification', 'Nonpayment') + _el(case, 'Status', 'Active') + _el(case, 'FirstPaper', 'Petition by Attorney') + + causes = etree.SubElement(case, 
_tag('PrimaryClaimCauseOfActions')) + cause = etree.SubElement(causes, _tag('PrimaryClaimCauseOfAction')) + _el(cause, 'CauseOfActionType', 'Rent Arrears') + _el(cause, 'Amount', '5000.00') + + addresses = etree.SubElement(case, _tag('PropertyAddresses')) + address = etree.SubElement(addresses, _tag('PropertyAddress')) + _el(address, 'Street1', '123 Main St') + _el(address, 'City', 'New York') + _el(address, 'State', 'NY') + _el(address, 'PostalCode', '10001') + + parties_parent = etree.SubElement(case, _tag('Parties')) + for i in range(num_parties): + party = etree.SubElement(parties_parent, _tag('Party')) + _el(party, 'Role', 'Petitioner' if i == 0 else 'Respondent') + _el(party, 'PartyType', 'Individual') + + events_parent = etree.SubElement(case, _tag('Events')) + for i in range(num_events): + event = etree.SubElement(events_parent, _tag('Event')) + _el(event, 'EventName', f'Event {i}') + _el(event, 'FiledDate', '2024-02-01') + + appearances_parent = etree.SubElement(case, _tag('Appearances')) + for i in range(num_appearances): + appearance = etree.SubElement(appearances_parent, _tag('Appearance')) + _el(appearance, 'AppearanceDateTime', '2024-02-15T10:00:00') + _el(appearance, 'AppearancePurpose', 'Conference') + outcomes = etree.SubElement(appearance, _tag('AppearanceOutcomes')) + outcome = etree.SubElement(outcomes, _tag('AppearanceOutcome')) + _el(outcome, 'AppearanceOutcomeType', 'Adjourned') + + motions_parent = etree.SubElement(case, _tag('Motions')) + motion = etree.SubElement(motions_parent, _tag('Motion')) + _el(motion, 'Sequence', '1') + _el(motion, 'MotionType', 'Default') + + decisions_parent = etree.SubElement(case, _tag('Decisions')) + decision = etree.SubElement(decisions_parent, _tag('Decision')) + _el(decision, 'Sequence', '1') + _el(decision, 'ResultOf', 'Motion') + + judgments_parent = etree.SubElement(case, _tag('Judgments')) + for j in range(num_judgments): + judgment = etree.SubElement(judgments_parent, _tag('Judgment')) + seq = str(j + 
1) + _el(judgment, 'Sequence', seq) + _el(judgment, 'JudgmentType', 'Money') + _el(judgment, 'FiledDate', '2024-03-01') + warrants_parent = etree.SubElement(judgment, _tag('Warrants')) + for w in range(num_warrants_per_judgment): + warrant = etree.SubElement(warrants_parent, _tag('Warrant')) + _el(warrant, 'Sequence', str(w + 1)) + _el(warrant, 'CreatedReason', 'Nonpayment') + + if with_delete: + etree.SubElement(case, _tag('Delete')) + + return etree.tostring(case, encoding='unicode') + + +def build_extract_xml( + case_count: int, + *, + extract_date: str = '2024-03-08', + delete_every: int | None = None, + child_profile: Literal['weekly', 'heavy'] = 'weekly', +) -> bytes: + """Build a full LandlordTenantExtract XML document.""" + if child_profile == 'weekly': + parties, events, appearances = 2, 2, 1 + judgments, warrants = 1, 1 + else: + parties, events, appearances = 5, 5, 3 + judgments, warrants = 2, 2 + + parts = [ + '', + f'', + f'{extract_date}', + ] + for i in range(case_count): + index_id = f'LT-BENCH-{i:06d}' + with_delete = delete_every is not None and delete_every > 0 and i % delete_every == 0 + parts.append( + build_case_xml( + index_id, + with_delete=with_delete, + num_parties=parties, + num_events=events, + num_appearances=appearances, + num_judgments=judgments, + num_warrants_per_judgment=warrants, + ) + ) + parts.append('') + return ''.join(parts).encode('utf-8') + + +def write_test_zip( + zip_path: str, + case_count: int, + *, + child_profile: Literal['weekly', 'heavy'] = 'weekly', +) -> str: + """Write a zip containing LandlordTenantExtract.xml; return zip path.""" + xml_bytes = build_extract_xml(case_count, child_profile=child_profile) + with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + return zip_path diff --git a/tests/test_create_date_files.py b/tests/test_create_date_files.py new file mode 100644 index 0000000..ee27b78 --- /dev/null +++ b/tests/test_create_date_files.py @@ 
-0,0 +1,31 @@ +import os +import tempfile +import unittest +from unittest import mock + +from lib.etl_helpers import create_date_files + + +class CreateDateFilesTests(unittest.TestCase): + def test_writes_txt_and_svg_without_network(self): + with tempfile.TemporaryDirectory() as local_dir: + with mock.patch('lib.etl_helpers.requests.get') as get_mock: + create_date_files('LandlordTenant.Incr.2024-03-08.zip', local_dir) + get_mock.assert_not_called() + + txt_path = os.path.join(local_dir, 'last-updated-date.txt') + svg_path = os.path.join(local_dir, 'last-updated-shield.svg') + self.assertTrue(os.path.isfile(txt_path)) + self.assertTrue(os.path.isfile(svg_path)) + + with open(txt_path, encoding='utf-8') as handle: + self.assertEqual(handle.read(), '2024-03-08') + + with open(svg_path, encoding='utf-8') as handle: + svg = handle.read() + self.assertIn('Last Updated: 2024-03-08', svg) + self.assertIn(' DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(db) + return db + + +def _parse_zip_bytes(xml_bytes: bytes, db: DuckDB, extract_date: str = '2024-03-08') -> ParseFileResult: + buf = io.BytesIO() + with zipfile.ZipFile(buf, 'w') as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + buf.seek(0) + with zipfile.ZipFile(buf, 'r') as zf: + with zf.open(DATA_FILENAME) as xml_file: + return parse_file(xml_file, db, extract_date, num_threads=1) + + +class ParseFileResultTests(unittest.TestCase): + def test_cases_seen_matches_index_count(self): + xml_bytes = build_extract_xml(12, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + self.assertEqual(result.cases_seen, 12) + self.assertEqual(result.cases_parsed_ok, 12) + self.assertEqual(result.cases_failed, 0) + self.assertEqual(result.error_samples, []) + + def 
test_injected_failures_increment_counters_and_samples(self): + xml_bytes = build_extract_xml(5, child_profile='weekly') + fail_ids = {2, 4} + seen = {'n': 0} + + def parse_case_maybe_fail(case, db, extract_date): + seen['n'] += 1 + if seen['n'] in fail_ids: + raise RuntimeError(f'injected failure case {seen["n"]}') + parse_case(case, db, extract_date) + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + self.assertEqual(result.cases_seen, 5) + self.assertEqual(result.cases_parsed_ok, 3) + self.assertEqual(result.cases_failed, 2) + self.assertEqual(len(result.error_samples), 2) + self.assertTrue(all('injected failure' in s for s in result.error_samples)) + + def test_error_samples_capped_at_ten(self): + xml_bytes = build_extract_xml(15, child_profile='weekly') + seen = {'n': 0} + + def parse_case_always_fail(case, db, extract_date): + seen['n'] += 1 + raise RuntimeError('always fail') + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_always_fail): + result = _parse_zip_bytes(xml_bytes, db) + finally: + db.close() + + self.assertEqual(result.cases_failed, 15) + self.assertEqual(len(result.error_samples), MAX_PARSE_ERROR_SAMPLES) + + def test_error_sample_truncation(self): + long_msg = 'x' * (MAX_PARSE_ERROR_SAMPLE_LEN + 50) + stats = ParseFileResult() + stats.record_failed(long_msg) + self.assertEqual(len(stats.error_samples[0]), MAX_PARSE_ERROR_SAMPLE_LEN) + self.assertTrue(stats.error_samples[0].endswith('...')) + + def test_parse_logs_indexnumberid_on_failure(self): + xml_bytes = build_extract_xml(1, child_profile='weekly') + case_id = 'LT-BENCH-000000' + + def parse_case_fail(case, db, extract_date): + raise RuntimeError('log test 
failure') + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_fail): + with self.assertLogs('lib.parsers', level='WARNING') as logs: + _parse_zip_bytes(xml_bytes, db) + finally: + db.close() + + combined = '\n'.join(logs.output) + self.assertIn('indexnumberid=', combined) + self.assertIn(case_id, combined) + + +class ParseManifestUpsertTests(unittest.TestCase): + def test_upsert_parsed_etl_file_and_step_aggregate(self): + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + manifest = FakeManifest() + fail_on = {3} + + def parse_case_maybe_fail(case, db, extract_date): + index_elem = case.find( + '{http://www.example.org/LandlordTenantExtractSchema}IndexNumberId' + ) + case_num = int(index_elem.text.rsplit('-', 1)[-1]) if index_elem is not None else 0 + if case_num in fail_on: + raise RuntimeError('manifest test failure') + parse_case(case, db, extract_date) + + xml_bytes = build_extract_xml(6, child_profile='weekly') + try: + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + upsert_parsed_etl_file( + manifest, + 'LandlordTenant.Incr.2024-03-08.zip', + result, + '2024-03-08', + ) + manifest.upsert_step( + 'parse_xml', + 'completed', + details=build_parse_xml_step_details(result.cases_failed, 1), + ) + + self.assertEqual(len(manifest.file_upserts), 1) + upsert = manifest.file_upserts[0] + details = upsert['details'] + self.assertEqual(upsert['status'], 'parsed') + self.assertEqual(details['cases_seen'], 6) + self.assertEqual(details['cases_parsed_ok'], 5) + self.assertEqual(details['cases_failed'], 1) + self.assertEqual(len(details['error_samples']), 1) + self.assertEqual(details['extract_date'], '2024-03-08') + self.assertIn('1 of 6 cases failed', upsert['error']) + + step_details = 
manifest.step_upserts[0]['details'] + self.assertEqual(step_details['total_cases_failed'], 1) + self.assertEqual(step_details['files_with_failures'], 1) + + +class PromoteCompletedGateTests(unittest.TestCase): + def test_upsert_promoted_marks_completed_only_when_no_failures(self): + manifest = FakeManifest() + clean_details = { + 'extract_date': '2024-03-08', + 'cases_seen': 10, + 'cases_parsed_ok': 10, + 'cases_failed': 0, + 'error_samples': [], + } + dirty_details = dict(clean_details) + dirty_details['cases_failed'] = 3 + dirty_details['cases_parsed_ok'] = 7 + dirty_details['error_samples'] = ['err'] + + self.assertTrue(upsert_promoted_etl_file(manifest, 'clean.zip', 'sftp', clean_details)) + self.assertFalse(upsert_promoted_etl_file(manifest, 'dirty.zip', 's3_private', dirty_details)) + + clean_upsert = manifest.file_upserts[-2] + dirty_upsert = manifest.file_upserts[-1] + self.assertEqual(clean_upsert['status'], 'completed') + self.assertEqual(clean_upsert['stage'], 'promote') + self.assertNotIn('parse_complete', clean_upsert['details']) + + self.assertEqual(dirty_upsert['status'], 'parsed') + self.assertEqual(dirty_upsert['stage'], 'parse') + self.assertFalse(dirty_upsert['details']['parse_complete']) + self.assertEqual(dirty_upsert['details']['cases_failed'], 3) + + def test_cases_failed_from_details_coerces_missing(self): + self.assertEqual(cases_failed_from_details({}), 0) + self.assertEqual(cases_failed_from_details({'cases_failed': '2'}), 2) + + +class ParseFailFastTests(unittest.TestCase): + def test_finalize_parse_fail_fast_marks_step_and_files_failed(self): + manifest = FakeManifest() + manifest.file_details_by_name = { + 'bad.zip': { + 'cases_seen': 5, + 'cases_parsed_ok': 3, + 'cases_failed': 2, + 'error_samples': ['err'], + }, + 'good.zip': { + 'cases_seen': 1, + 'cases_parsed_ok': 1, + 'cases_failed': 0, + 'error_samples': [], + }, + } + + with self.assertRaises(ParseFailFastError): + finalize_parse_xml_step(manifest, 2, 1, parse_fail_fast=True) + 
+ self.assertEqual(manifest.step_upserts[-1]['status'], 'failed') + self.assertEqual(manifest.step_upserts[-1]['step_name'], 'parse_xml') + failed_names = {u['file_name'] for u in manifest.file_upserts if u['status'] == 'failed'} + self.assertEqual(failed_names, {'bad.zip'}) + + def test_lenient_finalize_completes_step_with_failures(self): + manifest = FakeManifest() + finalize_parse_xml_step(manifest, 3, 1, parse_fail_fast=False) + self.assertEqual(manifest.step_upserts[-1]['status'], 'completed') + self.assertEqual(manifest.step_upserts[-1]['details']['total_cases_failed'], 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_parser_batching.py b/tests/test_parser_batching.py new file mode 100644 index 0000000..daf1a71 --- /dev/null +++ b/tests/test_parser_batching.py @@ -0,0 +1,209 @@ +import io +import os +import tempfile +import unittest +import zipfile + +from lxml import etree + +from parser_xml_fixtures import build_case_xml, build_extract_xml, write_test_zip +from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.etl_constants import DATA_FILENAME +from lib.parse_write_buffer import ParseWriteConfig, attach_write_buffer, flush_write_buffer +from lib.parsers import parse_file, parse_case + + +def _init_staging_db(path: str) -> DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(db) + return db + + +def _case_from_xml(case_xml: str): + return etree.fromstring(case_xml.encode('utf-8')) + + +class ParserBatchingSemanticsTests(unittest.TestCase): + def test_repeated_case_update_replaces_child_rows(self): + case_id = 'LT-REPEAT-001' + first = build_case_xml(case_id, num_parties=2, num_events=1) + second = build_case_xml(case_id, num_parties=4, num_events=3) + + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + parse_case(_case_from_xml(first), db, '2024-03-01') + 
flush_write_buffer(db) + parse_case(_case_from_xml(second), db, '2024-03-02') + flush_write_buffer(db) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(counts['oca_index_staging'], 1) + self.assertEqual(counts['oca_parties_staging'], 4) + self.assertEqual(counts['oca_events_staging'], 3) + + def test_delete_short_circuit_keeps_metadata_only(self): + case_id = 'LT-DELETE-001' + live = build_case_xml(case_id, num_parties=2) + deleted = build_case_xml(case_id, with_delete=True, num_parties=2) + + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + parse_case(_case_from_xml(live), db, '2024-03-01') + flush_write_buffer(db) + parse_case(_case_from_xml(deleted), db, '2024-03-02') + flush_write_buffer(db) + meta = db.execute( + 'SELECT updatedate, deletedate FROM oca_metadata_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone() + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertIsNone(meta[0]) + self.assertEqual(str(meta[1]), '2024-03-02') + self.assertEqual(counts['oca_index_staging'], 1) + self.assertEqual(counts['oca_parties_staging'], 2) + + def test_batched_path_matches_legacy_row_counts(self): + xml_bytes = build_extract_xml(25, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + legacy_path = os.path.join(tmp, 'legacy.duckdb') + batched_path = os.path.join(tmp, 'batched.duckdb') + + legacy_db = DuckDB(legacy_path) + legacy_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + batched_db = DuckDB(batched_path) + batched_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(batched_db) + + xml_io = io.BytesIO(xml_bytes) + try: + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + parse_file(xml_io, legacy_db, '2024-03-08', num_threads=1) + xml_io.seek(0) + parse_file(xml_io, batched_db, '2024-03-08', num_threads=1) + flush_write_buffer(batched_db) + 
legacy_counts = fetch_staging_row_counts(legacy_db) + batched_counts = fetch_staging_row_counts(batched_db) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + legacy_db.close() + batched_db.close() + + self.assertEqual(legacy_counts, batched_counts) + + def test_parse_file_end_to_end_zip(self): + with tempfile.TemporaryDirectory() as tmp: + zip_path = os.path.join(tmp, 'incr.zip') + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr(DATA_FILENAME, build_extract_xml(10, child_profile='weekly')) + + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: # archive and member are both context-managed so no handle leaks + with zipfile.ZipFile(zip_path, 'r') as zf, zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + flush_write_buffer(db) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(counts['oca_index_staging'], 10) + self.assertGreater(counts['oca_parties_staging'], 10) + + +class ParseWriteBufferTests(unittest.TestCase): + def test_discard_case_drops_in_window_writes(self): + from lib.parse_write_buffer import StagingWriteBuffer + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'buf.duckdb')) + db.execute('CREATE TABLE t (id INTEGER, v VARCHAR)') + buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=100, flush_every_n_cases=10), + ) + buffer.begin_case() + buffer.queue_insert('INSERT INTO t VALUES (?, ?)', (1, 'orphan')) + buffer.discard_case() + buffer.flush() + count = db.execute('SELECT COUNT(*) FROM t').fetchone()[0] + db.close() + self.assertEqual(count, 0) + + def test_flush_order_deletes_before_inserts(self): + from lib.parse_write_buffer import StagingWriteBuffer + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'buf.duckdb')) + db.execute('CREATE TABLE t (id INTEGER, v VARCHAR)') + db.execute('INSERT INTO t VALUES (1, ?)', ('old',)) + buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=100, 
flush_every_n_cases=10), + ) + buffer.queue_delete('DELETE FROM t WHERE id = ?', (1,)) + buffer.queue_insert('INSERT INTO t VALUES (?, ?)', (1, 'new')) + buffer.flush() + row = db.execute('SELECT v FROM t WHERE id = 1').fetchone() + db.close() + self.assertEqual(row[0], 'new') + + def test_config_from_env(self): + os.environ['PARSE_WRITE_BATCH_SIZE'] = '128' + os.environ['PARSE_WRITE_FLUSH_EVERY_N_CASES'] = '8' + try: + cfg = ParseWriteConfig.from_env() + self.assertEqual(cfg.batch_size, 128) + self.assertEqual(cfg.flush_every_n_cases, 8) + finally: + os.environ.pop('PARSE_WRITE_BATCH_SIZE', None) + os.environ.pop('PARSE_WRITE_FLUSH_EVERY_N_CASES', None) + + +class ParserBatchingParityExportTests(unittest.TestCase): + def test_checksum_stable_with_batching_enabled(self): + from parse_pipeline_helpers import run_parse_export_in_dir + + with tempfile.TemporaryDirectory() as tmp: + priv = os.path.join(tmp, 'private') + os.makedirs(priv) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 20, + child_profile='weekly', + ) + + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '1' + try: + rows_on, checksums_on = run_parse_export_in_dir(priv, parse_num_threads=1) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + + priv2 = os.path.join(tmp, 'private2') + os.makedirs(priv2) + write_test_zip( + os.path.join(priv2, 'LandlordTenant.Incr.2024-03-08.zip'), + 20, + child_profile='weekly', + ) + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + try: + rows_off, checksums_off = run_parse_export_in_dir(priv2, parse_num_threads=1) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + + self.assertEqual(checksums_on, checksums_off) + self.assertEqual(rows_on, rows_off) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_parser_regression_safety.py b/tests/test_parser_regression_safety.py new file mode 100644 index 0000000..63d47fe --- /dev/null +++ b/tests/test_parser_regression_safety.py @@ -0,0 +1,281 @@ 
+"""Regression and failure-safety tests for Option A parser batching and export parity.""" + +from __future__ import annotations + +import io +import os +import tempfile +import unittest +import zipfile +from unittest.mock import patch + +from lib.duckdb_database import STAGING_TABLE_FAMILIES, DuckDB, fetch_staging_row_counts +from lib.etl_constants import DATA_FILENAME +from lib.etl_stages import export_staging_to_csv +from lib.parse_write_buffer import ParseWriteConfig, StagingWriteBuffer, attach_write_buffer, flush_write_buffer +from lib import parsers +from lib.parsers import parse_case, parse_file + +from csv_checksums import md5_dir_csvs +from parse_pipeline_helpers import run_parse_export_in_dir +from parser_xml_fixtures import build_case_xml, build_extract_xml, write_test_zip + + +def _init_staging_db(path: str) -> DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(db) + return db + + +def _staging_counts_for_index(db: DuckDB, index_id: str) -> dict[str, int]: + counts: dict[str, int] = {} + for table_name in STAGING_TABLE_FAMILIES: + try: + row = db.execute( + f'SELECT COUNT(*) FROM {table_name} WHERE indexnumberid = ?', + (index_id,), + ).fetchone() + counts[table_name] = int(row[0]) if row else 0 + except Exception: + counts[table_name] = 0 + return counts + + +def _parse_zip_bytes( + xml_bytes: bytes, + db: DuckDB, + extract_date: str = '2024-03-08', +) -> None: + buf = io.BytesIO() + with zipfile.ZipFile(buf, 'w') as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + buf.seek(0) + with zipfile.ZipFile(buf, 'r') as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, extract_date, num_threads=1) + + +class WarmParseIdempotencyTests(unittest.TestCase): + def test_double_parse_same_staging_db_stable_row_counts(self): + xml_bytes = build_extract_xml(40, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = 
_init_staging_db(db_path) + try: + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + first_counts = fetch_staging_row_counts(db) + + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + second_counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(first_counts, second_counts) + self.assertEqual(first_counts['oca_index_staging'], 40) + + def test_double_parse_export_checksums_unchanged(self): + with tempfile.TemporaryDirectory() as tmp: + priv = os.path.join(tmp, 'private') + pub = os.path.join(priv, 'public') + os.makedirs(pub) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 25, + child_profile='weekly', + ) + + staging_path = os.path.join(priv, 'staging.duckdb') + db = _init_staging_db(staging_path) + try: + with zipfile.ZipFile( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), 'r' + ) as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + flush_write_buffer(db) + export_staging_to_csv(db, pub, upload=False) + checksums_first = md5_dir_csvs(pub) + + with zipfile.ZipFile( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), 'r' + ) as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + flush_write_buffer(db) + export_staging_to_csv(db, pub, upload=False) + checksums_second = md5_dir_csvs(pub) + finally: + db.close() + + self.assertEqual(checksums_first, checksums_second) + + +class ColdRerunIdempotencyTests(unittest.TestCase): + def test_cold_rerun_parity(self): + """Two cold parse+export runs must match row counts and export checksums.""" + with tempfile.TemporaryDirectory() as tmp: + priv1 = os.path.join(tmp, 'run1') + priv2 = os.path.join(tmp, 'run2') + os.makedirs(priv1) + os.makedirs(priv2) + write_test_zip( + os.path.join(priv1, 'LandlordTenant.Incr.2024-03-08.zip'), + 30, + child_profile='weekly', + ) + write_test_zip( + os.path.join(priv2, 
'LandlordTenant.Incr.2024-03-08.zip'), + 30, + child_profile='weekly', + ) + + rows1, checksums1 = run_parse_export_in_dir(priv1, parse_num_threads=1) + rows2, checksums2 = run_parse_export_in_dir(priv2, parse_num_threads=1) + + self.assertEqual(checksums1, checksums2) + self.assertEqual(rows1, rows2) + + +class ParserFailureRerunTests(unittest.TestCase): + def test_mid_file_failure_discard_then_rerun_matches_clean_parse(self): + xml_bytes = build_extract_xml(20, child_profile='weekly') + fail_on_case = 8 + seen = {'n': 0} + + def parse_case_maybe_fail(case, db, extract_date): + seen['n'] += 1 + if seen['n'] == fail_on_case: + raise RuntimeError('injected parse failure') + parse_case(case, db, extract_date) + + with tempfile.TemporaryDirectory() as tmp: + clean_path = os.path.join(tmp, 'clean.duckdb') + dirty_path = os.path.join(tmp, 'dirty.duckdb') + + clean_db = _init_staging_db(clean_path) + dirty_db = _init_staging_db(dirty_path) + try: + _parse_zip_bytes(xml_bytes, clean_db) + flush_write_buffer(clean_db) + clean_counts = fetch_staging_row_counts(clean_db) + + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + _parse_zip_bytes(xml_bytes, dirty_db) + flush_write_buffer(dirty_db) + + _parse_zip_bytes(xml_bytes, dirty_db) + flush_write_buffer(dirty_db) + recovery_counts = fetch_staging_row_counts(dirty_db) + finally: + clean_db.close() + dirty_db.close() + + self.assertEqual(clean_counts, recovery_counts) + + def test_failed_mid_case_leaves_no_staging_footprint(self): + """Failure after metadata + partial children leaves no rows for that case.""" + xml_bytes = build_extract_xml(12, child_profile='weekly') + fail_id = 'LT-BENCH-000005' + real_parse_index = parsers.parse_index + + def parse_index_maybe_fail(case, db): + real_parse_index(case, db) + index_el = case.find(parsers.INDEX_NUMBER_ID_TAG) + if index_el is not None and index_el.text == fail_id: + raise RuntimeError('injected after metadata and index') + + with tempfile.TemporaryDirectory() as 
tmp: + db = _init_staging_db(os.path.join(tmp, 'dirty.duckdb')) + try: + baseline = _staging_counts_for_index(db, fail_id) + with patch('lib.parsers.parse_index', parse_index_maybe_fail): + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + after_failure = _staging_counts_for_index(db, fail_id) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(baseline, {table: 0 for table in STAGING_TABLE_FAMILIES}) + self.assertEqual(after_failure, baseline) + self.assertEqual(counts['oca_index_staging'], 11) + + +class BatchBoundaryCorrectnessTests(unittest.TestCase): + def test_aggressive_batching_matches_legacy_counts(self): + xml_bytes = build_extract_xml(48, child_profile='weekly') + env = { + 'PARSE_WRITE_BATCH_ENABLED': '1', + 'PARSE_WRITE_BATCH_SIZE': '4', + 'PARSE_WRITE_FLUSH_EVERY_N_CASES': '3', + } + with tempfile.TemporaryDirectory() as tmp: + legacy_path = os.path.join(tmp, 'legacy.duckdb') + batched_path = os.path.join(tmp, 'batched.duckdb') + legacy_db = DuckDB(legacy_path) + legacy_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + batched_db = DuckDB(batched_path) + batched_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(batched_db) + + try: + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + _parse_zip_bytes(xml_bytes, legacy_db) + for key, value in env.items(): + os.environ[key] = value + _parse_zip_bytes(xml_bytes, batched_db) + flush_write_buffer(batched_db) + legacy_counts = fetch_staging_row_counts(legacy_db) + batched_counts = fetch_staging_row_counts(batched_db) + finally: + for key in env: + os.environ.pop(key, None) + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + legacy_db.close() + batched_db.close() + + self.assertEqual(legacy_counts, batched_counts) + + def test_no_duplicate_child_rows_across_flush_windows(self): + from lxml import etree + + case_id = 'LT-BOUNDARY-001' + cases = [ + build_case_xml(case_id, num_parties=2, num_events=1), + 
build_case_xml(case_id, num_parties=4, num_events=2), + build_case_xml(case_id, num_parties=3, num_events=3), + ] + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'b.duckdb')) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + db.write_buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=2, flush_every_n_cases=1), + ) + try: + for case_xml in cases: + case = etree.fromstring(case_xml.encode('utf-8')) + parse_case(case, db, '2024-03-01') + flush_write_buffer(db) + parties = db.execute( + 'SELECT COUNT(*) FROM oca_parties_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone()[0] + events = db.execute( + 'SELECT COUNT(*) FROM oca_events_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone()[0] + self.assertGreater(db.write_buffer._flush_count, 2) + finally: + db.close() + + self.assertEqual(parties, 3) + self.assertEqual(events, 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_promotion.py b/tests/test_promotion.py new file mode 100644 index 0000000..fafa632 --- /dev/null +++ b/tests/test_promotion.py @@ -0,0 +1,279 @@ +import unittest +from pathlib import Path +from unittest import mock + +from lib.database import Database +from lib.etl_constants import OCA_TABLES +from lib.etl_promotion import ( + ADDRESS_NATURAL_KEY_COLUMNS, + PROMOTION_SQL_FILE, + PURGE_TOMBSTONED_CASES_SQL_FILE, + promote_staging_to_main, + promotion_counts_checksum, + promotion_table_counts, +) + + +SQL_DIR = Path(__file__).resolve().parents[1] / 'lib' / 'sql' + + +class FakeConn: + def __init__(self): + self.committed = False + self.rolled_back = False + + def commit(self): + self.committed = True + + def rollback(self): + self.rolled_back = True + + def cursor(self): + raise NotImplementedError + + +class PromoteStagingTests(unittest.TestCase): + def test_promote_uses_single_transaction(self): + db = mock.Mock() + db.transaction.return_value.__enter__ = 
mock.Mock(return_value=db) + db.transaction.return_value.__exit__ = mock.Mock(return_value=False) + + promote_staging_to_main(db) + + db.transaction.assert_called_once() + db.execute_sql_file.assert_any_call('ensure_promotion_indexes.sql', commit=False) + db.execute_sql_file.assert_any_call('promote_staging_to_main.sql', commit=False) + + def test_promotion_failure_rolls_back(self): + conn = FakeConn() + db = mock.Mock() + db.conn = conn + db.execute_sql_file.side_effect = RuntimeError('simulated promotion failure') + + def transaction(): + class _Txn: + def __enter__(self_inner): + return db + + def __exit__(self_inner, exc_type, exc, tb): + if exc_type: + conn.rollback() + return False + conn.commit() + return False + + return _Txn() + + db.transaction.side_effect = transaction + + with self.assertRaises(RuntimeError): + promote_staging_to_main(db) + + self.assertTrue(conn.rolled_back) + self.assertFalse(conn.committed) + + def test_promotion_success_commits_once(self): + conn = FakeConn() + db = mock.Mock() + db.conn = conn + + def transaction(): + class _Txn: + def __enter__(self_inner): + return db + + def __exit__(self_inner, exc_type, exc, tb): + if exc_type: + conn.rollback() + return False + conn.commit() + return False + + return _Txn() + + db.transaction.side_effect = transaction + promote_staging_to_main(db) + + self.assertTrue(conn.committed) + self.assertFalse(conn.rolled_back) + + def test_counts_checksum_stable(self): + counts_a = {'oca_index': 1, 'oca_addresses': 2} + counts_b = {'oca_addresses': 2, 'oca_index': 1} + self.assertEqual( + promotion_counts_checksum(counts_a), + promotion_counts_checksum(counts_b), + ) + + +class PromoteStagingSqlContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.sql = (SQL_DIR / PROMOTION_SQL_FILE).read_text(encoding='utf-8') + + def test_single_transaction_session_role_reset(self): + self.assertIn('SET session_replication_role = replica', self.sql) + self.assertIn('SET 
session_replication_role = default', self.sql) + + def test_oca_index_upsert_with_tombstone_purge(self): + self.assertIn('ON CONFLICT (indexnumberid) DO UPDATE', self.sql) + self.assertRegex(self.sql, r'DELETE FROM oca_index\b') + self.assertGreaterEqual(self.sql.count('DELETE FROM oca_index'), 2) + + def test_tombstone_temp_tables_and_filtered_staging(self): + self.assertIn('CREATE TEMP TABLE tombstoned_ids', self.sql) + self.assertIn('CREATE TEMP TABLE promotion_active_staging_ids', self.sql) + self.assertIn('oca_metadata_staging WHERE deletedate IS NOT NULL', self.sql) + self.assertIn( + 'FROM oca_index_staging\nWHERE indexnumberid IN ' + '(SELECT indexnumberid FROM promotion_active_staging_ids)', + self.sql, + ) + + def test_child_inserts_exclude_tombstones(self): + self.assertIn( + 'FROM oca_causes_staging\n' + 'WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids)', + self.sql, + ) + + def test_addresses_use_natural_key_delete(self): + for col in ADDRESS_NATURAL_KEY_COLUMNS: + if col == 'indexnumberid': + continue + self.assertIn(f'm.{col} IS NOT DISTINCT FROM s.{col}', self.sql) + + def test_metadata_merged_before_staging_drop(self): + metadata_pos = self.sql.index('CREATE TABLE oca_metadata_temp') + drop_index_pos = self.sql.index('DROP TABLE IF EXISTS oca_index_staging') + self.assertLess(metadata_pos, drop_index_pos) + + def test_all_staging_tables_dropped(self): + for table in OCA_TABLES: + self.assertIn(f'DROP TABLE IF EXISTS {table}_staging', self.sql) + + +class PurgeTombstonedCasesSqlContractTests(unittest.TestCase): + def test_purge_deletes_index_not_metadata(self): + sql = (SQL_DIR / PURGE_TOMBSTONED_CASES_SQL_FILE).read_text(encoding='utf-8') + self.assertRegex(sql, r'DELETE FROM oca_index\b') + self.assertIn('oca_metadata', sql) + self.assertNotRegex(sql, r'DELETE FROM oca_metadata\b') + + +class DatabaseTransactionTests(unittest.TestCase): + @staticmethod + def _mock_connection(): + conn = mock.Mock() + cursor = 
mock.MagicMock() + cursor.__enter__.return_value = cursor + cursor.__exit__.return_value = False + conn.cursor.return_value = cursor + return conn + + @mock.patch('lib.database.psycopg2.connect') + def test_transaction_commits_on_success(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + with db.transaction(): + db.execute('SELECT 1') + conn.commit.assert_called_once() + conn.rollback.assert_not_called() + + @mock.patch('lib.database.psycopg2.connect') + def test_transaction_rolls_back_on_error(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + with self.assertRaises(RuntimeError): + with db.transaction(): + raise RuntimeError('fail') + conn.rollback.assert_called_once() + + +class PromotionTableCountsTests(unittest.TestCase): + def test_promotion_table_counts_queries_each_table(self): + db = mock.Mock() + db.sql_fetch_one.return_value = (42,) + counts = promotion_table_counts(db, tables=['oca_index', 'oca_causes']) + self.assertEqual(counts, {'oca_index': 42, 'oca_causes': 42}) + self.assertEqual(db.sql_fetch_one.call_count, 2) + + +class FakeManifest: + def __init__(self): + self.step_upserts = [] + self.file_details_by_name = {} + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + if details is not None: + self.file_details_by_name[file_name] = dict(details) + + def upsert_step(self, step_name, status, details=None, error=None): + self.step_upserts.append({ + 'step_name': step_name, + 'status': status, + 'details': details or {}, + 'error': error, + }) + + +class ImportAndPromoteStagingObservabilityTests(unittest.TestCase): + @mock.patch('lib.etl_stages.upsert_promoted_etl_file') + @mock.patch('lib.etl_stages.promote_staging_to_main') + @mock.patch('lib.etl_stages.ensure_core_tables_exist') + @mock.patch('lib.etl_stages.staging_tables_with_rows', 
return_value=[]) + @mock.patch('lib.etl_stages.promotion_table_counts') + @mock.patch('geosupport.Geosupport') + def test_promote_step_records_before_after_checksums( + self, + _geosupport_mock, + counts_mock, + _staging_rows_mock, + _ensure_tables_mock, + _promote_mock, + _upsert_file_mock, + ): + from lib.etl_stages import FileSelection, import_and_promote_staging + + counts_before = {'oca_index': 100, 'oca_metadata': 100} + counts_after = {'oca_index': 150, 'oca_metadata': 150} + counts_mock.side_effect = [counts_before, counts_after] + + manifest = FakeManifest() + db = mock.Mock() + selection = FileSelection( + selected_zip_files=['test.zip'], + skipped_reprocess_files=[], + new_file_set={'test.zip'}, + reprocess_file_set=set(), + sftp_download_files=['test.zip'], + s3_download_files=[], + ) + + with mock.patch('lib.etl_stages.csv_has_rows', return_value=False): + import_and_promote_staging( + manifest, + db, + '/tmp/pub', + {'aws_bucket_name': 'b', 'aws_id': 'i', 'aws_key': 'k'}, + '', + selection, + 'public', + ) + + completed = [s for s in manifest.step_upserts if s['status'] == 'completed'][-1] + self.assertEqual(completed['step_name'], 'promote_staging') + details = completed['details'] + self.assertEqual(details['counts_before'], counts_before) + self.assertEqual(details['counts_after'], counts_after) + self.assertEqual( + details['checksum_before'], + promotion_counts_checksum(counts_before), + ) + self.assertEqual( + details['checksum_after'], + promotion_counts_checksum(counts_after), + ) + self.assertEqual(counts_mock.call_count, 2) diff --git a/tests/test_run_manifest.py b/tests/test_run_manifest.py new file mode 100644 index 0000000..46e4555 --- /dev/null +++ b/tests/test_run_manifest.py @@ -0,0 +1,82 @@ +import unittest + +from lib.etl_run_manifest import EtlRunManifest, completed_reprocess_files +from lib.etl_file_selection import select_data_files_to_process + + +class FakeDb: + def __init__(self): + self.sql_calls = [] + self.fetch_one_queue = 
[] + self.fetch_all_result = [] + + def execute_sql_file(self, sql_file): + self.sql_calls.append(("execute_sql_file", sql_file)) + + def sql(self, statement): + self.sql_calls.append(("sql", statement)) + + def sql_fetch_one(self, statement): + self.sql_calls.append(("sql_fetch_one", statement)) + if self.fetch_one_queue: + return self.fetch_one_queue.pop(0) + return (None,) + + def sql_fetch_all(self, statement): + self.sql_calls.append(("sql_fetch_all", statement)) + return self.fetch_all_result + + +class RunManifestTests(unittest.TestCase): + def test_completed_reprocess_files_filters_manifest_hits(self): + fake_db = FakeDb() + fake_db.fetch_all_result = [("file_a.zip",), ("file_b.zip",)] + completed = completed_reprocess_files(fake_db, ["file_a.zip", "file_c.zip"]) + self.assertEqual(completed, {"file_a.zip", "file_b.zip"}) + sql = fake_db.sql_calls[-1][1] + self.assertIn("cases_failed", sql) + self.assertIn("= 0", sql) + + def test_completed_reprocess_files_excludes_files_with_case_failures(self): + """SQL must filter out completed rows where details.cases_failed > 0.""" + fake_db = FakeDb() + fake_db.fetch_all_result = [("file_clean.zip",)] + completed = completed_reprocess_files( + fake_db, + ["file_clean.zip", "file_dirty.zip"], + ) + self.assertEqual(completed, {"file_clean.zip"}) + sql = fake_db.sql_calls[-1][1] + self.assertIn("COALESCE((ef.details->>'cases_failed')::int, 0) = 0", sql) + + def test_reprocess_without_force_skips_completed_files(self): + new_files = ["LandlordTenant.Incr.2024-03-01.zip"] + reprocess_files = [ + "LandlordTenant.Incr.2023-01-01.zip", + "LandlordTenant.Incr.2023-01-08.zip", + ] + already_completed = {"LandlordTenant.Incr.2023-01-01.zip"} + selected = select_data_files_to_process( + new_files=new_files, + reprocess_files=sorted(set(reprocess_files) - already_completed), + force_reprocess=False, + ) + self.assertEqual(selected, ["LandlordTenant.Incr.2024-03-01.zip"]) + + def 
test_mark_run_completed_records_files_needing_reprocess(self): + fake_db = FakeDb() + manifest = EtlRunManifest(fake_db, 'public', '', '2', '', False) + manifest.mark_run_completed( + 2, + 1, + 0, + files_needing_reprocess=['dirty.zip'], + ) + sql = fake_db.sql_calls[-1][1] + self.assertIn('files_needing_reprocess', sql) + self.assertIn('dirty.zip', sql) + self.assertIn('processed_file_count = 1', sql) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_runtime_controls.py b/tests/test_runtime_controls.py new file mode 100644 index 0000000..55e04f0 --- /dev/null +++ b/tests/test_runtime_controls.py @@ -0,0 +1,101 @@ +import os +import unittest +from unittest.mock import MagicMock, patch + +import oca_update +from lib.database import Database + + +class RuntimeControlTests(unittest.TestCase): + @patch('oca_update.oca_etl') + def test_main_passes_defaults_when_unset(self, oca_etl_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', + 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + call_args = oca_etl_mock.call_args[0] + db_args = call_args[0] + runtime_args = call_args[5] + self.assertEqual(db_args['schema'], '') + self.assertEqual(runtime_args['db_schema'], '') + self.assertEqual(runtime_args['s3_prefix'], '') + self.assertEqual(runtime_args['reprocess_glob'], '') + self.assertFalse(runtime_args['force_reprocess']) + self.assertFalse(runtime_args['parse_fail_fast']) + self.assertFalse(runtime_args['skip_public_publish']) + + @patch('oca_update.oca_etl') + def test_main_non_default_schema_smoke_path(self, oca_etl_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', 
+ 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + 'DB_SCHEMA': 'oca_refactor', + 'S3_PREFIX': 'refactor/dev', + 'REPROCESS_GLOB': 'LandlordTenant.Incr.2024-*.zip', + 'FORCE_REPROCESS': 'true', + 'PARSE_FAIL_FAST': 'true', + 'GEOCODE_WORKERS': '3', + 'CENSUS_BATCH_CHUNK_SIZE': '2000', + 'CSV_ROW_CHECK_CHUNK_SIZE': '500', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + db_args = oca_etl_mock.call_args[0][0] + runtime_args = oca_etl_mock.call_args[0][5] + self.assertEqual(db_args['schema'], 'oca_refactor') + self.assertEqual(runtime_args['s3_prefix'], 'refactor/dev') + self.assertEqual(runtime_args['reprocess_glob'], 'LandlordTenant.Incr.2024-*.zip') + self.assertTrue(runtime_args['force_reprocess']) + self.assertEqual(runtime_args['geocode_workers'], 3) + self.assertTrue(runtime_args['parse_fail_fast']) + + @patch('oca_update.oca_etl') + def test_main_passes_skip_public_publish_from_env(self, oca_etl_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', + 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + 'SKIP_PUBLIC_PUBLISH': 'true', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + runtime_args = oca_etl_mock.call_args[0][5] + self.assertTrue(runtime_args['skip_public_publish']) + + @patch('lib.database.psycopg2.connect') + def test_database_sets_search_path_for_schema(self, connect_mock): + conn = MagicMock() + connect_mock.return_value = conn + + Database(db_url='postgres://example', schema='oca_refactor') + + conn.cursor.return_value.__enter__.return_value.execute.assert_called_once() + execute_arg = conn.cursor.return_value.__enter__.return_value.execute.call_args[0][0] + self.assertIn('search_path', str(execute_arg)) + 
self.assertEqual(conn.commit.call_count, 1) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_schema_bootstrap.py b/tests/test_schema_bootstrap.py new file mode 100644 index 0000000..7ab2ffe --- /dev/null +++ b/tests/test_schema_bootstrap.py @@ -0,0 +1,59 @@ +import os +import unittest +from unittest import mock + +from lib.etl_stages import ensure_core_tables_exist + + +class CreateTablesSqlContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + sql_path = os.path.join( + os.path.dirname(__file__), + '..', + 'lib', + 'sql', + 'create_tables.sql', + ) + with open(sql_path, encoding='utf-8') as f: + cls.sql = f.read() + + def test_bootstrap_sql_is_non_destructive(self): + self.assertNotIn('DROP TABLE', self.sql.upper()) + self.assertNotIn('DROP VIEW', self.sql.upper()) + + def test_bootstrap_sql_uses_idempotent_create_patterns(self): + self.assertIn('CREATE TABLE IF NOT EXISTS oca_index', self.sql) + self.assertIn('CREATE TABLE IF NOT EXISTS oca_metadata', self.sql) + self.assertIn('CREATE INDEX IF NOT EXISTS oca_addresses_bbl_idx', self.sql) + + +class EnsureCoreTablesExistTests(unittest.TestCase): + def test_bootstrap_runs_when_schema_context_matches(self): + db = mock.Mock() + db.sql_fetch_one.return_value = ('oca_refactor', '"oca_refactor", public') + + ensure_core_tables_exist(db, 'oca_refactor') + + db.execute_sql_file.assert_called_once_with('create_tables.sql') + + def test_bootstrap_fails_when_expected_schema_missing(self): + db = mock.Mock() + + with self.assertRaisesRegex(RuntimeError, 'DB schema must be set'): + ensure_core_tables_exist(db, '') + + db.execute_sql_file.assert_not_called() + + def test_bootstrap_fails_when_current_schema_does_not_match_expected(self): + db = mock.Mock() + db.sql_fetch_one.return_value = ('public', 'public') + + with self.assertRaisesRegex(RuntimeError, 'expected current_schema'): + ensure_core_tables_exist(db, 'oca_refactor') + + db.execute_sql_file.assert_not_called() + + +if 
__name__ == '__main__': + unittest.main() diff --git a/tests/test_staging_csv_export.py b/tests/test_staging_csv_export.py new file mode 100644 index 0000000..5287353 --- /dev/null +++ b/tests/test_staging_csv_export.py @@ -0,0 +1,208 @@ +import csv +import os +import tempfile +import unittest + +import duckdb + +from lib.duckdb_database import DuckDB +from lib.etl_csv import preprocess_csv_file, replace_postgres_array_brackets +from lib.staging_csv_export import ( + nullable_int_csv_sql, + postgres_array_brackets_sql, + staging_csv_needs_preprocess, +) + + +class PostgresArrayBracketsSqlTests(unittest.TestCase): + def _eval_sql(self, value: str | None) -> str | None: + conn = duckdb.connect(':memory:') + literal = 'NULL' if value is None else f"'{value.replace(chr(39), chr(39)*2)}'" + row = conn.execute( + f"SELECT {postgres_array_brackets_sql(literal)}" + ).fetchone() + conn.close() + return row[0] + + def test_matches_python_simple_array(self): + self.assertEqual(self._eval_sql('[a,b]'), replace_postgres_array_brackets('[a,b]')) + + def test_matches_python_json_object_array(self): + value = '[{"appearanceoutcometype":"Hearing"}]' + self.assertEqual(self._eval_sql(value), replace_postgres_array_brackets(value)) + + def test_matches_python_plain_text(self): + self.assertEqual(self._eval_sql('plain'), replace_postgres_array_brackets('plain')) + + def test_null_unchanged(self): + self.assertIsNone(self._eval_sql(None)) + + +class NullableIntSqlTests(unittest.TestCase): + def _eval_int(self, value) -> str: + conn = duckdb.connect(':memory:') + if value is None: + row = conn.execute(f"SELECT {nullable_int_csv_sql('NULL::INTEGER')}").fetchone() + else: + row = conn.execute( + f"SELECT {nullable_int_csv_sql(str(int(value)))}" + ).fetchone() + conn.close() + return row[0] + + def test_null_becomes_sql_null(self): + conn = duckdb.connect(':memory:') + row = conn.execute( + f"SELECT {nullable_int_csv_sql('motionsequence')} FROM (SELECT NULL::INTEGER AS motionsequence) t" + 
).fetchone() + self.assertIsNone(row[0]) + + def test_nan_marker_becomes_sql_null(self): + conn = duckdb.connect(':memory:') + row = conn.execute( + f"SELECT {nullable_int_csv_sql('v')} FROM (SELECT 'NaN' AS v) t" + ).fetchone() + self.assertIsNone(row[0]) + + +class StagingExportIntegrationTests(unittest.TestCase): + def test_appearances_export_drops_appearanceid(self): + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + pub = os.path.join(tmp, 'public') + os.makedirs(pub) + db = DuckDB(db_path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + db.execute( + """ + INSERT INTO oca_appearances_staging ( + indexnumberid, appearanceid, appearancedatetime, + appearancepurpose, motionsequence, appearanceoutcomes + ) VALUES ( + 'LT-1', 99, '2024-02-15 10:00:00', 'Conference', NULL, + '[{"appearanceoutcometype":"Adjourned"}]' + ) + """ + ) + db.export_tables_to_csv(pub) + db.close() + + path = os.path.join(pub, 'oca_appearances_staging.csv') + with open(path, newline='', encoding='utf-8') as f: + row = next(csv.DictReader(f)) + self.assertNotIn('appearanceid', row) + self.assertEqual(row['motionsequence'], '') + self.assertEqual( + row['appearanceoutcomes'], + '[{"appearanceoutcometype":"Adjourned"}]', + ) + + def test_index_array_export_matches_preprocess(self): + with tempfile.TemporaryDirectory() as tmp: + raw_path = os.path.join(tmp, 'raw.csv') + export_path = os.path.join(tmp, 'export.csv') + conn = duckdb.connect(':memory:') + conn.execute('CREATE TABLE t (specialtydesignationtypes VARCHAR[])') + conn.execute("INSERT INTO t VALUES (['HP', 'RTC'])") + conn.execute(f"COPY t TO '{raw_path}' (HEADER, DELIMITER ',')") + conn.close() + + with open(raw_path, newline='', encoding='utf-8') as f: + reader = csv.DictReader(f) + row = next(reader) + with open(export_path, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=['specialtydesignationtypes']) + writer.writeheader() + 
writer.writerow(row) + + preprocess_csv_file(export_path) + + db_path = os.path.join(tmp, 'staging.duckdb') + pub = os.path.join(tmp, 'public') + os.makedirs(pub) + db = DuckDB(db_path) + db.execute('CREATE TABLE oca_index_staging (specialtydesignationtypes VARCHAR[])') + db.execute("INSERT INTO oca_index_staging VALUES (['HP', 'RTC'])") + db.export_tables_to_csv(pub) + db.close() + + with open(export_path, newline='', encoding='utf-8') as f: + preprocessed = next(csv.DictReader(f))['specialtydesignationtypes'] + with open(os.path.join(pub, 'oca_index_staging.csv'), newline='', encoding='utf-8') as f: + exported = next(csv.DictReader(f))['specialtydesignationtypes'] + self.assertEqual(exported, preprocessed) + + +class ExportMatchesLegacyPreprocessTests(unittest.TestCase): + def test_all_staging_csvs_match_raw_copy_plus_preprocess(self): + import lib.etl_csv as etl_csv_mod + from parser_xml_fixtures import write_test_zip + from lib.etl_stages import parse_xml_to_staging + + with tempfile.TemporaryDirectory() as tmp: + priv = os.path.join(tmp, 'priv') + os.makedirs(priv) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 10, + child_profile='weekly', + ) + pub_legacy = os.path.join(tmp, 'legacy') + pub_export = os.path.join(tmp, 'export') + os.makedirs(pub_legacy) + os.makedirs(pub_export) + + class _Manifest: + def upsert_step(self, *args, **kwargs): + pass + + def upsert_file(self, *args, **kwargs): + pass + + db_legacy = DuckDB(os.path.join(priv, 'legacy.duckdb')) + db_legacy.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + parse_xml_to_staging(_Manifest(), db_legacy, priv, parse_num_threads=1) + with db_legacy._lock: + for table_row in db_legacy.conn.execute('SHOW TABLES').fetchall(): + table_name = table_row[0] + path = os.path.join(pub_legacy, f'{table_name}.csv') + db_legacy.conn.execute( + f"COPY {table_name} TO '{path}' (HEADER, DELIMITER ',')" + ) + db_legacy.close() + + orig = 
etl_csv_mod.staging_csv_needs_preprocess + etl_csv_mod.staging_csv_needs_preprocess = lambda _f: True + try: + etl_csv_mod.preprocess_staging_csv_dir(pub_legacy) + finally: + etl_csv_mod.staging_csv_needs_preprocess = orig + + db_export = DuckDB(os.path.join(priv, 'export.duckdb')) + db_export.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + parse_xml_to_staging(_Manifest(), db_export, priv, parse_num_threads=1) + db_export.export_tables_to_csv(pub_export) + db_export.close() + + for name in sorted(os.listdir(pub_legacy)): + if not name.endswith('.csv'): + continue + with open(os.path.join(pub_legacy, name), 'rb') as f: + legacy = f.read() + with open(os.path.join(pub_export, name), 'rb') as f: + exported = f.read() + self.assertEqual(exported, legacy, name) + + +class StagingCsvNeedsPreprocessTests(unittest.TestCase): + def test_staging_tables_skip_second_pass(self): + self.assertFalse(staging_csv_needs_preprocess('oca_index_staging.csv')) + self.assertFalse(staging_csv_needs_preprocess('oca_addresses_staging.csv')) + self.assertFalse(staging_csv_needs_preprocess('oca_appearances_staging.csv')) + + def test_unknown_table_still_preprocessed(self): + self.assertTrue(staging_csv_needs_preprocess('custom_table.csv')) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_update_appearance_outcomes_sql.py b/tests/test_update_appearance_outcomes_sql.py new file mode 100644 index 0000000..0908b98 --- /dev/null +++ b/tests/test_update_appearance_outcomes_sql.py @@ -0,0 +1,26 @@ +import os +import unittest + + +class UpdateAppearanceOutcomesSqlTests(unittest.TestCase): + def test_assigns_appearanceid_before_outcomes_insert(self): + sql_path = os.path.join( + os.path.dirname(__file__), + '..', + 'lib', + 'sql', + 'update_appearance_outcomes.sql', + ) + with open(sql_path, encoding='utf-8') as f: + sql = f.read() + + self.assertIn('DO $$', sql) + self.assertIn('setval', sql) + self.assertIn('MAX(appearanceid)', sql) + do_pos = sql.index('DO 
$$') + insert_pos = sql.index('INSERT INTO oca_appearance_outcomes_staging') + self.assertLess(do_pos, insert_pos) + + +if __name__ == '__main__': + unittest.main() diff --git a/uv.lock b/uv.lock index e7d6060..e0a7782 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.12" [[package]] @@ -24,7 +24,6 @@ dependencies = [ { name = "boto3" }, { name = "censusgeocode" }, { name = "duckdb" }, - { name = "frogress" }, { name = "lxml" }, { name = "notebook" }, { name = "pandas" }, @@ -44,7 +43,6 @@ requires-dist = [ { name = "boto3", specifier = ">=1.38.41" }, { name = "censusgeocode", specifier = ">=0.5.2" }, { name = "duckdb", specifier = ">=1.3.1" }, - { name = "frogress", specifier = ">=0.10.1" }, { name = "lxml", specifier = ">=5.4.0" }, { name = "notebook", specifier = ">=7.4.4" }, { name = "pandas", specifier = ">=2.3.0" }, @@ -53,9 +51,9 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.1.0" }, { name = "python-geosupport", specifier = ">=1.1.0" }, { name = "requests", specifier = ">=2.32.4" }, - { name = "requests-toolbelt", specifier = "==0.10.1" }, + { name = "requests-toolbelt", specifier = ">=1.0.0" }, { name = "sqlalchemy", specifier = ">=2.0.41" }, - { name = "urllib3", specifier = "==1.26.15" }, + { name = "urllib3", specifier = ">=2.6.0" }, { name = "usaddress", specifier = ">=0.5.14" }, ] @@ -260,15 +258,15 @@ wheels = [ [[package]] name = "censusgeocode" -version = "0.5.2" +version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "requests-toolbelt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f2/f5/83c9a6aead949cfda4d21f7fed673f8a40d859a4e4c05e3ff30a8f021b08/censusgeocode-0.5.2.tar.gz", hash = "sha256:ee590d1b7806c630b4a6e60adfa572abf502c95fb2d9489ac045bdda46edde38", size = 20898, upload-time = "2022-01-22T16:22:23.784Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/25/b7/74615b3db872f28e3d6ffcbcef0a8f0b4ba1d2982a7fb5c12d233eb204a0/censusgeocode-0.5.3.tar.gz", hash = "sha256:6b26c71495ce860e38ece54032552fd1e112fec6a16f8c7fff88788d84f1dfbc", size = 22239, upload-time = "2026-02-08T00:06:50.716Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/f7/eba876fa146e434ab29b295213ca3357d8708f8e4e730453096d5d63b151/censusgeocode-0.5.2-py3-none-any.whl", hash = "sha256:fa2a9e0d44a7216fb5c3d031fc09d6c2008de28cec45495aaad5d309ef06b98b", size = 9178, upload-time = "2022-01-22T16:22:22.395Z" }, + { url = "https://files.pythonhosted.org/packages/1f/c0/d3f062406149c5744364286c48c6edf3e15e6955b577a9e3f9503337e4a6/censusgeocode-0.5.3-py3-none-any.whl", hash = "sha256:f6dc7e5f6162593f8f93b8d35bb756ff96e025f2221dc93ad13b516fb38d18f7", size = 20633, upload-time = "2026-02-08T00:06:49.327Z" }, ] [[package]] @@ -488,15 +486,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl", hash = "sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014", size = 9121, upload-time = "2021-03-11T07:16:28.351Z" }, ] -[[package]] -name = "frogress" -version = "0.10.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6d/50/35f9d1758ac852fe75cd5d687709710ea076fad9b5fdb9f2f6d53f5510d9/frogress-0.10.1.tar.gz", hash = "sha256:0150cfb988eeda65e019283a06b69603c26457081c545b1b143d907996bf96d3", size = 19300, upload-time = "2024-04-04T08:16:01.834Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/e4/4e7f4f4148854d63550c96acfb5daec82aec5f801f07323822ab84ae5910/frogress-0.10.1-py3-none-any.whl", hash = "sha256:a47be1dbca0b89dcbd0628063a159948ff96343e4f5086a4f25e2780469f96d7", size = 15216, upload-time = "2024-04-04T08:15:59.977Z" }, -] - [[package]] name = "greenlet" version = "3.2.3" @@ -506,7 +495,6 @@ wheels = [ { url 
= "https://files.pythonhosted.org/packages/f3/94/ad0d435f7c48debe960c53b8f60fb41c2026b1d0fa4a99a1cb17c3461e09/greenlet-3.2.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:25ad29caed5783d4bd7a85c9251c651696164622494c00802a139c00d639242d", size = 271992, upload-time = "2025-06-05T16:11:23.467Z" }, { url = "https://files.pythonhosted.org/packages/93/5d/7c27cf4d003d6e77749d299c7c8f5fd50b4f251647b5c2e97e1f20da0ab5/greenlet-3.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88cd97bf37fe24a6710ec6a3a7799f3f81d9cd33317dcf565ff9950c83f55e0b", size = 638820, upload-time = "2025-06-05T16:38:52.882Z" }, { url = "https://files.pythonhosted.org/packages/c6/7e/807e1e9be07a125bb4c169144937910bf59b9d2f6d931578e57f0bce0ae2/greenlet-3.2.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:baeedccca94880d2f5666b4fa16fc20ef50ba1ee353ee2d7092b383a243b0b0d", size = 653046, upload-time = "2025-06-05T16:41:36.343Z" }, - { url = "https://files.pythonhosted.org/packages/9d/ab/158c1a4ea1068bdbc78dba5a3de57e4c7aeb4e7fa034320ea94c688bfb61/greenlet-3.2.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:be52af4b6292baecfa0f397f3edb3c6092ce071b499dd6fe292c9ac9f2c8f264", size = 647701, upload-time = "2025-06-05T16:48:19.604Z" }, { url = "https://files.pythonhosted.org/packages/cc/0d/93729068259b550d6a0288da4ff72b86ed05626eaf1eb7c0d3466a2571de/greenlet-3.2.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0cc73378150b8b78b0c9fe2ce56e166695e67478550769536a6742dca3651688", size = 649747, upload-time = "2025-06-05T16:13:04.628Z" }, { url = "https://files.pythonhosted.org/packages/f6/f6/c82ac1851c60851302d8581680573245c8fc300253fc1ff741ae74a6c24d/greenlet-3.2.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:706d016a03e78df129f68c4c9b4c4f963f7d73534e48a24f5f5a7101ed13dbbb", size = 605461, upload-time = "2025-06-05T16:12:50.792Z" }, { url = 
"https://files.pythonhosted.org/packages/98/82/d022cf25ca39cf1200650fc58c52af32c90f80479c25d1cbf57980ec3065/greenlet-3.2.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:419e60f80709510c343c57b4bb5a339d8767bf9aef9b8ce43f4f143240f88b7c", size = 1121190, upload-time = "2025-06-05T16:36:48.59Z" }, @@ -515,7 +503,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/cf/f5c0b23309070ae93de75c90d29300751a5aacefc0a3ed1b1d8edb28f08b/greenlet-3.2.3-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:500b8689aa9dd1ab26872a34084503aeddefcb438e2e7317b89b11eaea1901ad", size = 270732, upload-time = "2025-06-05T16:10:08.26Z" }, { url = "https://files.pythonhosted.org/packages/48/ae/91a957ba60482d3fecf9be49bc3948f341d706b52ddb9d83a70d42abd498/greenlet-3.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a07d3472c2a93117af3b0136f246b2833fdc0b542d4a9799ae5f41c28323faef", size = 639033, upload-time = "2025-06-05T16:38:53.983Z" }, { url = "https://files.pythonhosted.org/packages/6f/df/20ffa66dd5a7a7beffa6451bdb7400d66251374ab40b99981478c69a67a8/greenlet-3.2.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:8704b3768d2f51150626962f4b9a9e4a17d2e37c8a8d9867bbd9fa4eb938d3b3", size = 652999, upload-time = "2025-06-05T16:41:37.89Z" }, - { url = "https://files.pythonhosted.org/packages/51/b4/ebb2c8cb41e521f1d72bf0465f2f9a2fd803f674a88db228887e6847077e/greenlet-3.2.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5035d77a27b7c62db6cf41cf786cfe2242644a7a337a0e155c80960598baab95", size = 647368, upload-time = "2025-06-05T16:48:21.467Z" }, { url = "https://files.pythonhosted.org/packages/8e/6a/1e1b5aa10dced4ae876a322155705257748108b7fd2e4fae3f2a091fe81a/greenlet-3.2.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2d8aa5423cd4a396792f6d4580f88bdc6efcb9205891c9d40d20f6e670992efb", size = 650037, upload-time = "2025-06-05T16:13:06.402Z" }, { url = 
"https://files.pythonhosted.org/packages/26/f2/ad51331a157c7015c675702e2d5230c243695c788f8f75feba1af32b3617/greenlet-3.2.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2c724620a101f8170065d7dded3f962a2aea7a7dae133a009cada42847e04a7b", size = 608402, upload-time = "2025-06-05T16:12:51.91Z" }, { url = "https://files.pythonhosted.org/packages/26/bc/862bd2083e6b3aff23300900a956f4ea9a4059de337f5c8734346b9b34fc/greenlet-3.2.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:873abe55f134c48e1f2a6f53f7d1419192a3d1a4e873bace00499a4e45ea6af0", size = 1119577, upload-time = "2025-06-05T16:36:49.787Z" }, @@ -524,7 +511,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d8/ca/accd7aa5280eb92b70ed9e8f7fd79dc50a2c21d8c73b9a0856f5b564e222/greenlet-3.2.3-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:3d04332dddb10b4a211b68111dabaee2e1a073663d117dc10247b5b1642bac86", size = 271479, upload-time = "2025-06-05T16:10:47.525Z" }, { url = "https://files.pythonhosted.org/packages/55/71/01ed9895d9eb49223280ecc98a557585edfa56b3d0e965b9fa9f7f06b6d9/greenlet-3.2.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8186162dffde068a465deab08fc72c767196895c39db26ab1c17c0b77a6d8b97", size = 683952, upload-time = "2025-06-05T16:38:55.125Z" }, { url = "https://files.pythonhosted.org/packages/ea/61/638c4bdf460c3c678a0a1ef4c200f347dff80719597e53b5edb2fb27ab54/greenlet-3.2.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f4bfbaa6096b1b7a200024784217defedf46a07c2eee1a498e94a1b5f8ec5728", size = 696917, upload-time = "2025-06-05T16:41:38.959Z" }, - { url = "https://files.pythonhosted.org/packages/22/cc/0bd1a7eb759d1f3e3cc2d1bc0f0b487ad3cc9f34d74da4b80f226fde4ec3/greenlet-3.2.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:ed6cfa9200484d234d8394c70f5492f144b20d4533f69262d530a1a082f6ee9a", size = 692443, upload-time = "2025-06-05T16:48:23.113Z" }, { url = 
"https://files.pythonhosted.org/packages/67/10/b2a4b63d3f08362662e89c103f7fe28894a51ae0bc890fabf37d1d780e52/greenlet-3.2.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:02b0df6f63cd15012bed5401b47829cfd2e97052dc89da3cfaf2c779124eb892", size = 692995, upload-time = "2025-06-05T16:13:07.972Z" }, { url = "https://files.pythonhosted.org/packages/5a/c6/ad82f148a4e3ce9564056453a71529732baf5448ad53fc323e37efe34f66/greenlet-3.2.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86c2d68e87107c1792e2e8d5399acec2487a4e993ab76c792408e59394d52141", size = 655320, upload-time = "2025-06-05T16:12:53.453Z" }, { url = "https://files.pythonhosted.org/packages/5c/4f/aab73ecaa6b3086a4c89863d94cf26fa84cbff63f52ce9bc4342b3087a06/greenlet-3.2.3-cp314-cp314-win_amd64.whl", hash = "sha256:8c47aae8fbbfcf82cc13327ae802ba13c9c36753b67e760023fd116bc124a62a", size = 301236, upload-time = "2025-06-05T16:15:20.111Z" }, @@ -1555,14 +1541,14 @@ wheels = [ [[package]] name = "requests-toolbelt" -version = "0.10.1" +version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0c/4c/07f01c6ac44f7784fa399137fbc8d0cdc1b5d35304e8c0f278ad82105b58/requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d", size = 208956, upload-time = "2022-10-25T03:14:58.576Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/d3/bf87a36bff1cb88fd30a509fd366c70ec30676517ee791b2f77e0e29817a/requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = 
"sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", size = 54525, upload-time = "2022-10-25T03:14:55.289Z" }, + { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" }, ] [[package]] @@ -1827,11 +1813,11 @@ wheels = [ [[package]] name = "urllib3" -version = "1.26.15" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/21/79/6372d8c0d0641b4072889f3ff84f279b738cd8595b64c8e0496d4e848122/urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305", size = 301444, upload-time = "2023-03-11T00:01:41.302Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash = "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7b/f5/890a0baca17a61c1f92f72b81d3c31523c99bec609e60c292ea55b387ae8/urllib3-1.26.15-py2.py3-none-any.whl", hash = "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42", size = 140881, upload-time = "2023-03-11T00:01:39.031Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]]