From 16d166f3d64e9bf593aa3c426335b806cc300dbc Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 08:19:34 -0400 Subject: [PATCH 01/30] update ignore files --- .dockerignore | 5 +++++ .gitignore | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 9596778..4cff36c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -8,8 +8,13 @@ .env Dockerfile docker-compose.yml +dockerhub-publish.sh __pycache__/ +notebooks/ +docs/ +k8s/ + # data folders staging.db data/ diff --git a/.gitignore b/.gitignore index 8b20829..1366847 100644 --- a/.gitignore +++ b/.gitignore @@ -20,4 +20,6 @@ data-public/ .DS_Store .Trash-0/ -staging.duckdb* \ No newline at end of file +staging.duckdb* + +k8s/*-kubeconfig.yaml From f2495a06076947bd20c6333908e64ce9980ad8ca Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 08:20:01 -0400 Subject: [PATCH 02/30] update docker and python deps for security --- Dockerfile | 5 +++-- docker-compose.yml | 9 +++++++++ pyproject.toml | 4 ++-- uv.lock | 27 ++++++++++++--------------- 4 files changed, 26 insertions(+), 19 deletions(-) diff --git a/Dockerfile b/Dockerfile index 2728242..93ae26d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,10 +1,11 @@ -FROM --platform=linux/amd64 python:3.12-slim-bookworm +FROM --platform=linux/amd64 python:3.12.13-slim-trixie ENV TZ=America/New_York # Update package lists and setup Python with uv RUN apt-get update && \ + apt-get upgrade -y && \ apt-get install -y --no-install-recommends \ - openssh-client curl ca-certificates python3 unzip && \ + openssh-client curl ca-certificates unzip && \ rm -rf /var/lib/apt/lists/* ADD https://astral.sh/uv/install.sh /uv-installer.sh RUN sh /uv-installer.sh && rm /uv-installer.sh diff --git a/docker-compose.yml b/docker-compose.yml index 1763d5f..950a6d9 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,15 @@ services: user: 'root' volumes: - .:/app + environment: + DATABASE_URL: 
${DATABASE_URL} + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY} + AWS_S3_BUCKET_NAME: ${AWS_S3_BUCKET_NAME} + SFTP_HOST: ${SFTP_HOST} + SFTP_USER: ${SFTP_USER} + SFTP_PSWD: ${SFTP_PSWD} + SFTP_DIR: ${SFTP_DIR} # to debug in the container uncomment below # then use use > docker compose up -d # > docker compose exec app /bin/bash diff --git a/pyproject.toml b/pyproject.toml index a334efd..95c649a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,8 +17,8 @@ dependencies = [ "python-dotenv>=1.1.0", "python-geosupport>=1.1.0", "requests>=2.32.4", - "requests-toolbelt==0.10.1", + "requests-toolbelt>=1.0.0", "sqlalchemy>=2.0.41", - "urllib3==1.26.15", + "urllib3>=2.6.0", "usaddress>=0.5.14", ] diff --git a/uv.lock b/uv.lock index e7d6060..48f1be5 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.12" [[package]] @@ -53,9 +53,9 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.1.0" }, { name = "python-geosupport", specifier = ">=1.1.0" }, { name = "requests", specifier = ">=2.32.4" }, - { name = "requests-toolbelt", specifier = "==0.10.1" }, + { name = "requests-toolbelt", specifier = ">=1.0.0" }, { name = "sqlalchemy", specifier = ">=2.0.41" }, - { name = "urllib3", specifier = "==1.26.15" }, + { name = "urllib3", specifier = ">=2.6.0" }, { name = "usaddress", specifier = ">=0.5.14" }, ] @@ -260,15 +260,15 @@ wheels = [ [[package]] name = "censusgeocode" -version = "0.5.2" +version = "0.5.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, { name = "requests-toolbelt" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f2/f5/83c9a6aead949cfda4d21f7fed673f8a40d859a4e4c05e3ff30a8f021b08/censusgeocode-0.5.2.tar.gz", hash = "sha256:ee590d1b7806c630b4a6e60adfa572abf502c95fb2d9489ac045bdda46edde38", size = 20898, upload-time = "2022-01-22T16:22:23.784Z" } +sdist = { url = 
"https://files.pythonhosted.org/packages/25/b7/74615b3db872f28e3d6ffcbcef0a8f0b4ba1d2982a7fb5c12d233eb204a0/censusgeocode-0.5.3.tar.gz", hash = "sha256:6b26c71495ce860e38ece54032552fd1e112fec6a16f8c7fff88788d84f1dfbc", size = 22239, upload-time = "2026-02-08T00:06:50.716Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/f7/eba876fa146e434ab29b295213ca3357d8708f8e4e730453096d5d63b151/censusgeocode-0.5.2-py3-none-any.whl", hash = "sha256:fa2a9e0d44a7216fb5c3d031fc09d6c2008de28cec45495aaad5d309ef06b98b", size = 9178, upload-time = "2022-01-22T16:22:22.395Z" }, + { url = "https://files.pythonhosted.org/packages/1f/c0/d3f062406149c5744364286c48c6edf3e15e6955b577a9e3f9503337e4a6/censusgeocode-0.5.3-py3-none-any.whl", hash = "sha256:f6dc7e5f6162593f8f93b8d35bb756ff96e025f2221dc93ad13b516fb38d18f7", size = 20633, upload-time = "2026-02-08T00:06:49.327Z" }, ] [[package]] @@ -506,7 +506,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f3/94/ad0d435f7c48debe960c53b8f60fb41c2026b1d0fa4a99a1cb17c3461e09/greenlet-3.2.3-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:25ad29caed5783d4bd7a85c9251c651696164622494c00802a139c00d639242d", size = 271992, upload-time = "2025-06-05T16:11:23.467Z" }, { url = "https://files.pythonhosted.org/packages/93/5d/7c27cf4d003d6e77749d299c7c8f5fd50b4f251647b5c2e97e1f20da0ab5/greenlet-3.2.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:88cd97bf37fe24a6710ec6a3a7799f3f81d9cd33317dcf565ff9950c83f55e0b", size = 638820, upload-time = "2025-06-05T16:38:52.882Z" }, { url = "https://files.pythonhosted.org/packages/c6/7e/807e1e9be07a125bb4c169144937910bf59b9d2f6d931578e57f0bce0ae2/greenlet-3.2.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:baeedccca94880d2f5666b4fa16fc20ef50ba1ee353ee2d7092b383a243b0b0d", size = 653046, upload-time = "2025-06-05T16:41:36.343Z" }, - { url = 
"https://files.pythonhosted.org/packages/9d/ab/158c1a4ea1068bdbc78dba5a3de57e4c7aeb4e7fa034320ea94c688bfb61/greenlet-3.2.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:be52af4b6292baecfa0f397f3edb3c6092ce071b499dd6fe292c9ac9f2c8f264", size = 647701, upload-time = "2025-06-05T16:48:19.604Z" }, { url = "https://files.pythonhosted.org/packages/cc/0d/93729068259b550d6a0288da4ff72b86ed05626eaf1eb7c0d3466a2571de/greenlet-3.2.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0cc73378150b8b78b0c9fe2ce56e166695e67478550769536a6742dca3651688", size = 649747, upload-time = "2025-06-05T16:13:04.628Z" }, { url = "https://files.pythonhosted.org/packages/f6/f6/c82ac1851c60851302d8581680573245c8fc300253fc1ff741ae74a6c24d/greenlet-3.2.3-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:706d016a03e78df129f68c4c9b4c4f963f7d73534e48a24f5f5a7101ed13dbbb", size = 605461, upload-time = "2025-06-05T16:12:50.792Z" }, { url = "https://files.pythonhosted.org/packages/98/82/d022cf25ca39cf1200650fc58c52af32c90f80479c25d1cbf57980ec3065/greenlet-3.2.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:419e60f80709510c343c57b4bb5a339d8767bf9aef9b8ce43f4f143240f88b7c", size = 1121190, upload-time = "2025-06-05T16:36:48.59Z" }, @@ -515,7 +514,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b1/cf/f5c0b23309070ae93de75c90d29300751a5aacefc0a3ed1b1d8edb28f08b/greenlet-3.2.3-cp313-cp313-macosx_11_0_universal2.whl", hash = "sha256:500b8689aa9dd1ab26872a34084503aeddefcb438e2e7317b89b11eaea1901ad", size = 270732, upload-time = "2025-06-05T16:10:08.26Z" }, { url = "https://files.pythonhosted.org/packages/48/ae/91a957ba60482d3fecf9be49bc3948f341d706b52ddb9d83a70d42abd498/greenlet-3.2.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a07d3472c2a93117af3b0136f246b2833fdc0b542d4a9799ae5f41c28323faef", size = 639033, upload-time = "2025-06-05T16:38:53.983Z" }, { url = 
"https://files.pythonhosted.org/packages/6f/df/20ffa66dd5a7a7beffa6451bdb7400d66251374ab40b99981478c69a67a8/greenlet-3.2.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:8704b3768d2f51150626962f4b9a9e4a17d2e37c8a8d9867bbd9fa4eb938d3b3", size = 652999, upload-time = "2025-06-05T16:41:37.89Z" }, - { url = "https://files.pythonhosted.org/packages/51/b4/ebb2c8cb41e521f1d72bf0465f2f9a2fd803f674a88db228887e6847077e/greenlet-3.2.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:5035d77a27b7c62db6cf41cf786cfe2242644a7a337a0e155c80960598baab95", size = 647368, upload-time = "2025-06-05T16:48:21.467Z" }, { url = "https://files.pythonhosted.org/packages/8e/6a/1e1b5aa10dced4ae876a322155705257748108b7fd2e4fae3f2a091fe81a/greenlet-3.2.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:2d8aa5423cd4a396792f6d4580f88bdc6efcb9205891c9d40d20f6e670992efb", size = 650037, upload-time = "2025-06-05T16:13:06.402Z" }, { url = "https://files.pythonhosted.org/packages/26/f2/ad51331a157c7015c675702e2d5230c243695c788f8f75feba1af32b3617/greenlet-3.2.3-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2c724620a101f8170065d7dded3f962a2aea7a7dae133a009cada42847e04a7b", size = 608402, upload-time = "2025-06-05T16:12:51.91Z" }, { url = "https://files.pythonhosted.org/packages/26/bc/862bd2083e6b3aff23300900a956f4ea9a4059de337f5c8734346b9b34fc/greenlet-3.2.3-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:873abe55f134c48e1f2a6f53f7d1419192a3d1a4e873bace00499a4e45ea6af0", size = 1119577, upload-time = "2025-06-05T16:36:49.787Z" }, @@ -524,7 +522,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d8/ca/accd7aa5280eb92b70ed9e8f7fd79dc50a2c21d8c73b9a0856f5b564e222/greenlet-3.2.3-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:3d04332dddb10b4a211b68111dabaee2e1a073663d117dc10247b5b1642bac86", size = 271479, upload-time = "2025-06-05T16:10:47.525Z" }, { url = 
"https://files.pythonhosted.org/packages/55/71/01ed9895d9eb49223280ecc98a557585edfa56b3d0e965b9fa9f7f06b6d9/greenlet-3.2.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8186162dffde068a465deab08fc72c767196895c39db26ab1c17c0b77a6d8b97", size = 683952, upload-time = "2025-06-05T16:38:55.125Z" }, { url = "https://files.pythonhosted.org/packages/ea/61/638c4bdf460c3c678a0a1ef4c200f347dff80719597e53b5edb2fb27ab54/greenlet-3.2.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f4bfbaa6096b1b7a200024784217defedf46a07c2eee1a498e94a1b5f8ec5728", size = 696917, upload-time = "2025-06-05T16:41:38.959Z" }, - { url = "https://files.pythonhosted.org/packages/22/cc/0bd1a7eb759d1f3e3cc2d1bc0f0b487ad3cc9f34d74da4b80f226fde4ec3/greenlet-3.2.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:ed6cfa9200484d234d8394c70f5492f144b20d4533f69262d530a1a082f6ee9a", size = 692443, upload-time = "2025-06-05T16:48:23.113Z" }, { url = "https://files.pythonhosted.org/packages/67/10/b2a4b63d3f08362662e89c103f7fe28894a51ae0bc890fabf37d1d780e52/greenlet-3.2.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:02b0df6f63cd15012bed5401b47829cfd2e97052dc89da3cfaf2c779124eb892", size = 692995, upload-time = "2025-06-05T16:13:07.972Z" }, { url = "https://files.pythonhosted.org/packages/5a/c6/ad82f148a4e3ce9564056453a71529732baf5448ad53fc323e37efe34f66/greenlet-3.2.3-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:86c2d68e87107c1792e2e8d5399acec2487a4e993ab76c792408e59394d52141", size = 655320, upload-time = "2025-06-05T16:12:53.453Z" }, { url = "https://files.pythonhosted.org/packages/5c/4f/aab73ecaa6b3086a4c89863d94cf26fa84cbff63f52ce9bc4342b3087a06/greenlet-3.2.3-cp314-cp314-win_amd64.whl", hash = "sha256:8c47aae8fbbfcf82cc13327ae802ba13c9c36753b67e760023fd116bc124a62a", size = 301236, upload-time = "2025-06-05T16:15:20.111Z" }, @@ -1555,14 +1552,14 @@ wheels = [ 
[[package]] name = "requests-toolbelt" -version = "0.10.1" +version = "1.0.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "requests" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/0c/4c/07f01c6ac44f7784fa399137fbc8d0cdc1b5d35304e8c0f278ad82105b58/requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d", size = 208956, upload-time = "2022-10-25T03:14:58.576Z" } +sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/05/d3/bf87a36bff1cb88fd30a509fd366c70ec30676517ee791b2f77e0e29817a/requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", size = 54525, upload-time = "2022-10-25T03:14:55.289Z" }, + { url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" }, ] [[package]] @@ -1827,11 +1824,11 @@ wheels = [ [[package]] name = "urllib3" -version = "1.26.15" +version = "2.7.0" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/21/79/6372d8c0d0641b4072889f3ff84f279b738cd8595b64c8e0496d4e848122/urllib3-1.26.15.tar.gz", hash = "sha256:8a388717b9476f934a21484e8c8e61875ab60644d29b9b39e11e4b9dc1c6b305", size = 301444, upload-time = "2023-03-11T00:01:41.302Z" } +sdist = { url = "https://files.pythonhosted.org/packages/53/0c/06f8b233b8fd13b9e5ee11424ef85419ba0d8ba0b3138bf360be2ff56953/urllib3-2.7.0.tar.gz", hash 
= "sha256:231e0ec3b63ceb14667c67be60f2f2c40a518cb38b03af60abc813da26505f4c", size = 433602, upload-time = "2026-05-07T16:13:18.596Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/7b/f5/890a0baca17a61c1f92f72b81d3c31523c99bec609e60c292ea55b387ae8/urllib3-1.26.15-py2.py3-none-any.whl", hash = "sha256:aa751d169e23c7479ce47a0cb0da579e3ede798f994f5816a74e4f4500dcea42", size = 140881, upload-time = "2023-03-11T00:01:39.031Z" }, + { url = "https://files.pythonhosted.org/packages/7f/3e/5db95bcf282c52709639744ca2a8b149baccf648e39c8cc87553df9eae0c/urllib3-2.7.0-py3-none-any.whl", hash = "sha256:9fb4c81ebbb1ce9531cce37674bbc6f1360472bc18ca9a553ede278ef7276897", size = 131087, upload-time = "2026-05-07T16:13:17.151Z" }, ] [[package]] From b61884df061b02016d60f6c81506fcbfdc3279d1 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 10:37:13 -0400 Subject: [PATCH 03/30] Runtime controls + schema plumbing + reprocess selectors --- .env.example | 9 +++ README.md | 24 +++++++ lib/database.py | 10 ++- lib/etl.py | 128 ++++++++++++++++++++++++++------- oca_update.py | 38 +++++++++- tests/test_file_selection.py | 55 ++++++++++++++ tests/test_runtime_controls.py | 78 ++++++++++++++++++++ 7 files changed, 312 insertions(+), 30 deletions(-) create mode 100644 tests/test_file_selection.py create mode 100644 tests/test_runtime_controls.py diff --git a/.env.example b/.env.example index 65d4a7b..558bff5 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,15 @@ # Mode (level 1 or 2) MODE=2 +# Optional runtime controls (safe defaults preserve current behavior) +DB_SCHEMA= +S3_PREFIX= +REPROCESS_GLOB= +FORCE_REPROCESS=false +GEOCODE_WORKERS= +CENSUS_BATCH_CHUNK_SIZE=2500 +CSV_ROW_CHECK_CHUNK_SIZE=1000 + # The database URL # ---------------- # diff --git a/README.md b/README.md index 8817437..6cb8b7c 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,30 @@ To run the whole process in the docker container run: docker-compose up ``` +### Runtime controls 
(Step 1 refactor) + +These optional variables let operators isolate schema/data paths and tune memory-sensitive parts of the run. If omitted, behavior remains the same as before (new files only, default schema/search path, default worker/chunk values). + +- `DB_SCHEMA`: set PostgreSQL `search_path` target schema for the ETL session. +- `S3_PREFIX`: optional namespace prefix for S3 object keys (applies to `private/` and `public/` paths). +- `REPROCESS_GLOB`: filename glob against S3 `private/` zip backups (example: `LandlordTenant.Incr.2024-*.zip`). +- `FORCE_REPROCESS`: when `true`, include `REPROCESS_GLOB` matches for replay; otherwise matches are logged and skipped. +- `GEOCODE_WORKERS`: max workers for the Geosupport multiprocessing pool. +- `CENSUS_BATCH_CHUNK_SIZE`: chunk size for Census batch geocoder requests (default `2500`). +- `CSV_ROW_CHECK_CHUNK_SIZE`: chunk size for CSV non-empty checks before S3 import (default `1000`). + +Example Docker run with non-default schema and forced replay: + +```bash +DB_SCHEMA=oca_refactor \ +S3_PREFIX=refactor/dev \ +REPROCESS_GLOB='LandlordTenant.Incr.2024-*.zip' \ +FORCE_REPROCESS=true \ +GEOCODE_WORKERS=4 \ +CENSUS_BATCH_CHUNK_SIZE=2000 \ +docker-compose run --rm app python oca_update.py +``` + ### Jupyter notebook for maintenance Comment out `CMD ["python", "oca_update.py"]` in the Dockerfile diff --git a/lib/database.py b/lib/database.py index 77212cc..be3085d 100644 --- a/lib/database.py +++ b/lib/database.py @@ -31,13 +31,21 @@ def insert_many(table_name, rows): class Database: """Database connection to OCA database""" - def __init__(self, db_url, autocommit = False): + def __init__(self, db_url, schema = '', autocommit = False): self.db_url = db_url + self.schema = schema self.conn = psycopg2.connect(db_url) + if self.schema: + self.set_search_path(self.schema) def __exit__(self, exc_type, exc_value, traceback): self.conn.close() + def set_search_path(self, schema): + with self.conn.cursor() as curs: + 
curs.execute(sql.SQL("SET search_path TO {}, public").format(sql.Identifier(schema))) + self.conn.commit() + def sql(self, SQL, autocommit = False): """ Executes single sql statement diff --git a/lib/etl.py b/lib/etl.py index aa17698..3dca94d 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -5,6 +5,7 @@ import requests import re import json +import fnmatch from datetime import datetime # TODO - replace os.path with Pathlib and its '/' operator from pathlib import Path @@ -49,6 +50,14 @@ S3_PUBLIC_FOLDER = 'public' +def s3_key(path, s3_prefix=''): + normalized_path = path.lstrip('/') + if not s3_prefix: + return normalized_path + normalized_prefix = s3_prefix.strip('/') + return f"{normalized_prefix}/{normalized_path}" + + def make_dir(dir_name): """ Create a new directory in the same folder as this file, @@ -62,7 +71,7 @@ def make_dir(dir_name): return dir_path -def list_new_data_files(sftp, s3): +def list_new_data_files(sftp, s3, s3_prefix=''): """ Get a list of filenames for all the data files available in the SFTP that are not already in the private S3 folder. 
These are the new ones @@ -74,7 +83,7 @@ def list_new_data_files(sftp, s3): """ sftp_zip_files = sftp.list_files(DATA_ZIPFILE_PAT) - s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, S3_PRIVATE_FOLDER) + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) new_sftp_zip_files = list(set(sftp_zip_files) - set(s3_zip_files)) # It's important that everything is processed in order because files @@ -89,6 +98,36 @@ def list_new_data_files(sftp, s3): return files +def list_reprocess_data_files(s3, reprocess_glob, s3_prefix=''): + if not reprocess_glob: + return [] + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) + return sorted([f for f in s3_zip_files if fnmatch.fnmatch(f, reprocess_glob)]) + + +def select_data_files_to_process(new_files, reprocess_files, force_reprocess=False): + def ordered(files): + init_files = sorted([f for f in files if 'Initial' in f]) + incr_files = sorted([f for f in files if 'Incr' in f]) + return init_files + incr_files + + if not reprocess_files: + return ordered(new_files) + + if not force_reprocess: + # Keep backward-compatible default behavior unless force mode is explicitly set. + return ordered(new_files) + + merged = set(new_files) | set(reprocess_files) + return ordered(merged) + + +def csv_has_rows(csv_filepath, chunk_size=1000): + for _ in pd.read_csv(csv_filepath, chunksize=chunk_size): + return True + return False + + def prep_db(s3, db, local_dir): """ Create a new directory in the same folder as this file, @@ -183,7 +222,7 @@ def download_pluto(output_dir): return pluto_file -def upload_public_file(f, pub_dir, mode, s3_args): +def upload_public_file(f, pub_dir, mode, s3_args, s3_prefix=''): """ Uploads a local file from the pub_dir folder to the S3_PUBLIC_FOLDER. 
@@ -198,14 +237,22 @@ def upload_public_file(f, pub_dir, mode, s3_args): # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version if mode == "2" and f == "oca_addresses.csv": s3_filename = "oca_addresses_private.csv" - s3.upload_file(f"{S3_PUBLIC_FOLDER}/{s3_filename}", os.path.join(pub_dir, f)) + s3.upload_file(s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix), os.path.join(pub_dir, f)) del s3 -def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args): +def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None): """ Extract files from SFTP, parse cases, upload to S3 bucket """ + runtime_args = runtime_args or {} + s3_prefix = (runtime_args.get('s3_prefix') or '').strip('/') + reprocess_glob = runtime_args.get('reprocess_glob') or '' + force_reprocess = bool(runtime_args.get('force_reprocess')) + geocode_workers = runtime_args.get('geocode_workers') or multiprocessing.cpu_count() + census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 + csv_row_check_chunk_size = runtime_args.get('csv_row_check_chunk_size') or 1000 + db = Database(**db_args) Path('staging.duckdb').unlink(missing_ok=True) staging_db = DuckDB(dbname='staging.duckdb') @@ -221,20 +268,47 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args): priv_dir = make_dir('data-private') # "private/" pub_dir = make_dir('data-public') # "public/" - # Get list of new files to download from SFTP - new_sftp_zip_files = list_new_data_files(sftp, s3) + # Get default and optional reprocess file selections. 
+ new_sftp_zip_files = list_new_data_files(sftp, s3, s3_prefix=s3_prefix) + reprocess_s3_zip_files = list_reprocess_data_files(s3, reprocess_glob, s3_prefix=s3_prefix) + selected_zip_files = select_data_files_to_process( + new_sftp_zip_files, + reprocess_s3_zip_files, + force_reprocess=force_reprocess + ) + + if reprocess_glob: + print(f"Reprocess selector active: REPROCESS_GLOB={reprocess_glob}, FORCE_REPROCESS={force_reprocess}") + print(f"Matched S3 private files: {len(reprocess_s3_zip_files)}") + if reprocess_s3_zip_files and not force_reprocess: + print('Matched files are excluded unless FORCE_REPROCESS=true.') - # If there are no new files we can stop everything here. - if not new_sftp_zip_files: - print('No new files to download from SFTP. Stopping process.') + # If there are no selected files we can stop everything here. + if not selected_zip_files: + print('No files selected for processing. Stopping process.') return True - # If there are new files, download them. - print('Downloading new files from SFTP:') - for f in new_sftp_zip_files: + # Download selected files from SFTP (new) and optionally from S3 backups. 
+ print('Downloading selected files:') + reprocess_file_set = set(reprocess_s3_zip_files) + new_file_set = set(new_sftp_zip_files) + selected_set = set(selected_zip_files) + + sftp_download_files = sorted(selected_set & new_file_set) + s3_download_files = sorted(selected_set & reprocess_file_set) + + if sftp_download_files: + print(' - From SFTP (new files):') + for f in sftp_download_files: print('-', f) sftp.download_files(f, priv_dir) + if s3_download_files: + print(' - From S3 private backups:') + for f in s3_download_files: + print('-', f) + s3.download_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) + # Sort zipfiles by date def sort_by_date(file): r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.',' ') @@ -312,7 +386,7 @@ def replace_brackets(text): staging_tables = [t + '_staging' for t in OCA_TABLES] public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args)) + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) # reset staging tables then import from s3 to rds @@ -321,7 +395,7 @@ def replace_brackets(text): print('-', f"{t} table to db") # only import to the rds, if the local csv has rows csv_filepath = os.path.join(pub_dir, f"{t}.csv") - if len(pd.read_csv(csv_filepath)): + if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): columns = '' # ignore the appearanceid column if t == 'oca_appearances_staging': @@ -329,7 +403,7 @@ def replace_brackets(text): db.sql(f""" SELECT aws_s3.table_import_from_s3( '{t}', '{columns}', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/{t}.csv', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 
'us-east-1'), aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') ); """) @@ -357,7 +431,7 @@ def replace_brackets(text): db.sql(f""" SELECT * from aws_s3.query_export_to_s3( 'SELECT * from {t}', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/{s3_filename}', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); """) @@ -396,7 +470,7 @@ def replace_brackets(text): # Geocode records using NYC GeoSupport # TODO - check if pluto in the database matches the pluto version of the geosupport # TODO - adjust geocode to put lat/lng on the lot centroid? instead of the centerline/sidewalk - with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool: + with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=addr_cols), records, 10000)) del df_1 # delete unused objects to avoid docker's memory error / 137 @@ -413,7 +487,7 @@ def replace_brackets(text): # geocode_using_census_batch(data_split[2], pub_dir) with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: - chunk_size = 2500 # census batch limit is 10,000. Smaller batches tend to work better + chunk_size = census_batch_chunk_size # census batch limit is 10,000. 
Smaller batches tend to work better data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) del df_2 @@ -436,13 +510,13 @@ def replace_brackets(text): # s3 = S3(**s3_args) # Update "last updated date" files on S3 for the latest file processed - create_date_files(s3, new_sftp_zip_files[-1], pub_dir) + create_date_files(s3, selected_zip_files[-1], pub_dir) print('Uploading public files to S3:') public_files = [i for i in os.listdir(pub_dir) if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args)) + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) # # Create/upload a dump of the database as a backup @@ -454,7 +528,7 @@ def replace_brackets(text): for f in os.listdir(priv_dir): if f != '.DS_Store': print('-', f) - s3.upload_file(f"{S3_PRIVATE_FOLDER}/{f}", os.path.join(priv_dir, f)) + s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) # reset oca_addresses (removes geom), and uses the geocoded s3 import to overwrite oca_addresses table print('-', f'overwrite oca_addresses with geocoded version') @@ -462,7 +536,7 @@ def replace_brackets(text): db.sql(f""" SELECT aws_s3.table_import_from_s3( 'oca_addresses', '', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_private.csv', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') ); """) # TODO: replace with similar sql query as update_metadata.sql 
to reduce the time this takes (10 mins) @@ -499,7 +573,7 @@ def replace_brackets(text): db.sql(f""" SELECT * from aws_s3.query_export_to_s3( 'SELECT * from oca_addresses_with_bbl', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_with_bbl.csv', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); """) @@ -507,7 +581,7 @@ def replace_brackets(text): db.sql(f""" SELECT * from aws_s3.query_export_to_s3( 'SELECT * from oca_addresses_with_ct', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses_with_ct.csv', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); """) @@ -516,6 +590,6 @@ def replace_brackets(text): db.sql(f""" SELECT * from aws_s3.query_export_to_s3( 'SELECT * from oca_addresses_public', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/oca_addresses.csv', 'us-east-1'), + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); - """) \ No newline at end of file + """) diff --git a/oca_update.py b/oca_update.py index f2c403b..3628094 100644 --- a/oca_update.py +++ b/oca_update.py @@ -2,16 +2,40 @@ import dotenv import os +import argparse from pathlib import Path from lib.etl import oca_etl dotenv.load_dotenv() +def parse_bool(raw_value): + if raw_value is None: + return False + return str(raw_value).strip().lower() in ('1', 'true', 'yes', 'y', 'on') + +def parse_optional_int(raw_value): + if raw_value in (None, ''): + return None + return int(raw_value) + +def parse_args(): + parser = argparse.ArgumentParser(description='Run OCA ETL pipeline') + parser.add_argument('--db-schema', 
default=os.environ.get('DB_SCHEMA', ''), help='Database schema search_path target') + parser.add_argument('--s3-prefix', default=os.environ.get('S3_PREFIX', ''), help='Optional S3 prefix namespace for private/public files') + parser.add_argument('--reprocess-glob', default=os.environ.get('REPROCESS_GLOB', ''), help='Filename glob for S3 private zip reprocessing') + parser.add_argument('--force-reprocess', action='store_true', default=parse_bool(os.environ.get('FORCE_REPROCESS')), help='Reprocess matched files even if already in S3 private backup') + parser.add_argument('--geocode-workers', type=int, default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), help='Worker process count for geocode pool') + parser.add_argument('--census-batch-chunk-size', type=int, default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), help='Chunk size for census batch geocoder input') + parser.add_argument('--csv-row-check-chunk-size', type=int, default=int(os.environ.get('CSV_ROW_CHECK_CHUNK_SIZE', '1000')), help='Chunk size used for constant-memory CSV non-empty checks') + return parser.parse_args() + def main(): + args = parse_args() db_args = { - 'db_url': os.environ.get('DATABASE_URL', '') + 'db_url': os.environ.get('DATABASE_URL', ''), + 'schema': args.db_schema } s3_args = { @@ -33,7 +57,17 @@ def main(): 'db_url': os.environ.get('CLONED_DATABASE_URL', '') } - oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args) + runtime_args = { + 'db_schema': args.db_schema, + 's3_prefix': args.s3_prefix, + 'reprocess_glob': args.reprocess_glob, + 'force_reprocess': args.force_reprocess, + 'geocode_workers': args.geocode_workers, + 'census_batch_chunk_size': args.census_batch_chunk_size, + 'csv_row_check_chunk_size': args.csv_row_check_chunk_size, + } + + oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args) if __name__== "__main__": main() diff --git a/tests/test_file_selection.py b/tests/test_file_selection.py new file mode 100644 index 
0000000..4e6afe9 --- /dev/null +++ b/tests/test_file_selection.py @@ -0,0 +1,55 @@ +import unittest + +from lib.etl import select_data_files_to_process + + +class FileSelectionTests(unittest.TestCase): + def test_default_new_only_behavior(self): + selected = select_data_files_to_process( + new_files=[ + 'LandlordTenant.Initial.FiledIn2024.2024-03-01.zip', + 'LandlordTenant.Incr.2024-03-08.zip', + ], + reprocess_files=[], + force_reprocess=False + ) + self.assertEqual( + selected, + [ + 'LandlordTenant.Initial.FiledIn2024.2024-03-01.zip', + 'LandlordTenant.Incr.2024-03-08.zip', + ] + ) + + def test_reprocess_glob_without_force_skips_matches(self): + selected = select_data_files_to_process( + new_files=['LandlordTenant.Incr.2024-03-08.zip'], + reprocess_files=[ + 'LandlordTenant.Initial.FiledIn2023.2023-01-05.zip', + 'LandlordTenant.Incr.2023-05-05.zip', + ], + force_reprocess=False + ) + self.assertEqual(selected, ['LandlordTenant.Incr.2024-03-08.zip']) + + def test_reprocess_glob_with_force_includes_matches(self): + selected = select_data_files_to_process( + new_files=['LandlordTenant.Incr.2024-03-08.zip'], + reprocess_files=[ + 'LandlordTenant.Initial.FiledIn2023.2023-01-05.zip', + 'LandlordTenant.Incr.2023-05-05.zip', + ], + force_reprocess=True + ) + self.assertEqual( + selected, + [ + 'LandlordTenant.Initial.FiledIn2023.2023-01-05.zip', + 'LandlordTenant.Incr.2023-05-05.zip', + 'LandlordTenant.Incr.2024-03-08.zip', + ] + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_runtime_controls.py b/tests/test_runtime_controls.py new file mode 100644 index 0000000..5566ccd --- /dev/null +++ b/tests/test_runtime_controls.py @@ -0,0 +1,78 @@ +import os +import unittest +from unittest.mock import MagicMock, patch + +import oca_update +from lib.database import Database + + +class RuntimeControlTests(unittest.TestCase): + @patch('oca_update.oca_etl') + def test_main_passes_defaults_when_unset(self, oca_etl_mock): + with patch.dict(os.environ, { + 
'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', + 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + call_args = oca_etl_mock.call_args[0] + db_args = call_args[0] + runtime_args = call_args[5] + self.assertEqual(db_args['schema'], '') + self.assertEqual(runtime_args['db_schema'], '') + self.assertEqual(runtime_args['s3_prefix'], '') + self.assertEqual(runtime_args['reprocess_glob'], '') + self.assertFalse(runtime_args['force_reprocess']) + + @patch('oca_update.oca_etl') + def test_main_non_default_schema_smoke_path(self, oca_etl_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', + 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + 'DB_SCHEMA': 'oca_refactor', + 'S3_PREFIX': 'refactor/dev', + 'REPROCESS_GLOB': 'LandlordTenant.Incr.2024-*.zip', + 'FORCE_REPROCESS': 'true', + 'GEOCODE_WORKERS': '3', + 'CENSUS_BATCH_CHUNK_SIZE': '2000', + 'CSV_ROW_CHECK_CHUNK_SIZE': '500', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + db_args = oca_etl_mock.call_args[0][0] + runtime_args = oca_etl_mock.call_args[0][5] + self.assertEqual(db_args['schema'], 'oca_refactor') + self.assertEqual(runtime_args['s3_prefix'], 'refactor/dev') + self.assertEqual(runtime_args['reprocess_glob'], 'LandlordTenant.Incr.2024-*.zip') + self.assertTrue(runtime_args['force_reprocess']) + self.assertEqual(runtime_args['geocode_workers'], 3) + + @patch('lib.database.psycopg2.connect') + def test_database_sets_search_path_for_schema(self, connect_mock): + conn = MagicMock() + connect_mock.return_value = conn + + Database(db_url='postgres://example', schema='oca_refactor') 
+ + conn.cursor.return_value.__enter__.return_value.execute.assert_called_once() + execute_arg = conn.cursor.return_value.__enter__.return_value.execute.call_args[0][0] + self.assertIn('search_path', str(execute_arg)) + self.assertEqual(conn.commit.call_count, 1) + + +if __name__ == '__main__': + unittest.main() From f60d2c539169ca540aaf00cd3f24e9d70ca1774f Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 14:12:46 -0400 Subject: [PATCH 04/30] Run manifest + stage checkpointing + single-run locking --- lib/database.py | 5 + lib/etl.py | 706 +++++++++++++------------ lib/sql/create_etl_manifest_tables.sql | 56 ++ tests/test_run_manifest.py | 66 +++ 4 files changed, 508 insertions(+), 325 deletions(-) create mode 100644 lib/sql/create_etl_manifest_tables.sql create mode 100644 tests/test_run_manifest.py diff --git a/lib/database.py b/lib/database.py index be3085d..720433d 100644 --- a/lib/database.py +++ b/lib/database.py @@ -65,6 +65,11 @@ def sql_fetch_one(self, SQL): curs.execute(SQL) return curs.fetchone() + def sql_fetch_all(self, SQL): + with self.conn.cursor() as curs: + curs.execute(SQL) + return curs.fetchall() + def insert_rows(self, rows, table_name): """ Inserts many rows, all in the same transaction. 
diff --git a/lib/etl.py b/lib/etl.py index 3dca94d..d5e7c6e 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -6,6 +6,8 @@ import re import json import fnmatch +import uuid +import traceback from datetime import datetime # TODO - replace os.path with Pathlib and its '/' operator from pathlib import Path @@ -15,6 +17,7 @@ import multiprocessing import functools from itertools import repeat +from contextlib import contextmanager from lxml import etree import sys @@ -122,6 +125,163 @@ def ordered(files): return ordered(merged) +def completed_reprocess_files(db, reprocess_files): + if not reprocess_files: + return set() + quoted_files = ",".join(["'" + f.replace("'", "''") + "'" for f in reprocess_files]) + rows = db.sql_fetch_all(f""" + SELECT DISTINCT ef.file_name + FROM etl_files ef + JOIN etl_runs er ON er.run_id = ef.run_id + WHERE ef.status = 'completed' + AND er.status = 'completed' + AND ef.file_name IN ({quoted_files}) + """) + return {row[0] for row in rows} + + +class EtlRunManifest: + @staticmethod + def _escape(value): + return str(value).replace("'", "''") + + def _literal(self, value): + return f"'{self._escape(value)}'" + + def _json_literal(self, value): + return f"'{self._escape(json.dumps(value))}'::jsonb" + + def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess): + self.db = db + self.schema_name = schema_name or 'public' + self.s3_prefix = s3_prefix or '' + self.mode = mode + self.reprocess_glob = reprocess_glob or '' + self.force_reprocess = force_reprocess + self.run_id = str(uuid.uuid4()) + self.lock_key = None + self.lock_acquired = False + + def setup_tables(self): + self.db.execute_sql_file('create_etl_manifest_tables.sql') + + def acquire_lock(self): + row = self.db.sql_fetch_one( + f"SELECT hashtext('oca_etl:' || {self._literal(self.schema_name)})::bigint" + ) + self.lock_key = row[0] + locked = self.db.sql_fetch_one(f"SELECT pg_try_advisory_lock({self.lock_key})") + self.lock_acquired = bool(locked and locked[0]) 
+ if not self.lock_acquired: + raise RuntimeError(f"Another ETL run is already active for schema '{self.schema_name}'.") + + def release_lock(self): + if self.lock_acquired and self.lock_key is not None: + self.db.sql_fetch_one(f"SELECT pg_advisory_unlock({self.lock_key})") + self.lock_acquired = False + + def create_run(self): + payload = { + "mode": self.mode, + "schema_name": self.schema_name, + "s3_prefix": self.s3_prefix, + "reprocess_glob": self.reprocess_glob, + "force_reprocess": self.force_reprocess, + } + self.db.sql(f""" + INSERT INTO etl_runs ( + run_id, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess, status, metadata, started_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(self.schema_name)}, {self._literal(self.s3_prefix)}, + {self._literal(self.mode)}, {self._literal(self.reprocess_glob)}, + {str(self.force_reprocess).upper()}, 'running', {self._json_literal(payload)}, NOW() + ) + """) + + def mark_run_completed(self, selected_count, processed_count, skipped_count): + self.db.sql(f""" + UPDATE etl_runs + SET status = 'completed', + completed_at = NOW(), + selected_file_count = {selected_count}, + processed_file_count = {processed_count}, + skipped_file_count = {skipped_count} + WHERE run_id = '{self.run_id}' + """) + + def mark_run_failed(self, exc): + message = str(exc) + details = {"traceback": traceback.format_exc()} + self.db.sql(f""" + UPDATE etl_runs + SET status = 'failed', + completed_at = NOW(), + error_message = {self._literal(message)}, + error_details = {self._json_literal(details)} + WHERE run_id = {self._literal(self.run_id)} + """) + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + stage_value = "NULL" if stage is None else self._literal(stage) + details_value = self._json_literal(details or {}) + error_message = "NULL" if error is None else self._literal(str(error)) + error_details = "NULL" if error is None else self._json_literal({'traceback': 
traceback.format_exc()}) + completed_at = "NOW()" if status in ("completed", "failed", "skipped") else "NULL" + started_at = "NOW()" if status in ("processing", "downloaded", "parsed", "promoted") else "NULL" + self.db.sql(f""" + INSERT INTO etl_files ( + run_id, file_name, source, status, stage, details, started_at, completed_at, error_message, error_details, updated_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(file_name)}, {self._literal(source)}, {self._literal(status)}, {stage_value}, + {details_value}, {started_at}, {completed_at}, {error_message}, {error_details}, NOW() + ) + ON CONFLICT (run_id, file_name) DO UPDATE + SET source = EXCLUDED.source, + status = EXCLUDED.status, + stage = EXCLUDED.stage, + details = EXCLUDED.details, + started_at = COALESCE(etl_files.started_at, EXCLUDED.started_at), + completed_at = EXCLUDED.completed_at, + error_message = EXCLUDED.error_message, + error_details = EXCLUDED.error_details, + updated_at = NOW() + """) + + def upsert_step(self, step_name, status, details=None, error=None): + details_value = self._json_literal(details or {}) + started_at = "NOW()" if status == "running" else "NULL" + completed_at = "NOW()" if status in ("completed", "failed") else "NULL" + error_message = "NULL" if error is None else self._literal(str(error)) + error_details = "NULL" if error is None else self._json_literal({'traceback': traceback.format_exc()}) + self.db.sql(f""" + INSERT INTO etl_steps ( + run_id, step_name, status, started_at, completed_at, error_message, error_details, details, updated_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(step_name)}, {self._literal(status)}, {started_at}, {completed_at}, + {error_message}, {error_details}, {details_value}, NOW() + ) + ON CONFLICT (run_id, step_name) DO UPDATE + SET status = EXCLUDED.status, + started_at = COALESCE(etl_steps.started_at, EXCLUDED.started_at), + completed_at = EXCLUDED.completed_at, + error_message = EXCLUDED.error_message, + 
error_details = EXCLUDED.error_details, + details = EXCLUDED.details, + updated_at = NOW() + """) + + +@contextmanager +def manifest_step(manifest, step_name, details=None): + manifest.upsert_step(step_name, 'running', details=details) + try: + yield + manifest.upsert_step(step_name, 'completed', details=details) + except Exception as exc: + manifest.upsert_step(step_name, 'failed', details=details, error=exc) + raise + + def csv_has_rows(csv_filepath, chunk_size=1000): for _ in pd.read_csv(csv_filepath, chunksize=chunk_size): return True @@ -254,342 +414,238 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None csv_row_check_chunk_size = runtime_args.get('csv_row_check_chunk_size') or 1000 db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=(runtime_args.get('db_schema') or db_args.get('schema') or 'public'), + s3_prefix=s3_prefix, + mode=mode, + reprocess_glob=reprocess_glob, + force_reprocess=force_reprocess + ) + manifest.setup_tables() + manifest.acquire_lock() + manifest.create_run() + Path('staging.duckdb').unlink(missing_ok=True) staging_db = DuckDB(dbname='staging.duckdb') sftp = Sftp(**sftp_args) s3 = S3(**s3_args) - - - - # Create local versions of folder in the S3 bucket "oca-data" - # # For debugging only -- replace with the var declarations below - # priv_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data-private')) - # pub_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data-public')) - priv_dir = make_dir('data-private') # "private/" - pub_dir = make_dir('data-public') # "public/" - - # Get default and optional reprocess file selections. 
- new_sftp_zip_files = list_new_data_files(sftp, s3, s3_prefix=s3_prefix) - reprocess_s3_zip_files = list_reprocess_data_files(s3, reprocess_glob, s3_prefix=s3_prefix) - selected_zip_files = select_data_files_to_process( - new_sftp_zip_files, - reprocess_s3_zip_files, - force_reprocess=force_reprocess - ) - - if reprocess_glob: - print(f"Reprocess selector active: REPROCESS_GLOB={reprocess_glob}, FORCE_REPROCESS={force_reprocess}") - print(f"Matched S3 private files: {len(reprocess_s3_zip_files)}") - if reprocess_s3_zip_files and not force_reprocess: - print('Matched files are excluded unless FORCE_REPROCESS=true.') - - # If there are no selected files we can stop everything here. - if not selected_zip_files: - print('No files selected for processing. Stopping process.') - return True - - # Download selected files from SFTP (new) and optionally from S3 backups. - print('Downloading selected files:') - reprocess_file_set = set(reprocess_s3_zip_files) - new_file_set = set(new_sftp_zip_files) - selected_set = set(selected_zip_files) - - sftp_download_files = sorted(selected_set & new_file_set) - s3_download_files = sorted(selected_set & reprocess_file_set) - - if sftp_download_files: - print(' - From SFTP (new files):') - for f in sftp_download_files: - print('-', f) - sftp.download_files(f, priv_dir) - - if s3_download_files: - print(' - From S3 private backups:') - for f in s3_download_files: - print('-', f) - s3.download_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - - # Sort zipfiles by date - def sort_by_date(file): - r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.',' ') - return r - local_zip_files = sorted([os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')], key = sort_by_date) - - # Rebuild the staging tables - # Then for each zipfile, unzip the XML file and - # parse it into the staging tables - print(' - Creating staging tables...') - 
staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') - print('Processing files:') - for zip_file in local_zip_files: - print('-', os.path.basename(zip_file)) - print(' - Parsing XML file...') - # takes about 4-5 minutes per xml - extract_date = None - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - for _, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')): - # Grab the first date and break - if not extract_date: - extract_date = elem.text - break - - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - parse_file(xml_file, staging_db, extract_date) - - # export staging tables to the pub_dir, upload to s3, and then rds - staging_db.export_tables_to_csv(output_dir=pub_dir) - - def preprocess_csvs(pub_dir): - """Convert all CSV files from DuckDB to PostgreSQL array format; and make small corrections (todo fix this in the parser/ duckdb export)""" - for filename in os.listdir(pub_dir): - if filename.endswith('.csv'): - file_path = os.path.join(pub_dir, filename) - df = pd.read_csv(file_path) - for col in df.columns: - if df[col].dtype == 'object': - # Convert arrays: [anything] -> {anything} - # But only if the content doesn't contain JSON objects - def replace_brackets(text): - if pd.isna(text) or not isinstance(text, str): + priv_dir = make_dir('data-private') + pub_dir = make_dir('data-public') + selected_zip_files = [] + skipped_reprocess_files = [] + new_file_set = set() + + try: + manifest.upsert_step('select_files', 'running') + new_sftp_zip_files = list_new_data_files(sftp, s3, s3_prefix=s3_prefix) + reprocess_s3_zip_files = list_reprocess_data_files(s3, reprocess_glob, s3_prefix=s3_prefix) + if reprocess_glob and not force_reprocess and reprocess_s3_zip_files: + already_completed = completed_reprocess_files(db, reprocess_s3_zip_files) + skipped_reprocess_files = sorted(already_completed) + reprocess_s3_zip_files = sorted(set(reprocess_s3_zip_files) - already_completed) + + selected_zip_files = 
select_data_files_to_process( + new_sftp_zip_files, + reprocess_s3_zip_files, + force_reprocess=force_reprocess + ) + manifest.upsert_step('select_files', 'completed', details={'selected_file_count': len(selected_zip_files)}) + + if reprocess_glob: + print(f"Reprocess selector active: REPROCESS_GLOB={reprocess_glob}, FORCE_REPROCESS={force_reprocess}") + print(f"Matched S3 private files: {len(reprocess_s3_zip_files)}") + if skipped_reprocess_files and not force_reprocess: + print(f"Skipping already-completed reprocess files from manifest: {len(skipped_reprocess_files)}") + + if not selected_zip_files: + print('No files selected for processing. Stopping process.') + manifest.mark_run_completed(0, 0, len(skipped_reprocess_files)) + return True + + reprocess_file_set = set(reprocess_s3_zip_files) + new_file_set = set(new_sftp_zip_files) + selected_set = set(selected_zip_files) + sftp_download_files = sorted(selected_set & new_file_set) + s3_download_files = sorted(selected_set & reprocess_file_set) + + for f in sftp_download_files: + manifest.upsert_file(f, source='sftp', status='selected', stage='select') + for f in s3_download_files: + manifest.upsert_file(f, source='s3_private', status='selected', stage='select') + for f in skipped_reprocess_files: + manifest.upsert_file(f, source='s3_private', status='skipped', stage='select', details={'reason': 'already_completed_manifest'}) + + manifest.upsert_step('download_files', 'running') + print('Downloading selected files:') + for f in sftp_download_files: + print('-', f) + sftp.download_files(f, priv_dir) + manifest.upsert_file(f, source='sftp', status='downloaded', stage='download') + for f in s3_download_files: + print('-', f) + s3.download_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) + manifest.upsert_file(f, source='s3_private', status='downloaded', stage='download') + manifest.upsert_step('download_files', 'completed') + + def sort_by_date(file): + r = re.search(r'(\d+.+)\.zip', 
file).group(0).replace('.', ' ') + return r + local_zip_files = sorted([os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')], key=sort_by_date) + + manifest.upsert_step('parse_xml', 'running') + staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + print('Processing files:') + for zip_file in local_zip_files: + file_name = os.path.basename(zip_file) + manifest.upsert_file(file_name, source='local', status='processing', stage='parse') + extract_date = None + with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: + for _, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')): + if not extract_date: + extract_date = elem.text + break + with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: + parse_file(xml_file, staging_db, extract_date) + manifest.upsert_file(file_name, source='local', status='parsed', stage='parse', details={'extract_date': extract_date}) + manifest.upsert_step('parse_xml', 'completed') + + staging_db.export_tables_to_csv(output_dir=pub_dir) + + def preprocess_csvs(target_dir): + for filename in os.listdir(target_dir): + if filename.endswith('.csv'): + file_path = os.path.join(target_dir, filename) + df = pd.read_csv(file_path) + for col in df.columns: + if df[col].dtype == 'object': + def replace_brackets(text): + if pd.isna(text) or not isinstance(text, str): + return text + if text.startswith('[') and text.endswith(']'): + inner = text[1:-1].strip() + if inner.startswith('{') and inner.endswith('}'): + return text + return '{' + text[1:-1] + '}' return text + df[col] = df[col].apply(replace_brackets) + if filename.startswith('oca_appearances'): + if 'appearanceid' in df.columns: + del df['appearanceid'] + df['motionsequence'] = df['motionsequence'].astype('Int64') + if filename.startswith('oca_judgments'): + df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') + if filename.startswith('oca_warrants'): + df['executionstayeddays'] = 
df['executionstayeddays'].astype('Int64') + df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') + df.to_csv(file_path, index=False) + + preprocess_csvs(pub_dir) + staging_tables = [t + '_staging' for t in OCA_TABLES] + public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + + manifest.upsert_step('promote_staging', 'running') + db.execute_sql_file('create_tables_staging.sql') + for t in staging_tables: + csv_filepath = os.path.join(pub_dir, f"{t}.csv") + if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): + columns = '' + if t == 'oca_appearances_staging': + columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' + db.sql(f""" + SELECT aws_s3.table_import_from_s3( + '{t}', '{columns}', '(FORMAT CSV, HEADER)', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 'us-east-1'), + aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') + ); + """) - if text.startswith('[') and text.endswith(']'): - inner_content = text[1:-1].strip() - # Don't replace if the inner content is wrapped in {} - # Todo: fix appearanceoutcomes that are blank [] ... 
they are still converted to {} - if inner_content.startswith('{') and inner_content.endswith('}'): - return text # Keep original - it's [{}] format - else: - return '{' + text[1:-1] + '}' # Convert [] to {} - - return text - - df[col] = df[col].apply(replace_brackets) - - if filename.startswith('oca_appearances'): - # remove the appearanceid column, BIGSERIAL is assigned in postgres - if 'appearanceid' in df.columns: del df['appearanceid'] - # change motionsequence to a int instead of a float - df['motionsequence'] = df['motionsequence'].astype('Int64') - - if filename.startswith('oca_judgments'): - df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') - - if filename.startswith('oca_warrants'): - df['executionstayeddays'] = df['executionstayeddays'].astype('Int64') - df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') - - df.to_csv(file_path, index=False) - - print('Convert csvs:') - preprocess_csvs(pub_dir) - staging_tables = [t + '_staging' for t in OCA_TABLES] - public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) - pool.starmap(upload_public_file, files_zip) - - # reset staging tables then import from s3 to rds - db.execute_sql_file('create_tables_staging.sql') - for t in staging_tables: - print('-', f"{t} table to db") - # only import to the rds, if the local csv has rows - csv_filepath = os.path.join(pub_dir, f"{t}.csv") - if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): - columns = '' - # ignore the appearanceid column - if t == 'oca_appearances_staging': - columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' + db.execute_sql_file('update_appearance_outcomes.sql') + insert_staging_to_main(db, OCA_TABLES) + 
db.execute_sql_file('update_metadata.sql') + for selected_name in selected_zip_files: + source = 'sftp' if selected_name in new_file_set else 's3_private' + manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') + manifest.upsert_step('promote_staging', 'completed') + + manifest.upsert_step('publish_tables', 'running') + for t in OCA_TABLES: + s3_filename = t + '.csv' + if t == "oca_addresses": + s3_filename = "oca_addresses_private.csv" db.sql(f""" - SELECT aws_s3.table_import_from_s3( - '{t}', '{columns}', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from {t}', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); + """) + manifest.upsert_step('publish_tables', 'completed') + + manifest.upsert_step('geocode_refresh', 'running') + csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") + db.export_csv('oca_addresses', csv_filepath) + input_csv = Path(pub_dir) / 'oca_addresses_private.csv' + output_csv = Path(pub_dir) / 'oca_addresses_private.csv' + df = pd.read_csv(input_csv, dtype=str, index_col=False, usecols=lambda x: x, keep_default_na=False) + df_1 = df[((pd.isna(df['lat'])) | (df['lat'] == '')) & ((df['house_number'] != '') | (pd.notna(df['house_number'])))].copy().reset_index() + records = df_1.to_dict('records') + with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: + it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=['street1', 'city', 'postalcode']), records, 10000)) + df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() + with 
multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: + chunk_size = census_batch_chunk_size + data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) + it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) + export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] + concat = pd.concat([df, it, it_2], ignore_index=True).drop_duplicates(subset=['indexnumberid'], ignore_index=True, keep='last')[export_cols] + pd.DataFrame(concat).to_csv(output_csv, index=False) + create_date_files(s3, selected_zip_files[-1], pub_dir) + public_files = [i for i in os.listdir(pub_dir) if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + for f in os.listdir(priv_dir): + if f != '.DS_Store': + s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) + db.execute_sql_file('reset_addresses_table.sql') + db.sql(f""" + SELECT aws_s3.table_import_from_s3( + 'oca_addresses', '', '(FORMAT CSV, HEADER)', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), + aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') + ); + """) + db.execute_sql_file('create_addresses_views.sql') + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from oca_addresses_with_bbl', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 
'{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); """) - - # expand appearance_outcomes from json - print('\n - Updating appearance outcomes...') - db.execute_sql_file('update_appearance_outcomes.sql') - - print('\n - Inserting from staging to main ...') - # moves records from staging tables to the main tables, skips oca_metadata - insert_staging_to_main(db, OCA_TABLES) - - # Merging in oca_metadata using case if logic - print('\n - Update metadata in main ...') - db.execute_sql_file('update_metadata.sql') - - - # Export the rds tables to csv files directly into the s3 bucket - for t in OCA_TABLES: - print('-', f'{t} table from db to s3') - # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version - s3_filename = t + '.csv' - if t == "oca_addresses": - s3_filename = "oca_addresses_private.csv" db.sql(f""" SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from {t}', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), + 'SELECT * from oca_addresses_with_ct', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); - """) - - # Export oca_addresses_private.csv to pub_dir to geocode - csv_filepath = os.path.join(pub_dir, f"oca_addresses_private.csv") - db.export_csv('oca_addresses', csv_filepath) - - input_csv = Path(pub_dir) / 'oca_addresses_private.csv' - output_csv = Path(pub_dir) /'oca_addresses_private.csv' - addr_cols = ['street1', 'city', 'postalcode'] - - #keep all cols - keep_cols = lambda x: x - - df = pd.read_csv( - input_csv, - dtype = str, - index_col = False, - usecols=keep_cols, - keep_default_na=False - ) - - #filter for only records that need to be geocoded - df_1 = df[ - ((pd.isna(df['lat'])) | (df['lat'] == '')) & - ((df['house_number'] != 
'') | (pd.notna(df['house_number']))) - ].copy().reset_index() - - # # DEBUG: geocode all records - # df_1 = df - - print(f'Geocoding {len(df_1)} entries in {output_csv}.') - - records = df_1.to_dict('records') - - # Geocode records using NYC GeoSupport - # TODO - check if pluto in the database matches the pluto version of the geosupport - # TODO - adjust geocode to put lat/lng on the lot centroid? instead of the centerline/sidewalk - with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: - it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=addr_cols), records, 10000)) - - del df_1 # delete unused objects to avoid docker's memory error / 137 - del records - - # Geocode other records using the US Batch Census Geocoder - # Sub-select for all addresses that are missing latitude; also needs to have a house number - df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() - print(f'Geocoding {len(df_2)} entries in {output_csv} using another geocoder. {datetime.now()}') - - # For debugging only - # --- - # data_split = np.split(df_2, range(chunk_size, df_2.shape[0], 10000)) - # geocode_using_census_batch(data_split[2], pub_dir) - - with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: - chunk_size = census_batch_chunk_size # census batch limit is 10,000. Smaller batches tend to work better - data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) - it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) - del df_2 - del data_split - - print(f'Done geocoding. 
{datetime.now()}') - # Concat and drop duplicates by keeping the last changes from US Batch Census Geocoder (overwrites the GeoSupport returns - export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', - 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', - 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', - 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] - concat = pd.concat([df, it, it_2], ignore_index = True).drop_duplicates(subset=['indexnumberid'], ignore_index = True, keep = 'last')[export_cols] - del df - del it - del it_2 - pd.DataFrame(concat).to_csv(output_csv, index=False) - del concat - - # # reset connection to s3 - # s3 = S3(**s3_args) - - # Update "last updated date" files on S3 for the latest file processed - create_date_files(s3, selected_zip_files[-1], pub_dir) - - print('Uploading public files to S3:') - public_files = [i for i in os.listdir(pub_dir) - if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) - pool.starmap(upload_public_file, files_zip) - - # # Create/upload a dump of the database as a backup - # print('Creating database dump and uploading to s3') - # db.dump_to(os.path.join(priv_dir, 'oca.dump')) - - # Upload raw data files and database dump to private folder in S3 bucket - print('Uploading private files to S3:') - for f in os.listdir(priv_dir): - if f != '.DS_Store': - print('-', f) - s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - - # reset oca_addresses (removes geom), and uses the geocoded s3 import to overwrite oca_addresses table - print('-', f'overwrite oca_addresses with geocoded version') - db.execute_sql_file('reset_addresses_table.sql') - db.sql(f""" - SELECT aws_s3.table_import_from_s3( - 
'oca_addresses', '', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) # TODO: replace with similar sql query as update_metadata.sql to reduce the time this takes (10 mins) - - # # setup pluto if it does not exist - # # # TODO: setup census tracts if it does not exist - # if not db.sql_fetch_one( - # "SELECT * FROM information_schema.tables WHERE table_name = 'pluto'"): - # pluto_file = download_pluto(pub_dir) - - - # print('uploading pluto to s3') - # s3.upload_file(f"{S3_PUBLIC_FOLDER}/pluto.csv", pluto_file) - - # print('importing pluto to db') - # db.execute_sql_file('create_pluto_table.sql') - - # db.sql(f""" - # SELECT aws_s3.table_import_from_s3( - # 'pluto', '', '(FORMAT CSV, HEADER)', - # aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', 'public/pluto_24v2.csv', 'us-east-1'), - # aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - # ); - # """) - - # db.execute_sql_file('alter_pluto_table.sql') - - - # create views and grant access to folks - db.execute_sql_file('create_addresses_views.sql') - - # export views directly to s3, each takes 1-2 minutes - print(f"Creating oca_addresses_with_bbl and exporting to S3") - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_bbl', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - - print(f"Creating oca_addresses_with_ct and exporting to S3") - db.sql(f""" + """) + db.sql(f""" SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_ct', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", 
s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); + 'SELECT * from oca_addresses_public', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); """) + manifest.upsert_step('geocode_refresh', 'completed') - # add level-1 version of address table from level-2 data and maintain consistent name - print(f"Creating oca_addresses_public and exporting to S3") - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_public', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) + manifest.mark_run_completed(len(selected_zip_files), len(selected_zip_files), len(skipped_reprocess_files)) + return True + except Exception as exc: + for selected_name in selected_zip_files: + source = 'sftp' if selected_name in new_file_set else 's3_private' + manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) + manifest.mark_run_failed(exc) + raise + finally: + manifest.release_lock() diff --git a/lib/sql/create_etl_manifest_tables.sql b/lib/sql/create_etl_manifest_tables.sql new file mode 100644 index 0000000..9198a8c --- /dev/null +++ b/lib/sql/create_etl_manifest_tables.sql @@ -0,0 +1,56 @@ +CREATE TABLE IF NOT EXISTS etl_runs ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL UNIQUE, + schema_name TEXT NOT NULL, + s3_prefix TEXT NOT NULL DEFAULT '', + mode TEXT, + reprocess_glob TEXT, + force_reprocess BOOLEAN NOT NULL DEFAULT FALSE, + status TEXT NOT NULL, + selected_file_count INTEGER NOT NULL DEFAULT 0, + processed_file_count INTEGER NOT NULL DEFAULT 0, + skipped_file_count INTEGER NOT NULL DEFAULT 0, + started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + metadata JSONB NOT NULL DEFAULT 
'{}'::jsonb +); + +CREATE INDEX IF NOT EXISTS idx_etl_runs_schema_started + ON etl_runs (schema_name, started_at DESC); + +CREATE TABLE IF NOT EXISTS etl_files ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL REFERENCES etl_runs (run_id) ON DELETE CASCADE, + file_name TEXT NOT NULL, + source TEXT NOT NULL, + status TEXT NOT NULL, + stage TEXT, + details JSONB NOT NULL DEFAULT '{}'::jsonb, + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (run_id, file_name) +); + +CREATE INDEX IF NOT EXISTS idx_etl_files_name_status + ON etl_files (file_name, status); + +CREATE TABLE IF NOT EXISTS etl_steps ( + id BIGSERIAL PRIMARY KEY, + run_id TEXT NOT NULL REFERENCES etl_runs (run_id) ON DELETE CASCADE, + step_name TEXT NOT NULL, + status TEXT NOT NULL, + started_at TIMESTAMPTZ, + completed_at TIMESTAMPTZ, + error_message TEXT, + error_details JSONB, + details JSONB NOT NULL DEFAULT '{}'::jsonb, + created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), + UNIQUE (run_id, step_name) +); diff --git a/tests/test_run_manifest.py b/tests/test_run_manifest.py new file mode 100644 index 0000000..1f7d05c --- /dev/null +++ b/tests/test_run_manifest.py @@ -0,0 +1,66 @@ +import unittest + +from lib.etl import EtlRunManifest, completed_reprocess_files, select_data_files_to_process + + +class FakeDb: + def __init__(self): + self.sql_calls = [] + self.fetch_one_queue = [] + self.fetch_all_result = [] + + def execute_sql_file(self, sql_file): + self.sql_calls.append(("execute_sql_file", sql_file)) + + def sql(self, statement): + self.sql_calls.append(("sql", statement)) + + def sql_fetch_one(self, statement): + self.sql_calls.append(("sql_fetch_one", statement)) + if self.fetch_one_queue: + return self.fetch_one_queue.pop(0) + return (None,) + + def sql_fetch_all(self, statement): + 
self.sql_calls.append(("sql_fetch_all", statement)) + return self.fetch_all_result + + +class RunManifestTests(unittest.TestCase): + def test_completed_reprocess_files_filters_manifest_hits(self): + fake_db = FakeDb() + fake_db.fetch_all_result = [("file_a.zip",), ("file_b.zip",)] + completed = completed_reprocess_files(fake_db, ["file_a.zip", "file_c.zip"]) + self.assertEqual(completed, {"file_a.zip", "file_b.zip"}) + + def test_reprocess_without_force_skips_completed_files(self): + new_files = ["LandlordTenant.Incr.2024-03-01.zip"] + reprocess_files = [ + "LandlordTenant.Incr.2023-01-01.zip", + "LandlordTenant.Incr.2023-01-08.zip", + ] + already_completed = {"LandlordTenant.Incr.2023-01-01.zip"} + selected = select_data_files_to_process( + new_files=new_files, + reprocess_files=sorted(set(reprocess_files) - already_completed), + force_reprocess=False, + ) + self.assertEqual(selected, ["LandlordTenant.Incr.2024-03-01.zip"]) + + def test_advisory_lock_failure_raises(self): + fake_db = FakeDb() + fake_db.fetch_one_queue = [(12345,), (False,)] + manifest = EtlRunManifest( + db=fake_db, + schema_name="oca_refactor", + s3_prefix="refactor/dev", + mode="2", + reprocess_glob="", + force_reprocess=False, + ) + with self.assertRaises(RuntimeError): + manifest.acquire_lock() + + +if __name__ == "__main__": + unittest.main() From a10632e00ce7aa976511ad81ab6a46025cc090d4 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 14:35:37 -0400 Subject: [PATCH 05/30] ETL module structure (flat split, move-only) --- lib/etl.py | 677 +++++--------------------------------- lib/etl_constants.py | 22 ++ lib/etl_file_selection.py | 55 ++++ lib/etl_helpers.py | 151 +++++++++ lib/etl_run_manifest.py | 161 +++++++++ lib/etl_stages.py | 287 ++++++++++++++++ 6 files changed, 757 insertions(+), 596 deletions(-) create mode 100644 lib/etl_constants.py create mode 100644 lib/etl_file_selection.py create mode 100644 lib/etl_helpers.py create mode 100644 lib/etl_run_manifest.py 
create mode 100644 lib/etl_stages.py diff --git a/lib/etl.py b/lib/etl.py index d5e7c6e..ebe8f85 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -1,404 +1,68 @@ -import os -import io -import shutil -import zipfile -import requests -import re -import json -import fnmatch -import uuid -import traceback -from datetime import datetime -# TODO - replace os.path with Pathlib and its '/' operator -from pathlib import Path - -import numpy as np -import pandas as pd import multiprocessing -import functools -from itertools import repeat -from contextlib import contextmanager -from lxml import etree -import sys +from pathlib import Path from .database import Database from .duckdb_database import DuckDB +from .etl_constants import ( + DATA_FILENAME, + DATA_ZIPFILE_PAT, + OCA_TABLES, + S3_PRIVATE_FOLDER, + S3_PUBLIC_FOLDER, +) +from .etl_file_selection import ( + list_new_data_files, + list_reprocess_data_files, + select_data_files_to_process, +) +from .etl_helpers import ( + create_date_files, + csv_has_rows, + download_pluto, + insert_staging_to_main, + make_dir, + prep_db, + s3_key, + upload_public_file, +) +from .etl_run_manifest import EtlRunManifest, completed_reprocess_files, manifest_step +from .etl_stages import ( + FileSelection, + download_selected_files, + geocode_and_publish_addresses, + import_and_promote_staging, + parse_xml_to_staging, + preprocess_and_upload_staging_csvs, + publish_core_tables, + select_input_files, +) from .s3 import S3 from .sftp import Sftp -from .parsers import oca_tag, parse_file - -from .geocode_record import geocode_record, geocode_using_census_batch -OCA_TABLES = [ - 'oca_index', - 'oca_causes', - 'oca_addresses', - 'oca_parties', - 'oca_events', - 'oca_appearances', - 'oca_appearance_outcomes', - 'oca_motions', - 'oca_decisions', - 'oca_judgments', - 'oca_warrants', - 'oca_metadata' +__all__ = [ + 'OCA_TABLES', + 'DATA_ZIPFILE_PAT', + 'DATA_FILENAME', + 'S3_PRIVATE_FOLDER', + 'S3_PUBLIC_FOLDER', + 's3_key', + 'make_dir', + 
'list_new_data_files', + 'list_reprocess_data_files', + 'select_data_files_to_process', + 'completed_reprocess_files', + 'EtlRunManifest', + 'manifest_step', + 'csv_has_rows', + 'prep_db', + 'insert_staging_to_main', + 'create_date_files', + 'download_pluto', + 'upload_public_file', + 'FileSelection', + 'oca_etl', ] -DATA_ZIPFILE_PAT = r'LandlordTenant\.(Initial\.FiledIn\d{4}|Incr)\.\d{4}-\d{2}-\d{2}\.zip' - -DATA_FILENAME = 'LandlordTenantExtract.xml' - -S3_PRIVATE_FOLDER = 'private' - -S3_PUBLIC_FOLDER = 'public' - - -def s3_key(path, s3_prefix=''): - normalized_path = path.lstrip('/') - if not s3_prefix: - return normalized_path - normalized_prefix = s3_prefix.strip('/') - return f"{normalized_prefix}/{normalized_path}" - - -def make_dir(dir_name): - """ - Create a new directory in the same folder as this file, - deleting everything in the folder if it already exists - - :param dir_name: The name of the directory to be created as a string - """ - dir_path = os.path.abspath(os.path.join(os.path.dirname(__file__), dir_name)) - shutil.rmtree(dir_path, ignore_errors=True) - os.mkdir(dir_path) - return dir_path - - -def list_new_data_files(sftp, s3, s3_prefix=''): - """ - Get a list of filenames for all the data files available in the SFTP - that are not already in the private S3 folder. These are the new ones - that still need to be processed. They are returned in the proper order - in which they need to be processed. 
- - :param sftp: SFTP object - :param s3: S3 object - """ - - sftp_zip_files = sftp.list_files(DATA_ZIPFILE_PAT) - s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) - new_sftp_zip_files = list(set(sftp_zip_files) - set(s3_zip_files)) - - # It's important that everything is processed in order because files - # can contain modify/delete cases included in past files - init_files = [f for f in new_sftp_zip_files if 'Initial' in f] - incr_files = [f for f in new_sftp_zip_files if 'Incr' in f] - - files = [] - files += sorted(init_files) if init_files else [] - files += sorted(incr_files) if incr_files else [] - - return files - - -def list_reprocess_data_files(s3, reprocess_glob, s3_prefix=''): - if not reprocess_glob: - return [] - s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) - return sorted([f for f in s3_zip_files if fnmatch.fnmatch(f, reprocess_glob)]) - - -def select_data_files_to_process(new_files, reprocess_files, force_reprocess=False): - def ordered(files): - init_files = sorted([f for f in files if 'Initial' in f]) - incr_files = sorted([f for f in files if 'Incr' in f]) - return init_files + incr_files - - if not reprocess_files: - return ordered(new_files) - - if not force_reprocess: - # Keep backward-compatible default behavior unless force mode is explicitly set. 
- return ordered(new_files) - - merged = set(new_files) | set(reprocess_files) - return ordered(merged) - - -def completed_reprocess_files(db, reprocess_files): - if not reprocess_files: - return set() - quoted_files = ",".join(["'" + f.replace("'", "''") + "'" for f in reprocess_files]) - rows = db.sql_fetch_all(f""" - SELECT DISTINCT ef.file_name - FROM etl_files ef - JOIN etl_runs er ON er.run_id = ef.run_id - WHERE ef.status = 'completed' - AND er.status = 'completed' - AND ef.file_name IN ({quoted_files}) - """) - return {row[0] for row in rows} - - -class EtlRunManifest: - @staticmethod - def _escape(value): - return str(value).replace("'", "''") - - def _literal(self, value): - return f"'{self._escape(value)}'" - - def _json_literal(self, value): - return f"'{self._escape(json.dumps(value))}'::jsonb" - - def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess): - self.db = db - self.schema_name = schema_name or 'public' - self.s3_prefix = s3_prefix or '' - self.mode = mode - self.reprocess_glob = reprocess_glob or '' - self.force_reprocess = force_reprocess - self.run_id = str(uuid.uuid4()) - self.lock_key = None - self.lock_acquired = False - - def setup_tables(self): - self.db.execute_sql_file('create_etl_manifest_tables.sql') - - def acquire_lock(self): - row = self.db.sql_fetch_one( - f"SELECT hashtext('oca_etl:' || {self._literal(self.schema_name)})::bigint" - ) - self.lock_key = row[0] - locked = self.db.sql_fetch_one(f"SELECT pg_try_advisory_lock({self.lock_key})") - self.lock_acquired = bool(locked and locked[0]) - if not self.lock_acquired: - raise RuntimeError(f"Another ETL run is already active for schema '{self.schema_name}'.") - - def release_lock(self): - if self.lock_acquired and self.lock_key is not None: - self.db.sql_fetch_one(f"SELECT pg_advisory_unlock({self.lock_key})") - self.lock_acquired = False - - def create_run(self): - payload = { - "mode": self.mode, - "schema_name": self.schema_name, - "s3_prefix": 
self.s3_prefix, - "reprocess_glob": self.reprocess_glob, - "force_reprocess": self.force_reprocess, - } - self.db.sql(f""" - INSERT INTO etl_runs ( - run_id, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess, status, metadata, started_at - ) VALUES ( - {self._literal(self.run_id)}, {self._literal(self.schema_name)}, {self._literal(self.s3_prefix)}, - {self._literal(self.mode)}, {self._literal(self.reprocess_glob)}, - {str(self.force_reprocess).upper()}, 'running', {self._json_literal(payload)}, NOW() - ) - """) - - def mark_run_completed(self, selected_count, processed_count, skipped_count): - self.db.sql(f""" - UPDATE etl_runs - SET status = 'completed', - completed_at = NOW(), - selected_file_count = {selected_count}, - processed_file_count = {processed_count}, - skipped_file_count = {skipped_count} - WHERE run_id = '{self.run_id}' - """) - - def mark_run_failed(self, exc): - message = str(exc) - details = {"traceback": traceback.format_exc()} - self.db.sql(f""" - UPDATE etl_runs - SET status = 'failed', - completed_at = NOW(), - error_message = {self._literal(message)}, - error_details = {self._json_literal(details)} - WHERE run_id = {self._literal(self.run_id)} - """) - - def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): - stage_value = "NULL" if stage is None else self._literal(stage) - details_value = self._json_literal(details or {}) - error_message = "NULL" if error is None else self._literal(str(error)) - error_details = "NULL" if error is None else self._json_literal({'traceback': traceback.format_exc()}) - completed_at = "NOW()" if status in ("completed", "failed", "skipped") else "NULL" - started_at = "NOW()" if status in ("processing", "downloaded", "parsed", "promoted") else "NULL" - self.db.sql(f""" - INSERT INTO etl_files ( - run_id, file_name, source, status, stage, details, started_at, completed_at, error_message, error_details, updated_at - ) VALUES ( - {self._literal(self.run_id)}, 
{self._literal(file_name)}, {self._literal(source)}, {self._literal(status)}, {stage_value}, - {details_value}, {started_at}, {completed_at}, {error_message}, {error_details}, NOW() - ) - ON CONFLICT (run_id, file_name) DO UPDATE - SET source = EXCLUDED.source, - status = EXCLUDED.status, - stage = EXCLUDED.stage, - details = EXCLUDED.details, - started_at = COALESCE(etl_files.started_at, EXCLUDED.started_at), - completed_at = EXCLUDED.completed_at, - error_message = EXCLUDED.error_message, - error_details = EXCLUDED.error_details, - updated_at = NOW() - """) - - def upsert_step(self, step_name, status, details=None, error=None): - details_value = self._json_literal(details or {}) - started_at = "NOW()" if status == "running" else "NULL" - completed_at = "NOW()" if status in ("completed", "failed") else "NULL" - error_message = "NULL" if error is None else self._literal(str(error)) - error_details = "NULL" if error is None else self._json_literal({'traceback': traceback.format_exc()}) - self.db.sql(f""" - INSERT INTO etl_steps ( - run_id, step_name, status, started_at, completed_at, error_message, error_details, details, updated_at - ) VALUES ( - {self._literal(self.run_id)}, {self._literal(step_name)}, {self._literal(status)}, {started_at}, {completed_at}, - {error_message}, {error_details}, {details_value}, NOW() - ) - ON CONFLICT (run_id, step_name) DO UPDATE - SET status = EXCLUDED.status, - started_at = COALESCE(etl_steps.started_at, EXCLUDED.started_at), - completed_at = EXCLUDED.completed_at, - error_message = EXCLUDED.error_message, - error_details = EXCLUDED.error_details, - details = EXCLUDED.details, - updated_at = NOW() - """) - - -@contextmanager -def manifest_step(manifest, step_name, details=None): - manifest.upsert_step(step_name, 'running', details=details) - try: - yield - manifest.upsert_step(step_name, 'completed', details=details) - except Exception as exc: - manifest.upsert_step(step_name, 'failed', details=details, error=exc) - raise - - -def 
csv_has_rows(csv_filepath, chunk_size=1000): - for _ in pd.read_csv(csv_filepath, chunksize=chunk_size): - return True - return False - - -def prep_db(s3, db, local_dir): - """ - Create a new directory in the same folder as this file, - deleting everything in the folder if it already exists - - :param s3: S3 object - :param db: Database object - :param local_dir: Path for local directory to save database dump file - """ - if s3.list_files('oca.dump', S3_PRIVATE_FOLDER): - print('Rebuilding tables from SQL dump') - s3.download_file(f"{S3_PRIVATE_FOLDER}/oca.dump", os.path.join(local_dir, 'oca.dump')) - db.execute_sql_file('create_tables.sql') - db.restore_from(os.path.join(local_dir, 'oca.dump')) - else: - print('Creating tables from scratch') - db.execute_sql_file('create_tables.sql') - - -def insert_staging_to_main(db, tables): - """ - Delete all cases from main tables if they exist in the staging table, - then insert all records from the staging tables to the main tables - - issue: SET session_replication_role = replica - https://stackoverflow.com/questions/3942258/how-do-i-temporarily-disable-triggers-in-postgresql/18709987#18709987 - to a work around to avoid DELETE FROM command stalling. - A VACUUM FULL on all the tables were tried, it does not seem to help - Might be an issue with the staging table schema? 
- - :param db: Database object - """ - - db.sql("SET session_replication_role = replica;") - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Deleting older entries from {table}") - db.sql(f"DELETE FROM {table} WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging)") - db.sql("SET session_replication_role = default;") - - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Inserting to {table}") - db.sql(f"INSERT INTO {table} SELECT * FROM {table}_staging") - db.sql(f"DROP TABLE {table}_staging") - - -def create_date_files(s3, data_file, local_dir): - """ - Create a text file and a custom shield image with date the data was - last updated and add them to the public S3 folder. - - :param s3: S3 object - :param data_file: file path for data being processed - """ - date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) - - txt_file = os.path.join(local_dir, 'last-updated-date.txt') - open(txt_file, 'w').write(date) - - url = f"https://raster.shields.io/badge/Last%20Updated-{date.replace('-', '--')}-yellow" - r = requests.get(url) - img_file = os.path.join(local_dir, 'last-updated-shield.png') - open(img_file, 'wb').write(r.content) - - -def download_pluto(output_dir): - """ - Download and unzip PLUTO into the directory. 
- - :param output_dir: string or Path - """ - print('downloading pluto') - - # Check https://www.nyc.gov/content/planning/pages/resources/datasets/mappluto-pluto-change for updates - PLUTO_CSV_URL = 'https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/pluto/nyc_pluto_25v1_1_csv.zip' - - #download and unzip - response = requests.get(PLUTO_CSV_URL) - content = response.content - z = zipfile.ZipFile(io.BytesIO(content)) - - pluto_csv = [name for name in z.namelist() if '.csv' in name][0] - z.extract(pluto_csv, output_dir) - - # rename - pluto_file = os.path.join(output_dir, "pluto.csv") - os.rename(os.path.join(output_dir, pluto_csv), pluto_file) - - return pluto_file - - -def upload_public_file(f, pub_dir, mode, s3_args, s3_prefix=''): - """ - Uploads a local file from the pub_dir folder to the S3_PUBLIC_FOLDER. - - :param f: filename - :paramp ub_dir: local path folder - :param mode: string - :param s3_args: dict/ kwargs with aws_id, aws_key aws_bucket_name - """ - s3 = S3(**s3_args) - print('-', f) - s3_filename = f - # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version - if mode == "2" and f == "oca_addresses.csv": - s3_filename = "oca_addresses_private.csv" - s3.upload_file(s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix), os.path.join(pub_dir, f)) - del s3 def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None): """ @@ -432,219 +96,40 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None s3 = S3(**s3_args) priv_dir = make_dir('data-private') pub_dir = make_dir('data-public') - selected_zip_files = [] - skipped_reprocess_files = [] - new_file_set = set() + selection = None try: - manifest.upsert_step('select_files', 'running') - new_sftp_zip_files = list_new_data_files(sftp, s3, s3_prefix=s3_prefix) - reprocess_s3_zip_files = list_reprocess_data_files(s3, reprocess_glob, s3_prefix=s3_prefix) - if reprocess_glob and not force_reprocess and 
reprocess_s3_zip_files: - already_completed = completed_reprocess_files(db, reprocess_s3_zip_files) - skipped_reprocess_files = sorted(already_completed) - reprocess_s3_zip_files = sorted(set(reprocess_s3_zip_files) - already_completed) - - selected_zip_files = select_data_files_to_process( - new_sftp_zip_files, - reprocess_s3_zip_files, - force_reprocess=force_reprocess + selection = select_input_files( + manifest, db, sftp, s3, s3_prefix, reprocess_glob, force_reprocess ) - manifest.upsert_step('select_files', 'completed', details={'selected_file_count': len(selected_zip_files)}) - - if reprocess_glob: - print(f"Reprocess selector active: REPROCESS_GLOB={reprocess_glob}, FORCE_REPROCESS={force_reprocess}") - print(f"Matched S3 private files: {len(reprocess_s3_zip_files)}") - if skipped_reprocess_files and not force_reprocess: - print(f"Skipping already-completed reprocess files from manifest: {len(skipped_reprocess_files)}") - - if not selected_zip_files: + if not selection.selected_zip_files: print('No files selected for processing. 
Stopping process.') - manifest.mark_run_completed(0, 0, len(skipped_reprocess_files)) + manifest.mark_run_completed(0, 0, len(selection.skipped_reprocess_files)) return True - reprocess_file_set = set(reprocess_s3_zip_files) - new_file_set = set(new_sftp_zip_files) - selected_set = set(selected_zip_files) - sftp_download_files = sorted(selected_set & new_file_set) - s3_download_files = sorted(selected_set & reprocess_file_set) - - for f in sftp_download_files: - manifest.upsert_file(f, source='sftp', status='selected', stage='select') - for f in s3_download_files: - manifest.upsert_file(f, source='s3_private', status='selected', stage='select') - for f in skipped_reprocess_files: - manifest.upsert_file(f, source='s3_private', status='skipped', stage='select', details={'reason': 'already_completed_manifest'}) - - manifest.upsert_step('download_files', 'running') - print('Downloading selected files:') - for f in sftp_download_files: - print('-', f) - sftp.download_files(f, priv_dir) - manifest.upsert_file(f, source='sftp', status='downloaded', stage='download') - for f in s3_download_files: - print('-', f) - s3.download_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - manifest.upsert_file(f, source='s3_private', status='downloaded', stage='download') - manifest.upsert_step('download_files', 'completed') - - def sort_by_date(file): - r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.', ' ') - return r - local_zip_files = sorted([os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')], key=sort_by_date) - - manifest.upsert_step('parse_xml', 'running') - staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') - print('Processing files:') - for zip_file in local_zip_files: - file_name = os.path.basename(zip_file) - manifest.upsert_file(file_name, source='local', status='processing', stage='parse') - extract_date = None - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - for 
_, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')): - if not extract_date: - extract_date = elem.text - break - with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - parse_file(xml_file, staging_db, extract_date) - manifest.upsert_file(file_name, source='local', status='parsed', stage='parse', details={'extract_date': extract_date}) - manifest.upsert_step('parse_xml', 'completed') - - staging_db.export_tables_to_csv(output_dir=pub_dir) - - def preprocess_csvs(target_dir): - for filename in os.listdir(target_dir): - if filename.endswith('.csv'): - file_path = os.path.join(target_dir, filename) - df = pd.read_csv(file_path) - for col in df.columns: - if df[col].dtype == 'object': - def replace_brackets(text): - if pd.isna(text) or not isinstance(text, str): - return text - if text.startswith('[') and text.endswith(']'): - inner = text[1:-1].strip() - if inner.startswith('{') and inner.endswith('}'): - return text - return '{' + text[1:-1] + '}' - return text - df[col] = df[col].apply(replace_brackets) - if filename.startswith('oca_appearances'): - if 'appearanceid' in df.columns: - del df['appearanceid'] - df['motionsequence'] = df['motionsequence'].astype('Int64') - if filename.startswith('oca_judgments'): - df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') - if filename.startswith('oca_warrants'): - df['executionstayeddays'] = df['executionstayeddays'].astype('Int64') - df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') - df.to_csv(file_path, index=False) - - preprocess_csvs(pub_dir) - staging_tables = [t + '_staging' for t in OCA_TABLES] - public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) - pool.starmap(upload_public_file, files_zip) - - manifest.upsert_step('promote_staging', 
'running') - db.execute_sql_file('create_tables_staging.sql') - for t in staging_tables: - csv_filepath = os.path.join(pub_dir, f"{t}.csv") - if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): - columns = '' - if t == 'oca_appearances_staging': - columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' - db.sql(f""" - SELECT aws_s3.table_import_from_s3( - '{t}', '{columns}', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) - - db.execute_sql_file('update_appearance_outcomes.sql') - insert_staging_to_main(db, OCA_TABLES) - db.execute_sql_file('update_metadata.sql') - for selected_name in selected_zip_files: - source = 'sftp' if selected_name in new_file_set else 's3_private' - manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') - manifest.upsert_step('promote_staging', 'completed') - - manifest.upsert_step('publish_tables', 'running') - for t in OCA_TABLES: - s3_filename = t + '.csv' - if t == "oca_addresses": - s3_filename = "oca_addresses_private.csv" - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from {t}', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - manifest.upsert_step('publish_tables', 'completed') - - manifest.upsert_step('geocode_refresh', 'running') - csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") - db.export_csv('oca_addresses', csv_filepath) - input_csv = Path(pub_dir) / 'oca_addresses_private.csv' - output_csv = Path(pub_dir) / 'oca_addresses_private.csv' - df = pd.read_csv(input_csv, dtype=str, index_col=False, usecols=lambda x: x, keep_default_na=False) 
- df_1 = df[((pd.isna(df['lat'])) | (df['lat'] == '')) & ((df['house_number'] != '') | (pd.notna(df['house_number'])))].copy().reset_index() - records = df_1.to_dict('records') - with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: - it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=['street1', 'city', 'postalcode']), records, 10000)) - df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() - with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: - chunk_size = census_batch_chunk_size - data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) - it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) - export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] - concat = pd.concat([df, it, it_2], ignore_index=True).drop_duplicates(subset=['indexnumberid'], ignore_index=True, keep='last')[export_cols] - pd.DataFrame(concat).to_csv(output_csv, index=False) - create_date_files(s3, selected_zip_files[-1], pub_dir) - public_files = [i for i in os.listdir(pub_dir) if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] - with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) - pool.starmap(upload_public_file, files_zip) - for f in os.listdir(priv_dir): - if f != '.DS_Store': - s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - db.execute_sql_file('reset_addresses_table.sql') - db.sql(f""" - SELECT aws_s3.table_import_from_s3( - 'oca_addresses', '', '(FORMAT CSV, HEADER)', - 
aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) - db.execute_sql_file('create_addresses_views.sql') - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_bbl', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_ct', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_public', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - manifest.upsert_step('geocode_refresh', 'completed') + download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection) + parse_xml_to_staging(manifest, staging_db, priv_dir) + preprocess_and_upload_staging_csvs(staging_db, pub_dir, mode, s3_args, s3_prefix) + import_and_promote_staging( + manifest, db, pub_dir, s3_args, s3_prefix, selection, csv_row_check_chunk_size + ) + publish_core_tables(manifest, db, s3_args, s3_prefix) + geocode_and_publish_addresses( + manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, + geocode_workers, census_batch_chunk_size + ) - manifest.mark_run_completed(len(selected_zip_files), len(selected_zip_files), len(skipped_reprocess_files)) + manifest.mark_run_completed( + len(selection.selected_zip_files), + len(selection.selected_zip_files), + 
len(selection.skipped_reprocess_files) + ) return True except Exception as exc: - for selected_name in selected_zip_files: - source = 'sftp' if selected_name in new_file_set else 's3_private' - manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) + if selection and selection.selected_zip_files: + for selected_name in selection.selected_zip_files: + source = 'sftp' if selected_name in selection.new_file_set else 's3_private' + manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) manifest.mark_run_failed(exc) raise finally: diff --git a/lib/etl_constants.py b/lib/etl_constants.py new file mode 100644 index 0000000..df5d17f --- /dev/null +++ b/lib/etl_constants.py @@ -0,0 +1,22 @@ +OCA_TABLES = [ + 'oca_index', + 'oca_causes', + 'oca_addresses', + 'oca_parties', + 'oca_events', + 'oca_appearances', + 'oca_appearance_outcomes', + 'oca_motions', + 'oca_decisions', + 'oca_judgments', + 'oca_warrants', + 'oca_metadata' +] + +DATA_ZIPFILE_PAT = r'LandlordTenant\.(Initial\.FiledIn\d{4}|Incr)\.\d{4}-\d{2}-\d{2}\.zip' + +DATA_FILENAME = 'LandlordTenantExtract.xml' + +S3_PRIVATE_FOLDER = 'private' + +S3_PUBLIC_FOLDER = 'public' diff --git a/lib/etl_file_selection.py b/lib/etl_file_selection.py new file mode 100644 index 0000000..4031df9 --- /dev/null +++ b/lib/etl_file_selection.py @@ -0,0 +1,55 @@ +import fnmatch + +from .etl_constants import DATA_ZIPFILE_PAT, S3_PRIVATE_FOLDER +from .etl_helpers import s3_key + + +def list_new_data_files(sftp, s3, s3_prefix=''): + """ + Get a list of filenames for all the data files available in the SFTP + that are not already in the private S3 folder. These are the new ones + that still need to be processed. They are returned in the proper order + in which they need to be processed. 
+ + :param sftp: SFTP object + :param s3: S3 object + """ + + sftp_zip_files = sftp.list_files(DATA_ZIPFILE_PAT) + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) + new_sftp_zip_files = list(set(sftp_zip_files) - set(s3_zip_files)) + + # It's important that everything is processed in order because files + # can contain modify/delete cases included in past files + init_files = [f for f in new_sftp_zip_files if 'Initial' in f] + incr_files = [f for f in new_sftp_zip_files if 'Incr' in f] + + files = [] + files += sorted(init_files) if init_files else [] + files += sorted(incr_files) if incr_files else [] + + return files + + +def list_reprocess_data_files(s3, reprocess_glob, s3_prefix=''): + if not reprocess_glob: + return [] + s3_zip_files = s3.list_files(DATA_ZIPFILE_PAT, s3_key(S3_PRIVATE_FOLDER, s3_prefix)) + return sorted([f for f in s3_zip_files if fnmatch.fnmatch(f, reprocess_glob)]) + + +def select_data_files_to_process(new_files, reprocess_files, force_reprocess=False): + def ordered(files): + init_files = sorted([f for f in files if 'Initial' in f]) + incr_files = sorted([f for f in files if 'Incr' in f]) + return init_files + incr_files + + if not reprocess_files: + return ordered(new_files) + + if not force_reprocess: + # Keep backward-compatible default behavior unless force mode is explicitly set. 
+ return ordered(new_files) + + merged = set(new_files) | set(reprocess_files) + return ordered(merged) diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py new file mode 100644 index 0000000..0f8145f --- /dev/null +++ b/lib/etl_helpers.py @@ -0,0 +1,151 @@ +import io +import os +import re +import shutil +import zipfile + +import pandas as pd +import requests + +from .etl_constants import S3_PRIVATE_FOLDER, S3_PUBLIC_FOLDER +from .s3 import S3 + + +def s3_key(path, s3_prefix=''): + normalized_path = path.lstrip('/') + if not s3_prefix: + return normalized_path + normalized_prefix = s3_prefix.strip('/') + return f"{normalized_prefix}/{normalized_path}" + + +def make_dir(dir_name): + """ + Create a new directory in the same folder as this file, + deleting everything in the folder if it already exists + + :param dir_name: The name of the directory to be created as a string + """ + dir_path = os.path.abspath(os.path.join(os.path.dirname(__file__), dir_name)) + shutil.rmtree(dir_path, ignore_errors=True) + os.mkdir(dir_path) + return dir_path + + +def csv_has_rows(csv_filepath, chunk_size=1000): + for _ in pd.read_csv(csv_filepath, chunksize=chunk_size): + return True + return False + + +def prep_db(s3, db, local_dir): + """ + Create a new directory in the same folder as this file, + deleting everything in the folder if it already exists + + :param s3: S3 object + :param db: Database object + :param local_dir: Path for local directory to save database dump file + """ + if s3.list_files('oca.dump', S3_PRIVATE_FOLDER): + print('Rebuilding tables from SQL dump') + s3.download_file(f"{S3_PRIVATE_FOLDER}/oca.dump", os.path.join(local_dir, 'oca.dump')) + db.execute_sql_file('create_tables.sql') + db.restore_from(os.path.join(local_dir, 'oca.dump')) + else: + print('Creating tables from scratch') + db.execute_sql_file('create_tables.sql') + + +def insert_staging_to_main(db, tables): + """ + Delete all cases from main tables if they exist in the staging table, + then insert all 
records from the staging tables to the main tables + + issue: SET session_replication_role = replica + https://stackoverflow.com/questions/3942258/how-do-i-temporarily-disable-triggers-in-postgresql/18709987#18709987 + to a work around to avoid DELETE FROM command stalling. + A VACUUM FULL on all the tables were tried, it does not seem to help + Might be an issue with the staging table schema? + + :param db: Database object + """ + + db.sql("SET session_replication_role = replica;") + for table in tables: + if table in ('oca_metadata'): # skip these tables + continue + print(f"\t...Deleting older entries from {table}") + db.sql(f"DELETE FROM {table} WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging)") + db.sql("SET session_replication_role = default;") + + for table in tables: + if table in ('oca_metadata'): # skip these tables + continue + print(f"\t...Inserting to {table}") + db.sql(f"INSERT INTO {table} SELECT * FROM {table}_staging") + db.sql(f"DROP TABLE {table}_staging") + + +def create_date_files(s3, data_file, local_dir): + """ + Create a text file and a custom shield image with date the data was + last updated and add them to the public S3 folder. + + :param s3: S3 object + :param data_file: file path for data being processed + """ + date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) + + txt_file = os.path.join(local_dir, 'last-updated-date.txt') + open(txt_file, 'w').write(date) + + url = f"https://raster.shields.io/badge/Last%20Updated-{date.replace('-', '--')}-yellow" + r = requests.get(url) + img_file = os.path.join(local_dir, 'last-updated-shield.png') + open(img_file, 'wb').write(r.content) + + +def download_pluto(output_dir): + """ + Download and unzip PLUTO into the directory. 
+ + :param output_dir: string or Path + """ + print('downloading pluto') + + # Check https://www.nyc.gov/content/planning/pages/resources/datasets/mappluto-pluto-change for updates + PLUTO_CSV_URL = 'https://s-media.nyc.gov/agencies/dcp/assets/files/zip/data-tools/bytes/pluto/nyc_pluto_25v1_1_csv.zip' + + #download and unzip + response = requests.get(PLUTO_CSV_URL) + content = response.content + z = zipfile.ZipFile(io.BytesIO(content)) + + pluto_csv = [name for name in z.namelist() if '.csv' in name][0] + z.extract(pluto_csv, output_dir) + + # rename + pluto_file = os.path.join(output_dir, "pluto.csv") + os.rename(os.path.join(output_dir, pluto_csv), pluto_file) + + return pluto_file + + +def upload_public_file(f, pub_dir, mode, s3_args, s3_prefix=''): + """ + Uploads a local file from the pub_dir folder to the S3_PUBLIC_FOLDER. + + :param f: filename + :paramp ub_dir: local path folder + :param mode: string + :param s3_args: dict/ kwargs with aws_id, aws_key aws_bucket_name + """ + s3 = S3(**s3_args) + print('-', f) + s3_filename = f + # to maintain consistent names for public level-1 csv files, we'll rename the level-2 version + if mode == "2" and f == "oca_addresses.csv": + s3_filename = "oca_addresses_private.csv" + s3.upload_file(s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix), os.path.join(pub_dir, f)) + del s3 diff --git a/lib/etl_run_manifest.py b/lib/etl_run_manifest.py new file mode 100644 index 0000000..6c01c07 --- /dev/null +++ b/lib/etl_run_manifest.py @@ -0,0 +1,161 @@ +import json +import traceback +import uuid +from contextlib import contextmanager + + +class EtlRunManifest: + @staticmethod + def _escape(value): + return str(value).replace("'", "''") + + def _literal(self, value): + return f"'{self._escape(value)}'" + + def _json_literal(self, value): + return f"'{self._escape(json.dumps(value))}'::jsonb" + + def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess): + self.db = db + self.schema_name = schema_name 
or 'public' + self.s3_prefix = s3_prefix or '' + self.mode = mode + self.reprocess_glob = reprocess_glob or '' + self.force_reprocess = force_reprocess + self.run_id = str(uuid.uuid4()) + self.lock_key = None + self.lock_acquired = False + + def setup_tables(self): + self.db.execute_sql_file('create_etl_manifest_tables.sql') + + def acquire_lock(self): + row = self.db.sql_fetch_one( + f"SELECT hashtext('oca_etl:' || {self._literal(self.schema_name)})::bigint" + ) + self.lock_key = row[0] + locked = self.db.sql_fetch_one(f"SELECT pg_try_advisory_lock({self.lock_key})") + self.lock_acquired = bool(locked and locked[0]) + if not self.lock_acquired: + raise RuntimeError(f"Another ETL run is already active for schema '{self.schema_name}'.") + + def release_lock(self): + if self.lock_acquired and self.lock_key is not None: + self.db.sql_fetch_one(f"SELECT pg_advisory_unlock({self.lock_key})") + self.lock_acquired = False + + def create_run(self): + payload = { + "mode": self.mode, + "schema_name": self.schema_name, + "s3_prefix": self.s3_prefix, + "reprocess_glob": self.reprocess_glob, + "force_reprocess": self.force_reprocess, + } + self.db.sql(f""" + INSERT INTO etl_runs ( + run_id, schema_name, s3_prefix, mode, reprocess_glob, force_reprocess, status, metadata, started_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(self.schema_name)}, {self._literal(self.s3_prefix)}, + {self._literal(self.mode)}, {self._literal(self.reprocess_glob)}, + {str(self.force_reprocess).upper()}, 'running', {self._json_literal(payload)}, NOW() + ) + """) + + def mark_run_completed(self, selected_count, processed_count, skipped_count): + self.db.sql(f""" + UPDATE etl_runs + SET status = 'completed', + completed_at = NOW(), + selected_file_count = {selected_count}, + processed_file_count = {processed_count}, + skipped_file_count = {skipped_count} + WHERE run_id = {self._literal(self.run_id)} + """) + + def mark_run_failed(self, exc): + message = str(exc) + details = 
{"traceback": traceback.format_exc()} + self.db.sql(f""" + UPDATE etl_runs + SET status = 'failed', + completed_at = NOW(), + error_message = {self._literal(message)}, + error_details = {self._json_literal(details)} + WHERE run_id = {self._literal(self.run_id)} + """) + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + stage_value = "NULL" if stage is None else self._literal(stage) + details_value = self._json_literal(details or {}) + error_message = "NULL" if error is None else self._literal(str(error)) + error_details = "NULL" if error is None else self._json_literal({'traceback': traceback.format_exc()}) + completed_at = "NOW()" if status in ("completed", "failed", "skipped") else "NULL" + started_at = "NOW()" if status in ("processing", "downloaded", "parsed", "promoted") else "NULL" + self.db.sql(f""" + INSERT INTO etl_files ( + run_id, file_name, source, status, stage, details, started_at, completed_at, error_message, error_details, updated_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(file_name)}, {self._literal(source)}, {self._literal(status)}, {stage_value}, + {details_value}, {started_at}, {completed_at}, {error_message}, {error_details}, NOW() + ) + ON CONFLICT (run_id, file_name) DO UPDATE + SET source = EXCLUDED.source, + status = EXCLUDED.status, + stage = EXCLUDED.stage, + details = EXCLUDED.details, + started_at = COALESCE(etl_files.started_at, EXCLUDED.started_at), + completed_at = EXCLUDED.completed_at, + error_message = EXCLUDED.error_message, + error_details = EXCLUDED.error_details, + updated_at = NOW() + """) + + def upsert_step(self, step_name, status, details=None, error=None): + details_value = self._json_literal(details or {}) + started_at = "NOW()" if status == "running" else "NULL" + completed_at = "NOW()" if status in ("completed", "failed") else "NULL" + error_message = "NULL" if error is None else self._literal(str(error)) + error_details = "NULL" if error is None else 
self._json_literal({'traceback': traceback.format_exc()}) + self.db.sql(f""" + INSERT INTO etl_steps ( + run_id, step_name, status, started_at, completed_at, error_message, error_details, details, updated_at + ) VALUES ( + {self._literal(self.run_id)}, {self._literal(step_name)}, {self._literal(status)}, {started_at}, {completed_at}, + {error_message}, {error_details}, {details_value}, NOW() + ) + ON CONFLICT (run_id, step_name) DO UPDATE + SET status = EXCLUDED.status, + started_at = COALESCE(etl_steps.started_at, EXCLUDED.started_at), + completed_at = EXCLUDED.completed_at, + error_message = EXCLUDED.error_message, + error_details = EXCLUDED.error_details, + details = EXCLUDED.details, + updated_at = NOW() + """) + + +@contextmanager +def manifest_step(manifest, step_name, details=None): + manifest.upsert_step(step_name, 'running', details=details) + try: + yield + manifest.upsert_step(step_name, 'completed', details=details) + except Exception as exc: + manifest.upsert_step(step_name, 'failed', details=details, error=exc) + raise + + +def completed_reprocess_files(db, reprocess_files): + if not reprocess_files: + return set() + quoted_files = ",".join(["'" + f.replace("'", "''") + "'" for f in reprocess_files]) + rows = db.sql_fetch_all(f""" + SELECT DISTINCT ef.file_name + FROM etl_files ef + JOIN etl_runs er ON er.run_id = ef.run_id + WHERE ef.status = 'completed' + AND er.status = 'completed' + AND ef.file_name IN ({quoted_files}) + """) + return {row[0] for row in rows} diff --git a/lib/etl_stages.py b/lib/etl_stages.py new file mode 100644 index 0000000..527cee2 --- /dev/null +++ b/lib/etl_stages.py @@ -0,0 +1,287 @@ +import functools +import multiprocessing +import os +import re +import zipfile +from itertools import repeat +from pathlib import Path + +import numpy as np +import pandas as pd +from lxml import etree + +from .etl_constants import DATA_FILENAME, OCA_TABLES, S3_PRIVATE_FOLDER, S3_PUBLIC_FOLDER +from .etl_file_selection import ( + 
list_new_data_files, + list_reprocess_data_files, + select_data_files_to_process, +) +from .etl_run_manifest import completed_reprocess_files +from .etl_helpers import ( + create_date_files, + csv_has_rows, + insert_staging_to_main, + s3_key, + upload_public_file, +) +from .geocode_record import geocode_record, geocode_using_census_batch +from .parsers import oca_tag, parse_file + + +class FileSelection: + """Selected input files and download routing for one ETL run.""" + + def __init__( + self, + selected_zip_files, + skipped_reprocess_files, + new_file_set, + reprocess_file_set, + sftp_download_files, + s3_download_files, + ): + self.selected_zip_files = selected_zip_files + self.skipped_reprocess_files = skipped_reprocess_files + self.new_file_set = new_file_set + self.reprocess_file_set = reprocess_file_set + self.sftp_download_files = sftp_download_files + self.s3_download_files = s3_download_files + + +def select_input_files(manifest, db, sftp, s3, s3_prefix, reprocess_glob, force_reprocess): + manifest.upsert_step('select_files', 'running') + new_sftp_zip_files = list_new_data_files(sftp, s3, s3_prefix=s3_prefix) + reprocess_s3_zip_files = list_reprocess_data_files(s3, reprocess_glob, s3_prefix=s3_prefix) + skipped_reprocess_files = [] + if reprocess_glob and not force_reprocess and reprocess_s3_zip_files: + already_completed = completed_reprocess_files(db, reprocess_s3_zip_files) + skipped_reprocess_files = sorted(already_completed) + reprocess_s3_zip_files = sorted(set(reprocess_s3_zip_files) - already_completed) + + selected_zip_files = select_data_files_to_process( + new_sftp_zip_files, + reprocess_s3_zip_files, + force_reprocess=force_reprocess + ) + manifest.upsert_step('select_files', 'completed', details={'selected_file_count': len(selected_zip_files)}) + + if reprocess_glob: + print(f"Reprocess selector active: REPROCESS_GLOB={reprocess_glob}, FORCE_REPROCESS={force_reprocess}") + print(f"Matched S3 private files: {len(reprocess_s3_zip_files)}") + 
if skipped_reprocess_files and not force_reprocess: + print(f"Skipping already-completed reprocess files from manifest: {len(skipped_reprocess_files)}") + + if not selected_zip_files: + return FileSelection([], skipped_reprocess_files, set(), set(), [], []) + + reprocess_file_set = set(reprocess_s3_zip_files) + new_file_set = set(new_sftp_zip_files) + selected_set = set(selected_zip_files) + sftp_download_files = sorted(selected_set & new_file_set) + s3_download_files = sorted(selected_set & reprocess_file_set) + + for f in sftp_download_files: + manifest.upsert_file(f, source='sftp', status='selected', stage='select') + for f in s3_download_files: + manifest.upsert_file(f, source='s3_private', status='selected', stage='select') + for f in skipped_reprocess_files: + manifest.upsert_file( + f, source='s3_private', status='skipped', stage='select', + details={'reason': 'already_completed_manifest'} + ) + + return FileSelection( + selected_zip_files, + skipped_reprocess_files, + new_file_set, + reprocess_file_set, + sftp_download_files, + s3_download_files, + ) + + +def download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection): + manifest.upsert_step('download_files', 'running') + print('Downloading selected files:') + for f in selection.sftp_download_files: + print('-', f) + sftp.download_files(f, priv_dir) + manifest.upsert_file(f, source='sftp', status='downloaded', stage='download') + for f in selection.s3_download_files: + print('-', f) + s3.download_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) + manifest.upsert_file(f, source='s3_private', status='downloaded', stage='download') + manifest.upsert_step('download_files', 'completed') + + +def parse_xml_to_staging(manifest, staging_db, priv_dir): + def sort_by_date(file): + r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.', ' ') + return r + + local_zip_files = sorted( + [os.path.join(priv_dir, f) for f in os.listdir(priv_dir) if f.endswith('.zip')], + 
key=sort_by_date + ) + + manifest.upsert_step('parse_xml', 'running') + staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + print('Processing files:') + for zip_file in local_zip_files: + file_name = os.path.basename(zip_file) + manifest.upsert_file(file_name, source='local', status='processing', stage='parse') + extract_date = None + with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: + for _, elem in etree.iterparse(xml_file, tag=oca_tag('RunDate')): + if not extract_date: + extract_date = elem.text + break + with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: + parse_file(xml_file, staging_db, extract_date) + manifest.upsert_file( + file_name, source='local', status='parsed', stage='parse', + details={'extract_date': extract_date} + ) + manifest.upsert_step('parse_xml', 'completed') + + +def preprocess_and_upload_staging_csvs(staging_db, pub_dir, mode, s3_args, s3_prefix): + staging_db.export_tables_to_csv(output_dir=pub_dir) + + def preprocess_csvs(target_dir): + for filename in os.listdir(target_dir): + if filename.endswith('.csv'): + file_path = os.path.join(target_dir, filename) + df = pd.read_csv(file_path) + for col in df.columns: + if df[col].dtype == 'object': + def replace_brackets(text): + if pd.isna(text) or not isinstance(text, str): + return text + if text.startswith('[') and text.endswith(']'): + inner = text[1:-1].strip() + if inner.startswith('{') and inner.endswith('}'): + return text + return '{' + text[1:-1] + '}' + return text + df[col] = df[col].apply(replace_brackets) + if filename.startswith('oca_appearances'): + if 'appearanceid' in df.columns: + del df['appearanceid'] + df['motionsequence'] = df['motionsequence'].astype('Int64') + if filename.startswith('oca_judgments'): + df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') + if filename.startswith('oca_warrants'): + df['executionstayeddays'] = df['executionstayeddays'].astype('Int64') + 
df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') + df.to_csv(file_path, index=False) + + preprocess_csvs(pub_dir) + public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + + +def import_and_promote_staging( + manifest, db, pub_dir, s3_args, s3_prefix, selection, csv_row_check_chunk_size +): + staging_tables = [t + '_staging' for t in OCA_TABLES] + manifest.upsert_step('promote_staging', 'running') + db.execute_sql_file('create_tables_staging.sql') + for t in staging_tables: + csv_filepath = os.path.join(pub_dir, f"{t}.csv") + if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): + columns = '' + if t == 'oca_appearances_staging': + columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' + db.sql(f""" + SELECT aws_s3.table_import_from_s3( + '{t}', '{columns}', '(FORMAT CSV, HEADER)', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{t}.csv", s3_prefix)}', 'us-east-1'), + aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') + ); + """) + + db.execute_sql_file('update_appearance_outcomes.sql') + insert_staging_to_main(db, OCA_TABLES) + db.execute_sql_file('update_metadata.sql') + for selected_name in selection.selected_zip_files: + source = 'sftp' if selected_name in selection.new_file_set else 's3_private' + manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') + manifest.upsert_step('promote_staging', 'completed') + + +def publish_core_tables(manifest, db, s3_args, s3_prefix): + manifest.upsert_step('publish_tables', 'running') + for t in OCA_TABLES: + s3_filename = t + '.csv' + if t == "oca_addresses": + 
s3_filename = "oca_addresses_private.csv" + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from {t}', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); + """) + manifest.upsert_step('publish_tables', 'completed') + + +def geocode_and_publish_addresses( + manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, + geocode_workers, census_batch_chunk_size +): + manifest.upsert_step('geocode_refresh', 'running') + csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") + db.export_csv('oca_addresses', csv_filepath) + input_csv = Path(pub_dir) / 'oca_addresses_private.csv' + output_csv = Path(pub_dir) / 'oca_addresses_private.csv' + df = pd.read_csv(input_csv, dtype=str, index_col=False, usecols=lambda x: x, keep_default_na=False) + df_1 = df[((pd.isna(df['lat'])) | (df['lat'] == '')) & ((df['house_number'] != '') | (pd.notna(df['house_number'])))].copy().reset_index() + records = df_1.to_dict('records') + with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: + it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=['street1', 'city', 'postalcode']), records, 10000)) + df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() + with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: + chunk_size = census_batch_chunk_size + data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) + it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) + export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] + concat = pd.concat([df, it, it_2], 
ignore_index=True).drop_duplicates(subset=['indexnumberid'], ignore_index=True, keep='last')[export_cols] + pd.DataFrame(concat).to_csv(output_csv, index=False) + create_date_files(s3, selection.selected_zip_files[-1], pub_dir) + public_files = [i for i in os.listdir(pub_dir) if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + for f in os.listdir(priv_dir): + if f != '.DS_Store': + s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) + db.execute_sql_file('reset_addresses_table.sql') + db.sql(f""" + SELECT aws_s3.table_import_from_s3( + 'oca_addresses', '', '(FORMAT CSV, HEADER)', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), + aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') + ); + """) + db.execute_sql_file('create_addresses_views.sql') + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from oca_addresses_with_bbl', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); + """) + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from oca_addresses_with_ct', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", s3_prefix)}', 'us-east-1'), + options :='FORMAT CSV, HEADER'); + """) + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from oca_addresses_public', + aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 
'us-east-1'), + options :='FORMAT CSV, HEADER'); + """) + manifest.upsert_step('geocode_refresh', 'completed') From dcc3c3202953f4b0532f73a41513061aeb78fbf4 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 14:59:37 -0400 Subject: [PATCH 06/30] Parse/load memory reduction and CSV preprocessing reduction --- lib/etl.py | 7 +- lib/etl_csv.py | 105 +++++++++++++++++ lib/etl_helpers.py | 15 ++- lib/etl_stages.py | 43 ++----- lib/sql/normalize_staging_after_import.sql | 18 +++ tests/test_etl_csv.py | 125 +++++++++++++++++++++ 6 files changed, 271 insertions(+), 42 deletions(-) create mode 100644 lib/etl_csv.py create mode 100644 lib/sql/normalize_staging_after_import.sql create mode 100644 tests/test_etl_csv.py diff --git a/lib/etl.py b/lib/etl.py index ebe8f85..7b6dde4 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -109,10 +109,11 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection) parse_xml_to_staging(manifest, staging_db, priv_dir) - preprocess_and_upload_staging_csvs(staging_db, pub_dir, mode, s3_args, s3_prefix) - import_and_promote_staging( - manifest, db, pub_dir, s3_args, s3_prefix, selection, csv_row_check_chunk_size + preprocess_and_upload_staging_csvs( + staging_db, pub_dir, mode, s3_args, s3_prefix, + csv_preprocess_chunk_size=csv_row_check_chunk_size, ) + import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection) publish_core_tables(manifest, db, s3_args, s3_prefix) geocode_and_publish_addresses( manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, diff --git a/lib/etl_csv.py b/lib/etl_csv.py new file mode 100644 index 0000000..1a4a3dc --- /dev/null +++ b/lib/etl_csv.py @@ -0,0 +1,105 @@ +"""Constant-memory CSV preprocessing for DuckDB staging exports.""" + +import csv +import os + +_APPEARANCES_PREFIX = 'oca_appearances_staging' +_JUDGMENTS_PREFIX = 'oca_judgments_staging' 
+_WARRANTS_PREFIX = 'oca_warrants_staging' + +_EMPTY_INT_MARKERS = frozenset({'', 'nan', 'NaN', 'None', ''}) + + +def replace_postgres_array_brackets(text): + """ + Convert DuckDB-style array literals ``[a,b]`` to PostgreSQL ``{a,b}``. + + JSON object arrays (inner ``{...}``) are left unchanged. + """ + if not text or not isinstance(text, str): + return text + stripped = text.strip() + if not (stripped.startswith('[') and stripped.endswith(']')): + return text + inner = stripped[1:-1].strip() + if inner.startswith('{') and inner.endswith('}'): + return text + return '{' + stripped[1:-1] + '}' + + +def _normalize_int_cell(value): + if value is None: + return '' + if isinstance(value, str) and value.strip() in _EMPTY_INT_MARKERS: + return '' + return value + + +def _preprocess_row(filename, fieldnames, int_columns, row): + out = {} + for col in fieldnames: + value = row.get(col, '') + if col in int_columns: + out[col] = _normalize_int_cell(value) + elif isinstance(value, str): + out[col] = replace_postgres_array_brackets(value) + else: + out[col] = value + return out + + +def _file_preprocess_rules(filename): + drop_columns = set() + int_columns = set() + if filename.startswith(_APPEARANCES_PREFIX): + drop_columns.add('appearanceid') + int_columns.add('motionsequence') + elif filename.startswith(_JUDGMENTS_PREFIX): + int_columns.add('amendedfromjudgmentsequence') + elif filename.startswith(_WARRANTS_PREFIX): + int_columns.update(('executionstayeddays', 'issuancestayeddays')) + return drop_columns, int_columns + + +def preprocess_csv_file(file_path, chunk_size=1000): + """ + Rewrite one staging CSV in place using bounded memory. + + ``chunk_size`` controls how many rows are buffered before writing; it does + not load the full file into a DataFrame. 
+ """ + filename = os.path.basename(file_path) + if not filename.endswith('.csv'): + return + + drop_columns, int_columns = _file_preprocess_rules(filename) + tmp_path = f'{file_path}.tmp' + + with open(file_path, newline='', encoding='utf-8') as infile, open( + tmp_path, 'w', newline='', encoding='utf-8' + ) as outfile: + reader = csv.DictReader(infile) + if not reader.fieldnames: + os.remove(tmp_path) + return + + fieldnames = [name for name in reader.fieldnames if name not in drop_columns] + writer = csv.DictWriter(outfile, fieldnames=fieldnames, lineterminator='\n') + writer.writeheader() + + batch = [] + for row in reader: + batch.append(_preprocess_row(filename, fieldnames, int_columns, row)) + if len(batch) >= chunk_size: + writer.writerows(batch) + batch.clear() + if batch: + writer.writerows(batch) + + os.replace(tmp_path, file_path) + + +def preprocess_staging_csv_dir(target_dir, chunk_size=1000): + for filename in sorted(os.listdir(target_dir)): + if filename.endswith('.csv'): + preprocess_csv_file(os.path.join(target_dir, filename), chunk_size=chunk_size) diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py index 0f8145f..227e7d9 100644 --- a/lib/etl_helpers.py +++ b/lib/etl_helpers.py @@ -1,10 +1,10 @@ +import csv import io import os import re import shutil import zipfile -import pandas as pd import requests from .etl_constants import S3_PRIVATE_FOLDER, S3_PUBLIC_FOLDER @@ -32,9 +32,16 @@ def make_dir(dir_name): return dir_path -def csv_has_rows(csv_filepath, chunk_size=1000): - for _ in pd.read_csv(csv_filepath, chunksize=chunk_size): - return True +def csv_has_rows(csv_filepath): + """Return True when the CSV has at least one data row (header excluded).""" + with open(csv_filepath, newline='', encoding='utf-8') as csv_file: + reader = csv.reader(csv_file) + try: + next(reader) + except StopIteration: + return False + for _ in reader: + return True return False diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 527cee2..5ff43c1 100644 --- 
a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -17,6 +17,7 @@ select_data_files_to_process, ) from .etl_run_manifest import completed_reprocess_files +from .etl_csv import preprocess_staging_csv_dir from .etl_helpers import ( create_date_files, csv_has_rows, @@ -145,53 +146,24 @@ def sort_by_date(file): manifest.upsert_step('parse_xml', 'completed') -def preprocess_and_upload_staging_csvs(staging_db, pub_dir, mode, s3_args, s3_prefix): +def preprocess_and_upload_staging_csvs( + staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=1000 +): staging_db.export_tables_to_csv(output_dir=pub_dir) - - def preprocess_csvs(target_dir): - for filename in os.listdir(target_dir): - if filename.endswith('.csv'): - file_path = os.path.join(target_dir, filename) - df = pd.read_csv(file_path) - for col in df.columns: - if df[col].dtype == 'object': - def replace_brackets(text): - if pd.isna(text) or not isinstance(text, str): - return text - if text.startswith('[') and text.endswith(']'): - inner = text[1:-1].strip() - if inner.startswith('{') and inner.endswith('}'): - return text - return '{' + text[1:-1] + '}' - return text - df[col] = df[col].apply(replace_brackets) - if filename.startswith('oca_appearances'): - if 'appearanceid' in df.columns: - del df['appearanceid'] - df['motionsequence'] = df['motionsequence'].astype('Int64') - if filename.startswith('oca_judgments'): - df['amendedfromjudgmentsequence'] = df['amendedfromjudgmentsequence'].astype('Int64') - if filename.startswith('oca_warrants'): - df['executionstayeddays'] = df['executionstayeddays'].astype('Int64') - df['issuancestayeddays'] = df['issuancestayeddays'].astype('Int64') - df.to_csv(file_path, index=False) - - preprocess_csvs(pub_dir) + preprocess_staging_csv_dir(pub_dir, chunk_size=csv_preprocess_chunk_size) public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: files_zip = zip(public_files, 
repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) -def import_and_promote_staging( - manifest, db, pub_dir, s3_args, s3_prefix, selection, csv_row_check_chunk_size -): +def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection): staging_tables = [t + '_staging' for t in OCA_TABLES] manifest.upsert_step('promote_staging', 'running') db.execute_sql_file('create_tables_staging.sql') for t in staging_tables: csv_filepath = os.path.join(pub_dir, f"{t}.csv") - if csv_has_rows(csv_filepath, chunk_size=csv_row_check_chunk_size): + if csv_has_rows(csv_filepath): columns = '' if t == 'oca_appearances_staging': columns = 'indexnumberid, appearancedatetime, appearancepurpose, appearancereason, appearancepart, motionsequence, appearanceoutcomes' @@ -203,6 +175,7 @@ def import_and_promote_staging( ); """) + db.execute_sql_file('normalize_staging_after_import.sql') db.execute_sql_file('update_appearance_outcomes.sql') insert_staging_to_main(db, OCA_TABLES) db.execute_sql_file('update_metadata.sql') diff --git a/lib/sql/normalize_staging_after_import.sql b/lib/sql/normalize_staging_after_import.sql new file mode 100644 index 0000000..4ad35bb --- /dev/null +++ b/lib/sql/normalize_staging_after_import.sql @@ -0,0 +1,18 @@ +-- Post-import staging normalization (deterministic casts / null coercion). +-- Array bracket formatting is handled in lib/etl_csv.py before S3 import. 
+ +UPDATE oca_appearances_staging +SET motionsequence = NULL +WHERE motionsequence IS NOT NULL AND motionsequence::text = ''; + +UPDATE oca_judgments_staging +SET amendedfromjudgmentsequence = NULL +WHERE amendedfromjudgmentsequence IS NOT NULL AND amendedfromjudgmentsequence::text = ''; + +UPDATE oca_warrants_staging +SET executionstayeddays = NULL +WHERE executionstayeddays IS NOT NULL AND executionstayeddays::text = ''; + +UPDATE oca_warrants_staging +SET issuancestayeddays = NULL +WHERE issuancestayeddays IS NOT NULL AND issuancestayeddays::text = ''; diff --git a/tests/test_etl_csv.py b/tests/test_etl_csv.py new file mode 100644 index 0000000..bf41aeb --- /dev/null +++ b/tests/test_etl_csv.py @@ -0,0 +1,125 @@ +import csv +import os +import tempfile +import tracemalloc +import unittest + +from lib.etl_csv import preprocess_csv_file, replace_postgres_array_brackets +from lib.etl_helpers import csv_has_rows + + +class ReplacePostgresArrayBracketsTests(unittest.TestCase): + def test_converts_simple_array(self): + self.assertEqual(replace_postgres_array_brackets('[a,b]'), '{a,b}') + + def test_preserves_json_object_array(self): + value = '[{"appearanceoutcometype":"x"}]' + self.assertEqual(replace_postgres_array_brackets(value), value) + + def test_non_array_unchanged(self): + self.assertEqual(replace_postgres_array_brackets('plain'), 'plain') + + +class PreprocessCsvFileTests(unittest.TestCase): + def test_appearances_drops_appearanceid_and_normalizes_motionsequence(self): + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, 'oca_appearances_staging.csv') + with open(path, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter( + f, + fieldnames=[ + 'indexnumberid', + 'appearanceid', + 'motionsequence', + 'appearanceoutcomes', + ], + ) + writer.writeheader() + writer.writerow({ + 'indexnumberid': '1', + 'appearanceid': '99', + 'motionsequence': '', + 'appearanceoutcomes': '[{"appearanceoutcometype":"Hearing"}]', + }) + + 
preprocess_csv_file(path, chunk_size=2) + + with open(path, newline='', encoding='utf-8') as f: + rows = list(csv.DictReader(f)) + + self.assertEqual( + list(rows[0].keys()), + ['indexnumberid', 'motionsequence', 'appearanceoutcomes'], + ) + self.assertEqual(rows[0]['motionsequence'], '') + self.assertEqual( + rows[0]['appearanceoutcomes'], + '[{"appearanceoutcometype":"Hearing"}]', + ) + + def test_index_array_brackets_converted(self): + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, 'oca_index_staging.csv') + with open(path, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter( + f, + fieldnames=['indexnumberid', 'specialtydesignationtypes'], + ) + writer.writeheader() + writer.writerow({ + 'indexnumberid': '1', + 'specialtydesignationtypes': '[HP, RTC]', + }) + + preprocess_csv_file(path) + + with open(path, newline='', encoding='utf-8') as f: + row = next(csv.DictReader(f)) + + self.assertEqual(row['specialtydesignationtypes'], '{HP, RTC}') + + +class CsvHasRowsTests(unittest.TestCase): + def test_empty_file_has_no_rows(self): + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('a,b\n') + path = f.name + try: + self.assertFalse(csv_has_rows(path)) + finally: + os.unlink(path) + + def test_data_row_detected(self): + with tempfile.NamedTemporaryFile(mode='w', suffix='.csv', delete=False) as f: + f.write('a,b\n1,2\n') + path = f.name + try: + self.assertTrue(csv_has_rows(path)) + finally: + os.unlink(path) + + +class PreprocessMemoryTests(unittest.TestCase): + def test_preprocess_does_not_scale_with_file_size(self): + with tempfile.TemporaryDirectory() as tmp: + path = os.path.join(tmp, 'oca_index_staging.csv') + with open(path, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=['indexnumberid', 'specialtydesignationtypes']) + writer.writeheader() + for i in range(20000): + writer.writerow({ + 'indexnumberid': str(i), + 'specialtydesignationtypes': '[A]', + }) 
+ + tracemalloc.start() + preprocess_csv_file(path, chunk_size=500) + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Full-file materialization would typically exceed a few MB for 20k rows. + self.assertLess(peak, 5 * 1024 * 1024) + + +if __name__ == '__main__': + unittest.main() From 596e208fa41fd0c6f3ea1221048bef300f3a52b3 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 15:36:43 -0400 Subject: [PATCH 07/30] PR #19 appearanceid + S3 publish fixes --- lib/etl_stages.py | 10 +++++++ lib/s3.py | 21 +++++++++++++- lib/sql/update_appearance_outcomes.sql | 30 ++++++++++++++++++++ tests/test_update_appearance_outcomes_sql.py | 26 +++++++++++++++++ 4 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 tests/test_update_appearance_outcomes_sql.py diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 5ff43c1..05703e0 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -232,6 +232,7 @@ def geocode_and_publish_addresses( s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) db.execute_sql_file('reset_addresses_table.sql') db.sql(f""" + SET statement_timeout = '2000000'; SELECT aws_s3.table_import_from_s3( 'oca_addresses', '', '(FORMAT CSV, HEADER)', aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), @@ -257,4 +258,13 @@ def geocode_and_publish_addresses( aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), options :='FORMAT CSV, HEADER'); """) + print('Updating server-side encryption for S3 files') + public_folder = s3_key(S3_PUBLIC_FOLDER, s3_prefix) + public_files_to_encrypt = [ + f for f in s3.list_files('', public_folder) + if not f.endswith('_staging.csv') and 'oca_addresses_private' not in f + ] + for f in public_files_to_encrypt: + print('-', f) + 
s3.update_encryption(s3_key(f"{S3_PUBLIC_FOLDER}/{f}", s3_prefix)) manifest.upsert_step('geocode_refresh', 'completed') diff --git a/lib/s3.py b/lib/s3.py index 7bb7f3d..6d578e0 100644 --- a/lib/s3.py +++ b/lib/s3.py @@ -12,7 +12,7 @@ def s3_client(aws_id, aws_key): 's3', aws_access_key_id=aws_id, aws_secret_access_key=aws_key, - config=Config(connect_timeout=10, read_timeout=100, retries={'max_attempts': 10}) + config=Config(connect_timeout=10, read_timeout=100, retries={'max_attempts': 10}, signature_version='s3v4') ) return s3 @@ -155,3 +155,22 @@ def list_files(self, pattern, folder=''): files = [os.path.basename(x) for x in all_files if x != folder] return files + + def update_encryption(self, object_key): + """ + Update an S3 object's server-side encryption to SSE-S3 (AES256). + + :param object_key: Object key in the bucket + """ + try: + self.s3.copy_object( + Bucket=self.bucket_name, + CopySource={'Bucket': self.bucket_name, 'Key': object_key}, + Key=object_key, + ServerSideEncryption='AES256', + MetadataDirective='COPY' + ) + return True + except ClientError as e: + logging.error(e) + return False diff --git a/lib/sql/update_appearance_outcomes.sql b/lib/sql/update_appearance_outcomes.sql index fcce4f0..73d2e66 100644 --- a/lib/sql/update_appearance_outcomes.sql +++ b/lib/sql/update_appearance_outcomes.sql @@ -1,3 +1,33 @@ +-- update appearanceid so that the serial resumes from the latest number in the main table +DO $$ +DECLARE + max_id bigint; + staging_count bigint; +BEGIN + -- Get max ID from main table + SELECT COALESCE(MAX(appearanceid), 0) INTO max_id FROM oca_appearances; + + -- Get count of staging records + SELECT COUNT(*) INTO staging_count FROM oca_appearances_staging; + + -- Update NULL appearanceid values with sequential numbers starting from max_id + 1 + WITH numbered_rows AS ( + SELECT ctid, ROW_NUMBER() OVER (ORDER BY ctid) as rn + FROM oca_appearances_staging + WHERE appearanceid IS NULL + ) + UPDATE oca_appearances_staging + SET 
appearanceid = max_id + nr.rn + FROM numbered_rows nr + WHERE oca_appearances_staging.ctid = nr.ctid; + + -- Set sequence for future inserts + PERFORM setval( + pg_get_serial_sequence('oca_appearances_staging', 'appearanceid'), + max_id + staging_count + ); +END $$; + -- In the "appearances" nodes they have further nested info about the outcomes -- of those appearances. There are no unique identifers to be able to link -- these elements in the original data, so we parse the outcomes as a json diff --git a/tests/test_update_appearance_outcomes_sql.py b/tests/test_update_appearance_outcomes_sql.py new file mode 100644 index 0000000..0908b98 --- /dev/null +++ b/tests/test_update_appearance_outcomes_sql.py @@ -0,0 +1,26 @@ +import os +import unittest + + +class UpdateAppearanceOutcomesSqlTests(unittest.TestCase): + def test_assigns_appearanceid_before_outcomes_insert(self): + sql_path = os.path.join( + os.path.dirname(__file__), + '..', + 'lib', + 'sql', + 'update_appearance_outcomes.sql', + ) + with open(sql_path, encoding='utf-8') as f: + sql = f.read() + + self.assertIn('DO $$', sql) + self.assertIn('setval', sql) + self.assertIn('MAX(appearanceid)', sql) + do_pos = sql.index('DO $$') + insert_pos = sql.index('INSERT INTO oca_appearance_outcomes_staging') + self.assertLess(do_pos, insert_pos) + + +if __name__ == '__main__': + unittest.main() From 00840e6b3b5f27434e581536960ad2fe59a5e47e Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 16:00:43 -0400 Subject: [PATCH 08/30] Incremental geocoding with delta extraction and DB upsert --- lib/database.py | 5 + lib/etl_geocode.py | 144 ++++++++++++ lib/etl_stages.py | 45 ++-- lib/sql/create_geocode_staging_table.sql | 30 +++ lib/sql/select_addresses_needing_geocode.sql | 31 +++ lib/sql/upsert_geocoded_addresses.sql | 29 +++ tests/test_incremental_geocode.py | 233 +++++++++++++++++++ 7 files changed, 487 insertions(+), 30 deletions(-) create mode 100644 lib/etl_geocode.py create mode 100644 
lib/sql/create_geocode_staging_table.sql create mode 100644 lib/sql/select_addresses_needing_geocode.sql create mode 100644 lib/sql/upsert_geocoded_addresses.sql create mode 100644 tests/test_incremental_geocode.py diff --git a/lib/database.py b/lib/database.py index 720433d..a042b28 100644 --- a/lib/database.py +++ b/lib/database.py @@ -70,6 +70,11 @@ def sql_fetch_all(self, SQL): curs.execute(SQL) return curs.fetchall() + def sql_fetch_all_from_file(self, sql_file): + file_path = os.path.join(os.path.dirname(__file__), 'sql', sql_file) + with open(file_path, 'r', encoding='utf-8') as f: + return self.sql_fetch_all(f.read()) + def insert_rows(self, rows, table_name): """ Inserts many rows, all in the same transaction. diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py new file mode 100644 index 0000000..2e34fb8 --- /dev/null +++ b/lib/etl_geocode.py @@ -0,0 +1,144 @@ +import functools +import multiprocessing +import os +from itertools import repeat + +import numpy as np +import pandas as pd + +from .geocode_record import geocode_record, geocode_using_census_batch + +GEOCODE_ADDRESS_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', + 'status', 'house_number', 'street_name', 'borough_code', 'place_name', + 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', + 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code', +] + +GEOCODE_EXPORT_COLUMNS = GEOCODE_ADDRESS_COLUMNS + + +def _stringify_row_values(row): + normalized = {} + for key, value in row.items(): + if value is None: + normalized[key] = '' + elif isinstance(value, float) and np.isnan(value): + normalized[key] = '' + else: + normalized[key] = str(value) + return normalized + + +def _has_lat(value): + if value is None: + return False + text = str(value).strip() + return text != '' and text.lower() != 'nan' + + +def row_needs_geocode(row): + """Mirror select_addresses_needing_geocode.sql for unit tests.""" + return not _has_lat(row.get('lat')) and 
str(row.get('house_number') or '').strip() != '' + + +def _rows_from_fetchall(rows): + return [ + _stringify_row_values(dict(zip(GEOCODE_ADDRESS_COLUMNS, row))) + for row in rows + ] + + +def fetch_addresses_needing_geocode(db): + rows = db.sql_fetch_all_from_file('select_addresses_needing_geocode.sql') + return _rows_from_fetchall(rows) + + +def _prepare_rows_for_db(rows): + prepared = [] + for row in rows: + db_row = {} + for col in GEOCODE_EXPORT_COLUMNS: + value = row.get(col, '') + if col in ('lat', 'lon') and not _has_lat(value): + db_row[col] = None + else: + db_row[col] = value if value != '' else None + prepared.append(db_row) + return prepared + + +def _run_geosupport(records, geocode_workers, geocode_record_fn): + geocode_one = functools.partial( + geocode_record_fn, + addr_cols=['street1', 'city', 'postalcode'], + ) + use_pool = geocode_record_fn is geocode_record + if not use_pool: + return [geocode_one(record) for record in records] + + worker_count = min(geocode_workers, multiprocessing.cpu_count()) + with multiprocessing.Pool(processes=worker_count) as pool: + return pool.map(geocode_one, records, 10000) + + +def _run_census_batch(still_missing, census_batch_chunk_size, pub_dir, geocode_using_census_batch_fn): + if not still_missing: + return [] + + use_pool = geocode_using_census_batch_fn is geocode_using_census_batch + chunk_size = census_batch_chunk_size + df_missing = pd.DataFrame(still_missing) + splits = list(np.split(df_missing, range(chunk_size, df_missing.shape[0], chunk_size))) + + if not use_pool: + return [geocode_using_census_batch_fn(chunk, pub_dir) for chunk in splits] + + census_pool_workers = min(5, multiprocessing.cpu_count()) + data_split = zip(splits, repeat(pub_dir)) + with multiprocessing.Pool(processes=census_pool_workers) as pool: + return pool.starmap(geocode_using_census_batch_fn, data_split) + + +def geocode_candidate_records( + records, + geocode_workers, + census_batch_chunk_size, + pub_dir, + 
geocode_record_fn=geocode_record, + geocode_using_census_batch_fn=geocode_using_census_batch, +): + if not records: + return [] + + geosupport_results = _run_geosupport(records, geocode_workers, geocode_record_fn) + + still_missing = [row for row in geosupport_results if not _has_lat(row.get('lat'))] + if not still_missing: + return geosupport_results + + census_chunks = _run_census_batch( + still_missing, + census_batch_chunk_size, + pub_dir, + geocode_using_census_batch_fn, + ) + if census_chunks: + census_results = pd.concat(census_chunks, ignore_index=True).to_dict('records') + else: + census_results = [] + + by_id = {row['indexnumberid']: row for row in geosupport_results} + for row in census_results: + by_id[row['indexnumberid']] = row + return [by_id[row['indexnumberid']] for row in records] + + +def upsert_geocoded_addresses(db, rows): + if not rows: + return 0 + + db.execute_sql_file('create_geocode_staging_table.sql') + db.insert_rows(_prepare_rows_for_db(rows), 'oca_addresses_geocode_staging') + db.execute_sql_file('upsert_geocoded_addresses.sql') + return len(rows) diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 05703e0..d47e44d 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -1,13 +1,8 @@ -import functools import multiprocessing import os import re import zipfile from itertools import repeat -from pathlib import Path - -import numpy as np -import pandas as pd from lxml import etree from .etl_constants import DATA_FILENAME, OCA_TABLES, S3_PRIVATE_FOLDER, S3_PUBLIC_FOLDER @@ -25,7 +20,11 @@ s3_key, upload_public_file, ) -from .geocode_record import geocode_record, geocode_using_census_batch +from .etl_geocode import ( + fetch_addresses_needing_geocode, + geocode_candidate_records, + upsert_geocoded_addresses, +) from .parsers import oca_tag, parse_file @@ -205,23 +204,18 @@ def geocode_and_publish_addresses( geocode_workers, census_batch_chunk_size ): manifest.upsert_step('geocode_refresh', 'running') + candidates = 
fetch_addresses_needing_geocode(db) + print(f'Geocoding {len(candidates)} addresses missing lat/lon') + geocoded_rows = geocode_candidate_records( + candidates, + geocode_workers, + census_batch_chunk_size, + pub_dir, + ) + upsert_geocoded_addresses(db, geocoded_rows) + csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") db.export_csv('oca_addresses', csv_filepath) - input_csv = Path(pub_dir) / 'oca_addresses_private.csv' - output_csv = Path(pub_dir) / 'oca_addresses_private.csv' - df = pd.read_csv(input_csv, dtype=str, index_col=False, usecols=lambda x: x, keep_default_na=False) - df_1 = df[((pd.isna(df['lat'])) | (df['lat'] == '')) & ((df['house_number'] != '') | (pd.notna(df['house_number'])))].copy().reset_index() - records = df_1.to_dict('records') - with multiprocessing.Pool(processes=min((geocode_workers, multiprocessing.cpu_count()))) as pool: - it = pd.DataFrame(pool.map(functools.partial(geocode_record, addr_cols=['street1', 'city', 'postalcode']), records, 10000)) - df_2 = it[(((pd.isna(it['lat'])) | (it['lat'] == '')))].copy().reset_index() - with multiprocessing.Pool(processes=min([5, multiprocessing.cpu_count()])) as pool: - chunk_size = census_batch_chunk_size - data_split = zip(np.split(df_2, range(chunk_size, df_2.shape[0], chunk_size)), repeat(pub_dir)) - it_2 = pd.concat(pool.starmap(geocode_using_census_batch, data_split)) - export_cols = ['indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', 'place_name', 'sname', 'hnum', 'boro', 'lat', 'bin', 'bbl', 'cd', 'ct', 'council', 'grc', 'grc2', 'msg', 'msg2', 'lon', 'zip_code'] - concat = pd.concat([df, it, it_2], ignore_index=True).drop_duplicates(subset=['indexnumberid'], ignore_index=True, keep='last')[export_cols] - pd.DataFrame(concat).to_csv(output_csv, index=False) create_date_files(s3, selection.selected_zip_files[-1], pub_dir) public_files = [i for i in os.listdir(pub_dir) if i in 
('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: @@ -230,15 +224,6 @@ def geocode_and_publish_addresses( for f in os.listdir(priv_dir): if f != '.DS_Store': s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - db.execute_sql_file('reset_addresses_table.sql') - db.sql(f""" - SET statement_timeout = '2000000'; - SELECT aws_s3.table_import_from_s3( - 'oca_addresses', '', '(FORMAT CSV, HEADER)', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix)}', 'us-east-1'), - aws_commons.create_aws_credentials('{s3_args["aws_id"]}', '{s3_args["aws_key"]}', '') - ); - """) db.execute_sql_file('create_addresses_views.sql') db.sql(f""" SELECT * from aws_s3.query_export_to_s3( diff --git a/lib/sql/create_geocode_staging_table.sql b/lib/sql/create_geocode_staging_table.sql new file mode 100644 index 0000000..aa9ae92 --- /dev/null +++ b/lib/sql/create_geocode_staging_table.sql @@ -0,0 +1,30 @@ +CREATE TABLE IF NOT EXISTS oca_addresses_geocode_staging ( + indexnumberid text, + street1 text, + street2 text, + city text, + state text, + postalcode text, + status text, + house_number text, + street_name text, + borough_code text, + place_name text, + sname text, + hnum text, + boro text, + lat real, + bin text, + bbl text, + cd text, + ct text, + council text, + grc text, + grc2 text, + msg text, + msg2 text, + lon real, + zip_code text +); + +TRUNCATE oca_addresses_geocode_staging; diff --git a/lib/sql/select_addresses_needing_geocode.sql b/lib/sql/select_addresses_needing_geocode.sql new file mode 100644 index 0000000..b926691 --- /dev/null +++ b/lib/sql/select_addresses_needing_geocode.sql @@ -0,0 +1,31 @@ +-- Rows in oca_addresses that still need geocoding (missing lat/lon with parseable house number). 
+SELECT + indexnumberid, + street1, + street2, + city, + state, + postalcode, + status, + house_number, + street_name, + borough_code, + place_name, + sname, + hnum, + boro, + lat, + bin, + bbl, + cd, + ct, + council, + grc, + grc2, + msg, + msg2, + lon, + zip_code +FROM oca_addresses +WHERE lat IS NULL + AND COALESCE(house_number, '') <> ''; diff --git a/lib/sql/upsert_geocoded_addresses.sql b/lib/sql/upsert_geocoded_addresses.sql new file mode 100644 index 0000000..5d4e4b7 --- /dev/null +++ b/lib/sql/upsert_geocoded_addresses.sql @@ -0,0 +1,29 @@ +UPDATE oca_addresses AS o +SET + street1 = s.street1, + street2 = s.street2, + city = s.city, + state = s.state, + postalcode = s.postalcode, + status = s.status, + house_number = s.house_number, + street_name = s.street_name, + borough_code = s.borough_code, + place_name = s.place_name, + sname = s.sname, + hnum = s.hnum, + boro = s.boro, + lat = s.lat, + bin = s.bin, + bbl = s.bbl, + cd = s.cd, + ct = s.ct, + council = s.council, + grc = s.grc, + grc2 = s.grc2, + msg = s.msg, + msg2 = s.msg2, + lon = s.lon, + zip_code = s.zip_code +FROM oca_addresses_geocode_staging AS s +WHERE o.indexnumberid = s.indexnumberid; diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py new file mode 100644 index 0000000..e15d996 --- /dev/null +++ b/tests/test_incremental_geocode.py @@ -0,0 +1,233 @@ +import os +import tracemalloc +import unittest +from unittest import mock + +from lib.etl_geocode import ( + GEOCODE_ADDRESS_COLUMNS, + fetch_addresses_needing_geocode, + geocode_candidate_records, + row_needs_geocode, + upsert_geocoded_addresses, +) +from lib.etl_stages import geocode_and_publish_addresses + + +class RowNeedsGeocodeTests(unittest.TestCase): + def test_missing_lat_with_house_number(self): + self.assertTrue(row_needs_geocode({'lat': '', 'house_number': '123'})) + + def test_existing_lat_skipped(self): + self.assertFalse(row_needs_geocode({'lat': '40.7', 'house_number': '123'})) + + def 
test_missing_house_number_skipped(self): + self.assertFalse(row_needs_geocode({'lat': '', 'house_number': ''})) + + +class GeocodeCandidateRecordsTests(unittest.TestCase): + def test_only_missing_lat_rows_sent_to_census(self): + records = [ + {'indexnumberid': 'a', 'lat': '', 'house_number': '1', 'street1': '1 Main', 'city': 'NYC', 'postalcode': '10001'}, + {'indexnumberid': 'b', 'lat': '', 'house_number': '2', 'street1': '2 Main', 'city': 'NYC', 'postalcode': '10002'}, + ] + + def fake_geocode_record(row, addr_cols): + row = dict(row) + if row['indexnumberid'] == 'a': + row['lat'] = '40.1' + row['lon'] = '-73.9' + return row + + census_calls = [] + + def fake_census_batch(dataframe, pub_dir): + census_calls.append(list(dataframe['indexnumberid'])) + dataframe = dataframe.copy() + dataframe['lat'] = '40.2' + dataframe['lon'] = '-73.8' + return dataframe + + results = geocode_candidate_records( + records, + geocode_workers=1, + census_batch_chunk_size=2500, + pub_dir='/tmp', + geocode_record_fn=fake_geocode_record, + geocode_using_census_batch_fn=fake_census_batch, + ) + + self.assertEqual(census_calls, [['b']]) + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['lat'], '40.1') + self.assertEqual(results[1]['lat'], '40.2') + + def test_empty_candidates_skips_geocoders(self): + geocode_mock = mock.Mock() + results = geocode_candidate_records( + [], + geocode_workers=1, + census_batch_chunk_size=2500, + pub_dir='/tmp', + geocode_record_fn=geocode_mock, + ) + self.assertEqual(results, []) + geocode_mock.assert_not_called() + + def test_idempotent_rerun_fetches_no_candidates(self): + fake_db = mock.Mock() + fake_db.sql_fetch_all_from_file.return_value = [] + first = fetch_addresses_needing_geocode(fake_db) + second = fetch_addresses_needing_geocode(fake_db) + self.assertEqual(first, []) + self.assertEqual(second, []) + self.assertEqual(fake_db.sql_fetch_all_from_file.call_count, 2) + + +class FetchAddressesNeedingGeocodeTests(unittest.TestCase): + def 
test_fetch_uses_sql_file(self): + fake_db = mock.Mock() + fake_db.sql_fetch_all_from_file.return_value = [ + tuple('' if col == 'lat' else f'val-{col}' for col in GEOCODE_ADDRESS_COLUMNS), + ] + + rows = fetch_addresses_needing_geocode(fake_db) + + fake_db.sql_fetch_all_from_file.assert_called_once_with( + 'select_addresses_needing_geocode.sql' + ) + self.assertEqual(rows[0]['indexnumberid'], 'val-indexnumberid') + self.assertEqual(rows[0]['lat'], '') + + +class UpsertGeocodedAddressesTests(unittest.TestCase): + def test_upsert_writes_staging_and_merges(self): + fake_db = mock.Mock() + rows = [{'indexnumberid': 'a', 'lat': '40.1', 'lon': '-73.9', 'house_number': '1'}] + + count = upsert_geocoded_addresses(fake_db, rows) + + self.assertEqual(count, 1) + fake_db.execute_sql_file.assert_any_call('create_geocode_staging_table.sql') + fake_db.insert_rows.assert_called_once() + fake_db.execute_sql_file.assert_any_call('upsert_geocoded_addresses.sql') + + def test_empty_rows_skips_db_writes(self): + fake_db = mock.Mock() + count = upsert_geocoded_addresses(fake_db, []) + self.assertEqual(count, 0) + fake_db.execute_sql_file.assert_not_called() + + +class GeocodeStageIntegrationTests(unittest.TestCase): + def test_geocode_stage_skips_reset_and_s3_import(self): + fake_db = mock.Mock() + fake_db.sql_fetch_all_from_file.return_value = [] + fake_manifest = mock.Mock() + fake_s3 = mock.Mock() + fake_s3.list_files.return_value = [] + selection = mock.Mock(selected_zip_files=['file.zip']) + + with mock.patch('lib.etl_stages.create_date_files'), \ + mock.patch('lib.etl_stages.upload_public_file'), \ + mock.patch('os.listdir', return_value=[]): + geocode_and_publish_addresses( + fake_manifest, + fake_db, + fake_s3, + '/tmp/priv', + '/tmp/pub', + {'aws_bucket_name': 'bucket', 'aws_id': 'id', 'aws_key': 'key'}, + 'refactor/', + '2', + selection, + geocode_workers=1, + census_batch_chunk_size=2500, + ) + + executed_files = [ + call.args[0] + for call in 
fake_db.execute_sql_file.call_args_list + ] + self.assertNotIn('reset_addresses_table.sql', executed_files) + import_calls = [ + call for call in fake_db.sql.call_args_list + if call.args and 'table_import_from_s3' in call.args[0] + ] + self.assertEqual(import_calls, []) + + def test_geocode_stage_delta_path_invoked(self): + fake_db = mock.Mock() + fake_db.sql_fetch_all_from_file.return_value = [ + ( + 'case-1', '1 Main', '', 'NYC', 'NY', '10001', '', '1', 'Main', + '', '', '', '', '', '', None, '', '', '', '', '', '', '', '', None, '', + ), + ] + fake_manifest = mock.Mock() + fake_s3 = mock.Mock() + fake_s3.list_files.return_value = [] + selection = mock.Mock(selected_zip_files=['file.zip']) + + with mock.patch('lib.etl_stages.geocode_candidate_records', return_value=[{'indexnumberid': 'case-1', 'lat': '40.1'}]) as geocode_mock, \ + mock.patch('lib.etl_stages.upsert_geocoded_addresses', return_value=1) as upsert_mock, \ + mock.patch('lib.etl_stages.create_date_files'), \ + mock.patch('lib.etl_stages.upload_public_file'), \ + mock.patch('os.listdir', return_value=[]): + geocode_and_publish_addresses( + fake_manifest, + fake_db, + fake_s3, + '/tmp/priv', + '/tmp/pub', + {'aws_bucket_name': 'bucket', 'aws_id': 'id', 'aws_key': 'key'}, + 'refactor/', + '2', + selection, + geocode_workers=2, + census_batch_chunk_size=1000, + ) + + geocode_mock.assert_called_once() + upsert_mock.assert_called_once() + self.assertEqual(geocode_mock.call_args.args[1], 2) + self.assertEqual(geocode_mock.call_args.args[2], 1000) + + +class GeocodeMemoryTests(unittest.TestCase): + def test_candidate_only_geocode_uses_bounded_memory(self): + records = [ + { + 'indexnumberid': f'id-{i}', + 'lat': '', + 'house_number': str(i), + 'street1': f'{i} Main St', + 'city': 'NYC', + 'postalcode': '10001', + } + for i in range(5000) + ] + + def fake_geocode_record(row, addr_cols): + row = dict(row) + row['lat'] = '40.0' + row['lon'] = '-73.0' + return row + + tracemalloc.start() + 
geocode_candidate_records( + records, + geocode_workers=1, + census_batch_chunk_size=2500, + pub_dir='/tmp', + geocode_record_fn=fake_geocode_record, + geocode_using_census_batch_fn=mock.Mock(), + ) + _, peak = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # Candidate-only path should stay well under a 2GB envelope on sample data. + self.assertLess(peak, 50 * 1024 * 1024) + + +if __name__ == '__main__': + unittest.main() From 57de2407163cbed4ea408499f09a5df665206ef0 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 16:29:06 -0400 Subject: [PATCH 09/30] Multi-address geocode row keys --- lib/etl_geocode.py | 24 +++++- lib/sql/upsert_geocoded_addresses.sql | 7 +- tests/test_incremental_geocode.py | 120 ++++++++++++++++++++++++++ 3 files changed, 147 insertions(+), 4 deletions(-) diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py index 2e34fb8..b45e863 100644 --- a/lib/etl_geocode.py +++ b/lib/etl_geocode.py @@ -8,6 +8,10 @@ from .geocode_record import geocode_record, geocode_using_census_batch +ADDRESS_ROW_KEY_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', +] + GEOCODE_ADDRESS_COLUMNS = [ 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', 'status', 'house_number', 'street_name', 'borough_code', 'place_name', @@ -37,6 +41,20 @@ def _has_lat(value): return text != '' and text.lower() != 'nan' +def address_row_key(row): + """Stable per-address identity for merge/upsert (ingest columns only).""" + parts = [] + for col in ADDRESS_ROW_KEY_COLUMNS: + value = row.get(col) + if value is None: + parts.append('') + elif isinstance(value, float) and np.isnan(value): + parts.append('') + else: + parts.append(str(value)) + return tuple(parts) + + def row_needs_geocode(row): """Mirror select_addresses_needing_geocode.sql for unit tests.""" return not _has_lat(row.get('lat')) and str(row.get('house_number') or '').strip() != '' @@ -128,10 +146,10 @@ def geocode_candidate_records( else: 
census_results = [] - by_id = {row['indexnumberid']: row for row in geosupport_results} + by_key = {address_row_key(row): row for row in geosupport_results} for row in census_results: - by_id[row['indexnumberid']] = row - return [by_id[row['indexnumberid']] for row in records] + by_key[address_row_key(row)] = row + return [by_key[address_row_key(row)] for row in records] def upsert_geocoded_addresses(db, rows): diff --git a/lib/sql/upsert_geocoded_addresses.sql b/lib/sql/upsert_geocoded_addresses.sql index 5d4e4b7..3deeca3 100644 --- a/lib/sql/upsert_geocoded_addresses.sql +++ b/lib/sql/upsert_geocoded_addresses.sql @@ -26,4 +26,9 @@ SET lon = s.lon, zip_code = s.zip_code FROM oca_addresses_geocode_staging AS s -WHERE o.indexnumberid = s.indexnumberid; +WHERE o.indexnumberid IS NOT DISTINCT FROM s.indexnumberid + AND o.street1 IS NOT DISTINCT FROM s.street1 + AND o.street2 IS NOT DISTINCT FROM s.street2 + AND o.city IS NOT DISTINCT FROM s.city + AND o.state IS NOT DISTINCT FROM s.state + AND o.postalcode IS NOT DISTINCT FROM s.postalcode; diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py index e15d996..1bcde09 100644 --- a/tests/test_incremental_geocode.py +++ b/tests/test_incremental_geocode.py @@ -1,10 +1,13 @@ import os import tracemalloc import unittest +from pathlib import Path from unittest import mock from lib.etl_geocode import ( + ADDRESS_ROW_KEY_COLUMNS, GEOCODE_ADDRESS_COLUMNS, + address_row_key, fetch_addresses_needing_geocode, geocode_candidate_records, row_needs_geocode, @@ -73,6 +76,78 @@ def test_empty_candidates_skips_geocoders(self): self.assertEqual(results, []) geocode_mock.assert_not_called() + def test_multi_address_same_case_distinct_geocodes(self): + """Same indexnumberid, different street1 — each row keeps its own geocode.""" + shared_id = 'case-multi' + records = [ + { + 'indexnumberid': shared_id, + 'lat': '', + 'house_number': '1', + 'street1': '100 Main St', + 'street2': '', + 'city': 'NYC', + 'state': 
'NY', + 'postalcode': '10001', + }, + { + 'indexnumberid': shared_id, + 'lat': '', + 'house_number': '2', + 'street1': '200 Oak Ave', + 'street2': 'Apt 3', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10002', + }, + ] + + def fake_geocode_record(row, addr_cols): + row = dict(row) + if row['street1'] == '100 Main St': + row['lat'] = '40.100' + row['lon'] = '-73.100' + return row + + def fake_census_batch(dataframe, pub_dir): + dataframe = dataframe.copy() + lats = [] + lons = [] + for street1 in dataframe['street1']: + if street1 == '200 Oak Ave': + lats.append('40.200') + lons.append('-73.200') + else: + lats.append('') + lons.append('') + dataframe['lat'] = lats + dataframe['lon'] = lons + return dataframe + + results = geocode_candidate_records( + records, + geocode_workers=1, + census_batch_chunk_size=2500, + pub_dir='/tmp', + geocode_record_fn=fake_geocode_record, + geocode_using_census_batch_fn=fake_census_batch, + ) + + self.assertEqual(len(results), 2) + self.assertEqual(results[0]['street1'], '100 Main St') + self.assertEqual(results[0]['lat'], '40.100') + self.assertEqual(results[1]['street1'], '200 Oak Ave') + self.assertEqual(results[1]['lat'], '40.200') + self.assertNotEqual(results[0]['lat'], results[1]['lat']) + self.assertEqual( + address_row_key(results[0]), + address_row_key(records[0]), + ) + self.assertEqual( + address_row_key(results[1]), + address_row_key(records[1]), + ) + def test_idempotent_rerun_fetches_no_candidates(self): fake_db = mock.Mock() fake_db.sql_fetch_all_from_file.return_value = [] @@ -99,6 +174,51 @@ def test_fetch_uses_sql_file(self): self.assertEqual(rows[0]['lat'], '') +class AddressRowKeyTests(unittest.TestCase): + def test_address_row_key_normalizes_none(self): + row = { + 'indexnumberid': 'case-1', + 'street1': '1 Main', + 'street2': None, + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10001', + } + self.assertEqual( + address_row_key(row), + ('case-1', '1 Main', '', 'NYC', 'NY', '10001'), + ) + + def 
test_distinct_keys_for_different_street1(self): + base = { + 'indexnumberid': 'case-1', + 'street2': '', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10001', + } + key_a = address_row_key({**base, 'street1': '100 Main St'}) + key_b = address_row_key({**base, 'street1': '200 Oak Ave'}) + self.assertNotEqual(key_a, key_b) + + +class UpsertGeocodedAddressesSqlTests(unittest.TestCase): + def test_upsert_sql_matches_on_natural_address_key(self): + sql_path = Path(__file__).resolve().parents[1] / 'lib' / 'sql' / 'upsert_geocoded_addresses.sql' + sql = sql_path.read_text() + for col in ADDRESS_ROW_KEY_COLUMNS: + self.assertIn( + f'o.{col} IS NOT DISTINCT FROM s.{col}', + sql, + f'expected null-safe join on {col}', + ) + self.assertNotRegex( + sql, + r'WHERE\s+o\.indexnumberid\s*=\s*s\.indexnumberid\s*;', + 'upsert must not join on indexnumberid alone', + ) + + class UpsertGeocodedAddressesTests(unittest.TestCase): def test_upsert_writes_staging_and_merges(self): fake_db = mock.Mock() From ee8d3d70bba9ce2707ed40970258794d5cf73b06 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 17:05:51 -0400 Subject: [PATCH 10/30] Atomic staging->main promotion hardening --- lib/README.md | 4 +- lib/database.py | 41 +++++-- lib/etl.py | 10 +- lib/etl_helpers.py | 30 ----- lib/etl_promotion.py | 48 ++++++++ lib/etl_stages.py | 6 +- lib/sql/ensure_promotion_indexes.sql | 6 + lib/sql/promote_staging_to_main.sql | 118 ++++++++++++++++++ lib/sql/update_metadata.sql | 23 ---- tests/test_promotion.py | 174 +++++++++++++++++++++++++++ 10 files changed, 390 insertions(+), 70 deletions(-) create mode 100644 lib/etl_promotion.py create mode 100644 lib/sql/ensure_promotion_indexes.sql create mode 100644 lib/sql/promote_staging_to_main.sql delete mode 100644 lib/sql/update_metadata.sql create mode 100644 tests/test_promotion.py diff --git a/lib/README.md b/lib/README.md index 8e1643f..9ae6a27 100644 --- a/lib/README.md +++ b/lib/README.md @@ -29,8 +29,8 @@ A few basic 
helper functions: * `prep_db` * Prepare the Postgres database (either from scratch with SQL scripts or from a `pg_dump` file) -* `insert_staging_to_main` - * Move newly parsed records in the database over from staging tables to the main ones +* `promote_staging_to_main` (`etl_promotion.py`) + * Move newly parsed records in the database over from staging tables to the main ones (single transaction) * `create_date_files` * Create plain text and image files for the most recent date of the data extracts for display in this repo diff --git a/lib/database.py b/lib/database.py index a042b28..582b403 100644 --- a/lib/database.py +++ b/lib/database.py @@ -1,6 +1,7 @@ import urllib.parse import psycopg2 import psycopg2.extras +from contextlib import contextmanager from psycopg2 import sql import os @@ -46,20 +47,36 @@ def set_search_path(self, schema): curs.execute(sql.SQL("SET search_path TO {}, public").format(sql.Identifier(schema))) self.conn.commit() - def sql(self, SQL, autocommit = False): - """ Executes single sql statement + def execute(self, SQL, autocommit=False): + """Execute SQL without committing (for use inside transaction blocks).""" + if autocommit: + self.conn.set_session(autocommit=True) - Set auto commit to run queries like VACUUM FULL [1] - [1]: https://til.codeinthehole.com/posts/about-a-gotcha-with-psycopg2s-autocommit-handling/ - """ - if autocommit: self.conn.set_session(autocommit=True) - with self.conn.cursor() as curs: curs.execute(SQL) - if autocommit: self.conn.set_session(autocommit=False) # unset + if autocommit: + self.conn.set_session(autocommit=False) + + def sql(self, SQL, autocommit=False): + """Execute a single SQL statement and commit. 
+ + Set autocommit to run queries like VACUUM FULL [1] + [1]: https://til.codeinthehole.com/posts/about-a-gotcha-with-psycopg2s-autocommit-handling/ + """ + self.execute(SQL, autocommit=autocommit) self.conn.commit() + @contextmanager + def transaction(self): + """Run a block in one DB transaction; rollback on any exception.""" + try: + yield self + self.conn.commit() + except Exception: + self.conn.rollback() + raise + def sql_fetch_one(self, SQL): with self.conn.cursor() as curs: curs.execute(SQL) @@ -96,7 +113,7 @@ def insert_rows(self, rows, table_name): self.conn.commit() - def execute_sql_file(self, sql_file): + def execute_sql_file(self, sql_file, commit=True): """ Executes the provided sql file. Assumes the path is relative to ./sql @@ -104,7 +121,11 @@ def execute_sql_file(self, sql_file): file_path = os.path.join(os.path.dirname(__file__), 'sql', sql_file) with open(file_path, 'r', encoding='utf-8') as f: - self.sql(f.read()) + sql_text = f.read() + if commit: + self.sql(sql_text) + else: + self.execute(sql_text) def export_csv(self, table_name, file_path): diff --git a/lib/etl.py b/lib/etl.py index 7b6dde4..dad7312 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -19,12 +19,16 @@ create_date_files, csv_has_rows, download_pluto, - insert_staging_to_main, make_dir, prep_db, s3_key, upload_public_file, ) +from .etl_promotion import ( + promote_staging_to_main, + promotion_counts_checksum, + promotion_table_counts, +) from .etl_run_manifest import EtlRunManifest, completed_reprocess_files, manifest_step from .etl_stages import ( FileSelection, @@ -55,7 +59,9 @@ 'manifest_step', 'csv_has_rows', 'prep_db', - 'insert_staging_to_main', + 'promote_staging_to_main', + 'promotion_table_counts', + 'promotion_counts_checksum', 'create_date_files', 'download_pluto', 'upload_public_file', diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py index 227e7d9..6decc7b 100644 --- a/lib/etl_helpers.py +++ b/lib/etl_helpers.py @@ -64,36 +64,6 @@ def prep_db(s3, db, local_dir): 
db.execute_sql_file('create_tables.sql') -def insert_staging_to_main(db, tables): - """ - Delete all cases from main tables if they exist in the staging table, - then insert all records from the staging tables to the main tables - - issue: SET session_replication_role = replica - https://stackoverflow.com/questions/3942258/how-do-i-temporarily-disable-triggers-in-postgresql/18709987#18709987 - to a work around to avoid DELETE FROM command stalling. - A VACUUM FULL on all the tables were tried, it does not seem to help - Might be an issue with the staging table schema? - - :param db: Database object - """ - - db.sql("SET session_replication_role = replica;") - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Deleting older entries from {table}") - db.sql(f"DELETE FROM {table} WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging)") - db.sql("SET session_replication_role = default;") - - for table in tables: - if table in ('oca_metadata'): # skip these tables - continue - print(f"\t...Inserting to {table}") - db.sql(f"INSERT INTO {table} SELECT * FROM {table}_staging") - db.sql(f"DROP TABLE {table}_staging") - - def create_date_files(s3, data_file, local_dir): """ Create a text file and a custom shield image with date the data was diff --git a/lib/etl_promotion.py b/lib/etl_promotion.py new file mode 100644 index 0000000..9ffd19f --- /dev/null +++ b/lib/etl_promotion.py @@ -0,0 +1,48 @@ +import hashlib +import json + +from .etl_constants import OCA_TABLES + +PROMOTION_SQL_FILE = 'promote_staging_to_main.sql' +PROMOTION_INDEX_SQL_FILE = 'ensure_promotion_indexes.sql' + +# Tables promoted via promote_staging_to_main.sql (oca_metadata merged in-SQL). 
+PROMOTED_TABLES = [t for t in OCA_TABLES if t != 'oca_metadata'] + +ADDRESS_NATURAL_KEY_COLUMNS = [ + 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', +] + + +def ensure_promotion_indexes(db): + """Create indexes used by scoped promotion deletes when staging tables exist.""" + db.execute_sql_file(PROMOTION_INDEX_SQL_FILE, commit=False) + + +def promotion_table_counts(db, tables=None): + """Return row counts per main table (checksum hook for validation).""" + tables = tables or OCA_TABLES + counts = {} + for table in tables: + row = db.sql_fetch_one(f'SELECT COUNT(*)::bigint FROM {table}') + counts[table] = int(row[0]) if row else 0 + return counts + + +def promotion_counts_checksum(counts): + """Stable checksum string for comparing promotion snapshots.""" + payload = json.dumps(counts, sort_keys=True) + return hashlib.sha256(payload.encode('utf-8')).hexdigest() + + +def promote_staging_to_main(db): + """ + Promote all populated staging tables to main in one transaction. + + On failure, PostgreSQL rolls back deletes/inserts/metadata merge and staging + drops so a retry can re-import or re-run promotion safely. 
+ """ + with db.transaction(): + ensure_promotion_indexes(db) + db.execute_sql_file(PROMOTION_SQL_FILE, commit=False) + diff --git a/lib/etl_stages.py b/lib/etl_stages.py index d47e44d..2d2790a 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -16,10 +16,10 @@ from .etl_helpers import ( create_date_files, csv_has_rows, - insert_staging_to_main, s3_key, upload_public_file, ) +from .etl_promotion import promote_staging_to_main from .etl_geocode import ( fetch_addresses_needing_geocode, geocode_candidate_records, @@ -176,8 +176,8 @@ def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, select db.execute_sql_file('normalize_staging_after_import.sql') db.execute_sql_file('update_appearance_outcomes.sql') - insert_staging_to_main(db, OCA_TABLES) - db.execute_sql_file('update_metadata.sql') + print('\t...Promoting staging tables to main (single transaction)') + promote_staging_to_main(db) for selected_name in selection.selected_zip_files: source = 'sftp' if selected_name in selection.new_file_set else 's3_private' manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') diff --git a/lib/sql/ensure_promotion_indexes.sql b/lib/sql/ensure_promotion_indexes.sql new file mode 100644 index 0000000..bb1093c --- /dev/null +++ b/lib/sql/ensure_promotion_indexes.sql @@ -0,0 +1,6 @@ +-- Indexes supporting deterministic, scoped promotion deletes (idempotent reruns). +CREATE INDEX IF NOT EXISTS oca_addresses_promotion_natural_key_idx + ON oca_addresses (indexnumberid, street1, street2, city, state, postalcode); + +CREATE INDEX IF NOT EXISTS oca_index_staging_indexnumberid_idx + ON oca_index_staging (indexnumberid); diff --git a/lib/sql/promote_staging_to_main.sql b/lib/sql/promote_staging_to_main.sql new file mode 100644 index 0000000..5f1c435 --- /dev/null +++ b/lib/sql/promote_staging_to_main.sql @@ -0,0 +1,118 @@ +-- Atomic staging -> main promotion for one import batch. 
+-- Case scope: all indexnumberid values present in oca_index_staging. +-- oca_index uses UPSERT; child tables use scoped DELETE + INSERT; metadata merged last. + +SET session_replication_role = replica; + +-- Child tables keyed by indexnumberid (full per-case row replace for the batch). +DELETE FROM oca_appearance_outcomes +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_appearances +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_warrants +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_judgments +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_decisions +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_motions +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_events +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_parties +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +DELETE FROM oca_causes +WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); + +-- Addresses: natural-line key aligned with incremental geocode. 
+DELETE FROM oca_addresses m +WHERE m.indexnumberid IN (SELECT indexnumberid FROM oca_index_staging) +AND NOT EXISTS ( + SELECT 1 + FROM oca_addresses_staging s + WHERE s.indexnumberid = m.indexnumberid + AND m.street1 IS NOT DISTINCT FROM s.street1 + AND m.street2 IS NOT DISTINCT FROM s.street2 + AND m.city IS NOT DISTINCT FROM s.city + AND m.state IS NOT DISTINCT FROM s.state + AND m.postalcode IS NOT DISTINCT FROM s.postalcode +); + +DELETE FROM oca_addresses m +USING oca_addresses_staging s +WHERE m.indexnumberid = s.indexnumberid + AND m.street1 IS NOT DISTINCT FROM s.street1 + AND m.street2 IS NOT DISTINCT FROM s.street2 + AND m.city IS NOT DISTINCT FROM s.city + AND m.state IS NOT DISTINCT FROM s.state + AND m.postalcode IS NOT DISTINCT FROM s.postalcode; + +INSERT INTO oca_index ( + indexnumberid, court, fileddate, propertytype, classification, + specialtydesignationtypes, status, disposeddate, disposedreason, + firstpaper, primaryclaimtotal, dateofjurydemand +) +SELECT + indexnumberid, court, fileddate, propertytype, classification, + specialtydesignationtypes, status, disposeddate, disposedreason, + firstpaper, primaryclaimtotal, dateofjurydemand +FROM oca_index_staging +ON CONFLICT (indexnumberid) DO UPDATE SET + court = EXCLUDED.court, + fileddate = EXCLUDED.fileddate, + propertytype = EXCLUDED.propertytype, + classification = EXCLUDED.classification, + specialtydesignationtypes = EXCLUDED.specialtydesignationtypes, + status = EXCLUDED.status, + disposeddate = EXCLUDED.disposeddate, + disposedreason = EXCLUDED.disposedreason, + firstpaper = EXCLUDED.firstpaper, + primaryclaimtotal = EXCLUDED.primaryclaimtotal, + dateofjurydemand = EXCLUDED.dateofjurydemand; + +INSERT INTO oca_causes SELECT * FROM oca_causes_staging; +INSERT INTO oca_addresses SELECT * FROM oca_addresses_staging; +INSERT INTO oca_parties SELECT * FROM oca_parties_staging; +INSERT INTO oca_events SELECT * FROM oca_events_staging; +INSERT INTO oca_appearances SELECT * FROM 
oca_appearances_staging; +INSERT INTO oca_appearance_outcomes SELECT * FROM oca_appearance_outcomes_staging; +INSERT INTO oca_motions SELECT * FROM oca_motions_staging; +INSERT INTO oca_decisions SELECT * FROM oca_decisions_staging; +INSERT INTO oca_judgments SELECT * FROM oca_judgments_staging; +INSERT INTO oca_warrants SELECT * FROM oca_warrants_staging; + +-- Metadata merge (no nested transaction; must run before staging drops). +CREATE TABLE oca_metadata_temp AS +SELECT + COALESCE(om.indexnumberid, oms.indexnumberid) AS indexnumberid, + COALESCE(om.initialdate, oms.initialdate) AS initialdate, + COALESCE(oms.updatedate, om.updatedate) AS updatedate, + COALESCE(oms.deletedate, om.deletedate) AS deletedate +FROM oca_metadata om +FULL OUTER JOIN oca_metadata_staging oms ON om.indexnumberid = oms.indexnumberid; + +DROP TABLE oca_metadata; +ALTER TABLE oca_metadata_temp RENAME TO oca_metadata; + +DROP TABLE IF EXISTS oca_index_staging CASCADE; +DROP TABLE IF EXISTS oca_causes_staging CASCADE; +DROP TABLE IF EXISTS oca_addresses_staging CASCADE; +DROP TABLE IF EXISTS oca_parties_staging CASCADE; +DROP TABLE IF EXISTS oca_events_staging CASCADE; +DROP TABLE IF EXISTS oca_appearances_staging CASCADE; +DROP TABLE IF EXISTS oca_appearance_outcomes_staging CASCADE; +DROP TABLE IF EXISTS oca_motions_staging CASCADE; +DROP TABLE IF EXISTS oca_decisions_staging CASCADE; +DROP TABLE IF EXISTS oca_judgments_staging CASCADE; +DROP TABLE IF EXISTS oca_warrants_staging CASCADE; +DROP TABLE IF EXISTS oca_metadata_staging CASCADE; + +SET session_replication_role = default; diff --git a/lib/sql/update_metadata.sql b/lib/sql/update_metadata.sql deleted file mode 100644 index 6acdcb3..0000000 --- a/lib/sql/update_metadata.sql +++ /dev/null @@ -1,23 +0,0 @@ -BEGIN TRANSACTION; - --- Create temporary table with new data -CREATE TABLE oca_metadata_temp AS -SELECT - COALESCE(om.indexnumberid, oms.indexnumberid) AS indexnumberid, - COALESCE(om.initialdate, oms.initialdate) AS initialdate, 
- CASE - WHEN om.indexnumberid IS NULL THEN oms.updatedate - ELSE oms.updatedate - END AS updatedate, - CASE - WHEN om.indexnumberid IS NULL THEN oms.deletedate - ELSE oms.deletedate - END AS deletedate -FROM oca_metadata om -FULL OUTER JOIN oca_metadata_staging oms ON om.indexnumberid = oms.indexnumberid; - --- Replace the original table -DROP TABLE oca_metadata; -ALTER TABLE oca_metadata_temp RENAME TO oca_metadata; - -COMMIT; \ No newline at end of file diff --git a/tests/test_promotion.py b/tests/test_promotion.py new file mode 100644 index 0000000..8b3e547 --- /dev/null +++ b/tests/test_promotion.py @@ -0,0 +1,174 @@ +import unittest +from pathlib import Path +from unittest import mock + +from lib.database import Database +from lib.etl_constants import OCA_TABLES +from lib.etl_promotion import ( + ADDRESS_NATURAL_KEY_COLUMNS, + PROMOTION_SQL_FILE, + promote_staging_to_main, + promotion_counts_checksum, + promotion_table_counts, +) + + +SQL_DIR = Path(__file__).resolve().parents[1] / 'lib' / 'sql' + + +class FakeConn: + def __init__(self): + self.committed = False + self.rolled_back = False + + def commit(self): + self.committed = True + + def rollback(self): + self.rolled_back = True + + def cursor(self): + raise NotImplementedError + + +class PromoteStagingTests(unittest.TestCase): + def test_promote_uses_single_transaction(self): + db = mock.Mock() + db.transaction.return_value.__enter__ = mock.Mock(return_value=db) + db.transaction.return_value.__exit__ = mock.Mock(return_value=False) + + promote_staging_to_main(db) + + db.transaction.assert_called_once() + db.execute_sql_file.assert_any_call('ensure_promotion_indexes.sql', commit=False) + db.execute_sql_file.assert_any_call('promote_staging_to_main.sql', commit=False) + + def test_promotion_failure_rolls_back(self): + conn = FakeConn() + db = mock.Mock() + db.conn = conn + db.execute_sql_file.side_effect = RuntimeError('simulated promotion failure') + + def transaction(): + class _Txn: + def 
__enter__(self_inner): + return db + + def __exit__(self_inner, exc_type, exc, tb): + if exc_type: + conn.rollback() + return False + conn.commit() + return False + + return _Txn() + + db.transaction.side_effect = transaction + + with self.assertRaises(RuntimeError): + promote_staging_to_main(db) + + self.assertTrue(conn.rolled_back) + self.assertFalse(conn.committed) + + def test_promotion_success_commits_once(self): + conn = FakeConn() + db = mock.Mock() + db.conn = conn + + def transaction(): + class _Txn: + def __enter__(self_inner): + return db + + def __exit__(self_inner, exc_type, exc, tb): + if exc_type: + conn.rollback() + return False + conn.commit() + return False + + return _Txn() + + db.transaction.side_effect = transaction + promote_staging_to_main(db) + + self.assertTrue(conn.committed) + self.assertFalse(conn.rolled_back) + + def test_counts_checksum_stable(self): + counts_a = {'oca_index': 1, 'oca_addresses': 2} + counts_b = {'oca_addresses': 2, 'oca_index': 1} + self.assertEqual( + promotion_counts_checksum(counts_a), + promotion_counts_checksum(counts_b), + ) + + +class PromoteStagingSqlContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.sql = (SQL_DIR / PROMOTION_SQL_FILE).read_text(encoding='utf-8') + + def test_single_transaction_session_role_reset(self): + self.assertIn('SET session_replication_role = replica', self.sql) + self.assertIn('SET session_replication_role = default', self.sql) + + def test_oca_index_upsert_not_delete(self): + self.assertIn('ON CONFLICT (indexnumberid) DO UPDATE', self.sql) + self.assertNotRegex(self.sql, r'DELETE FROM oca_index\b') + + def test_addresses_use_natural_key_delete(self): + for col in ADDRESS_NATURAL_KEY_COLUMNS: + if col == 'indexnumberid': + continue + self.assertIn(f'm.{col} IS NOT DISTINCT FROM s.{col}', self.sql) + + def test_metadata_merged_before_staging_drop(self): + metadata_pos = self.sql.index('CREATE TABLE oca_metadata_temp') + drop_index_pos = self.sql.index('DROP 
TABLE IF EXISTS oca_index_staging') + self.assertLess(metadata_pos, drop_index_pos) + + def test_all_staging_tables_dropped(self): + for table in OCA_TABLES: + self.assertIn(f'DROP TABLE IF EXISTS {table}_staging', self.sql) + + +class DatabaseTransactionTests(unittest.TestCase): + @staticmethod + def _mock_connection(): + conn = mock.Mock() + cursor = mock.MagicMock() + cursor.__enter__.return_value = cursor + cursor.__exit__.return_value = False + conn.cursor.return_value = cursor + return conn + + @mock.patch('lib.database.psycopg2.connect') + def test_transaction_commits_on_success(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + with db.transaction(): + db.execute('SELECT 1') + conn.commit.assert_called_once() + conn.rollback.assert_not_called() + + @mock.patch('lib.database.psycopg2.connect') + def test_transaction_rolls_back_on_error(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + with self.assertRaises(RuntimeError): + with db.transaction(): + raise RuntimeError('fail') + conn.rollback.assert_called_once() + + +class PromotionTableCountsTests(unittest.TestCase): + def test_promotion_table_counts_queries_each_table(self): + db = mock.Mock() + db.sql_fetch_one.return_value = (42,) + counts = promotion_table_counts(db, tables=['oca_index', 'oca_causes']) + self.assertEqual(counts, {'oca_index': 42, 'oca_causes': 42}) + self.assertEqual(db.sql_fetch_one.call_count, 2) From db12b023886e36ac02fa43a7370d243857aa5570 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 27 May 2026 21:27:30 -0400 Subject: [PATCH 11/30] Publish optimization + operational hardening --- README.md | 10 ++ dockerhub-publish.sh | 51 +++++++ docs/operations/weekly-etl-scheduling.md | 165 +++++++++++++++++++++++ k8s/k8s-cron-job.yaml | 52 +++++++ k8s/oca-etl-secret.example.yaml | 27 ++++ lib/etl.py | 9 +- 
lib/etl_publish.py | 64 +++++++++ lib/etl_stages.py | 89 ++++++------ tests/test_etl_publish.py | 87 ++++++++++++ tests/test_incremental_geocode.py | 10 ++ tests/test_k8s_cron_job.py | 45 +++++++ 11 files changed, 568 insertions(+), 41 deletions(-) create mode 100755 dockerhub-publish.sh create mode 100644 docs/operations/weekly-etl-scheduling.md create mode 100644 k8s/k8s-cron-job.yaml create mode 100644 k8s/oca-etl-secret.example.yaml create mode 100644 lib/etl_publish.py create mode 100644 tests/test_etl_publish.py create mode 100644 tests/test_k8s_cron_job.py diff --git a/README.md b/README.md index 6cb8b7c..583b8e4 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,16 @@ To run the whole process in the docker container run: docker-compose up ``` +### Weekly scheduling and Kubernetes + +See [`docs/operations/weekly-etl-scheduling.md`](docs/operations/weekly-etl-scheduling.md) for: + +- local Docker + **cron** (weekly example), +- **Kubernetes CronJob** (`k8s/k8s-cron-job.yaml`, 2Gi memory limit, secrets via `oca-etl-secrets`), +- **AWS EventBridge + ECS Fargate** (weekly task schedule). + +Create cluster secrets from [`k8s/oca-etl-secret.example.yaml`](k8s/oca-etl-secret.example.yaml); do not commit real credentials. + ### Runtime controls (Step 1 refactor) These optional variables let operators isolate schema/data paths and tune memory-sensitive parts of the run. If omitted, behavior remains the same as before (new files only, default schema/search path, default worker/chunk values). 
diff --git a/dockerhub-publish.sh b/dockerhub-publish.sh new file mode 100755 index 0000000..e1c6872 --- /dev/null +++ b/dockerhub-publish.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -eo pipefail + +# --- Configuration --- +DOCKER_USER="justfix" +DOCKER_TEAM="justfixnyc" +REPO_NAME="oca" +IMAGE_TAG="latest" # Or use a dynamic tag like $1 or a version number + +FULL_IMAGE_NAME="${DOCKER_TEAM}/${REPO_NAME}:${IMAGE_TAG}" +DOCKERFILE_PATH="./Dockerfile" # Path to your Dockerfile + +# Ensure credentials are set as environment variables for security +if [ -z "$DOCKER_PASSWORD" ]; then + echo "Error: DOCKER_PASSWORD environment variable not set." + exit 1 +fi +# --------------------- + +echo "Starting Docker image build and push process..." + +# 1. Log in to Docker Hub using standard input for the password for security +echo "Logging in to Docker Hub..." +echo "$DOCKER_PASSWORD" | docker login --username "$DOCKER_USER" --password-stdin +if [ $? -ne 0 ]; then + echo "Error: Docker login failed." + exit 1 +fi +echo "Successfully logged in." + +# 2. Build the Docker image +echo "Building image: ${FULL_IMAGE_NAME} from ${DOCKERFILE_PATH}..." +docker build -f "${DOCKERFILE_PATH}" -t "${FULL_IMAGE_NAME}" . +if [ $? -ne 0 ]; then + echo "Error: Docker build failed." + exit 1 +fi +echo "Successfully built image." + +# 3. Push the image to Docker Hub +echo "Pushing image: ${FULL_IMAGE_NAME} to Docker Hub..." +docker push "${FULL_IMAGE_NAME}" +if [ $? -ne 0 ]; then + echo "Error: Docker push failed." + exit 1 +fi +echo "Successfully pushed image to Docker Hub." + +# Optional: Log out of Docker Hub after pushing +docker logout +echo "Logged out of Docker Hub." 
diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md new file mode 100644 index 0000000..1b6e778 --- /dev/null +++ b/docs/operations/weekly-etl-scheduling.md @@ -0,0 +1,165 @@ +# Weekly OCA ETL scheduling and deployment + +The OCA pipeline ingests new SFTP XML zip files weekly, promotes staging data in PostgreSQL, geocodes addresses incrementally, and publishes CSVs to S3 via `aws_s3`. All three supported schedulers run the same container entrypoint: + +```bash +python oca_update.py +``` + +Use Docker (or the published image `justfixnyc/oca:latest`) with credentials supplied via environment variables or a secret store. See [Runtime controls](#runtime-controls) and the root [README](../../README.md). + +## Runtime controls + +| Variable | Purpose | Production default | +|----------|---------|-------------------| +| `MODE` | Publish mode (`2` = full S3 publish) | `2` | +| `DATABASE_URL` | PostgreSQL connection (RDS) | required | +| `AWS_ACCESS_KEY_ID` / `AWS_SECRET_ACCESS_KEY` | S3 + RDS `aws_s3` credentials | required (or IAM role on ECS) | +| `AWS_S3_BUCKET_NAME` | Target bucket | required | +| `SFTP_*` | OCA SFTP download | required | +| `DB_SCHEMA` | `search_path` schema (refactor/E2E) | empty → `public` | +| `S3_PREFIX` | Key prefix for `private/` and `public/` | empty → bucket root | +| `REPROCESS_GLOB` | Replay zip files from S3 `private/` | empty | +| `FORCE_REPROCESS` | Replay manifest-completed files | `false` | +| `GEOCODE_WORKERS` | Geosupport pool size | CPU count | +| `CENSUS_BATCH_CHUNK_SIZE` | Census batch chunk | `2500` | +| `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess chunk | `1000` | + +Refactor and E2E runs must set `S3_PREFIX=refactor/` (or another isolated prefix) so reads/writes stay out of production public paths. + +Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if geocoding approaches the limit. 
+ +## Publish behavior (Step 6) + +- **Core tables:** every table in `OCA_TABLES` is exported after a successful promotion batch. Selective skip per table is unsafe when `oca_index_staging` has rows: promotion deletes child rows for the batch even when a child staging CSV was empty. +- **Addresses:** when incremental geocode has zero candidates and `oca_addresses_staging` had no rows this run, address CSV/view exports and `create_addresses_views.sql` are skipped. +- **S3 encryption:** SSE-S3 normalization runs only on objects exported in the current run (not a full public-prefix scan). + +## 1. Local Docker + cron (weekly) + +Best for a single host with Docker and an `.env` file. + +**Weekly schedule example** (Saturdays 12:00 US/Eastern, same cadence as K8s manifest): + +```cron +# /etc/cron.d/oca-etl — adjust path to your clone +0 12 * * 6 root cd /path/to/oca && /usr/bin/docker compose run --rm \ + -e MODE=2 \ + -e GEOCODE_WORKERS=2 \ + app python oca_update.py >> /var/log/oca-etl.log 2>&1 +``` + +Ensure `.env` in the repo root defines `DATABASE_URL`, AWS, and SFTP variables (see `.env.example`). Do not commit `.env`. + +**Manual run:** + +```bash +docker compose run --rm app python oca_update.py +``` + +**Refactor / replay example:** + +```bash +docker compose run --rm app env \ + DB_SCHEMA=oca_refactor \ + S3_PREFIX=refactor/ \ + REPROCESS_GLOB='LandlordTenant.Incr.2025-*.zip' \ + FORCE_REPROCESS=true \ + GEOCODE_WORKERS=2 \ + python oca_update.py +``` + +## 2. Kubernetes CronJob (weekly) + +Manifest: [`k8s/k8s-cron-job.yaml`](../../k8s/k8s-cron-job.yaml). + +**Schedule:** `0 12 * * 6` with `timeZone: America/New_York` (weekly Saturday noon). + +**Memory:** requests `1536Mi`, limit `2Gi` (2 GB class). + +**Secrets:** create `oca-etl-secrets` before applying the CronJob (see [`k8s/oca-etl-secret.example.yaml`](../../k8s/oca-etl-secret.example.yaml)). 
+ +```bash +kubectl apply -f k8s/oca-etl-secret.example.yaml # after editing placeholders +kubectl apply -f k8s/k8s-cron-job.yaml +kubectl get cronjob oca-etl +``` + +Non-secret runtime knobs are set inline in the CronJob (`MODE`, `GEOCODE_WORKERS`, etc.). Override `DB_SCHEMA` / `S3_PREFIX` there for refactor jobs. + +**One-off job from the CronJob template:** + +```bash +kubectl create job --from=cronjob/oca-etl oca-etl-manual-$(date +%s) +kubectl logs -f job/oca-etl-manual-<timestamp> +``` + +## 3. AWS EventBridge + ECS Fargate (weekly, non-Kubernetes) + +Use when production runs on AWS without a Kubernetes cluster. EventBridge starts an ECS task on a schedule; the task uses the same image and command as Docker/K8s. + +**High-level steps** + +1. Push `justfixnyc/oca:latest` (or your ECR mirror) and register a Fargate task definition with: + - `command`: `["python", "oca_update.py"]` + - `memory`: `2048` (hard limit, MiB) + - `cpu`: `1024` (1 vCPU; adjust if needed) + - Secrets from AWS Secrets Manager or SSM Parameter Store → container environment (same keys as `.env.example`) + - Task role: S3 access for the bucket; execution role: ECR pull + secrets +2. Create an ECS cluster; an ECS service is optional because scheduled tasks can run standalone. +3. EventBridge rule (weekly Saturday 12:00 ET): + +```json +{ + "scheduleExpression": "cron(0 12 ? * SAT *)", + "scheduleExpressionTimezone": "America/New_York", + "state": "ENABLED", + "targets": [{ + "Arn": "arn:aws:ecs:us-east-1:ACCOUNT_ID:cluster/oca-etl", + "RoleArn": "arn:aws:iam::ACCOUNT_ID:role/EventBridgeECSRunTask", + "EcsParameters": { + "TaskDefinitionArn": "arn:aws:ecs:us-east-1:ACCOUNT_ID:task-definition/oca-etl:1", + "LaunchType": "FARGATE", + "NetworkConfiguration": { + "awsvpcConfiguration": { + "subnets": ["subnet-xxx"], + "securityGroups": ["sg-xxx"], + "assignPublicIp": "DISABLED" + } + } + } + }] +} +``` + +Replace ARNs, subnets, and security groups. 
The task needs outbound network access (SFTP for the OCA download, HTTPS for the Census geocoder and S3) and RDS connectivity from the task subnets. + +**Environment example (task definition fragment):** + +```json +"environment": [ + { "name": "MODE", "value": "2" }, + { "name": "GEOCODE_WORKERS", "value": "2" }, + { "name": "CENSUS_BATCH_CHUNK_SIZE", "value": "2500" }, + { "name": "CSV_ROW_CHECK_CHUNK_SIZE", "value": "1000" } +], +"secrets": [ + { "name": "DATABASE_URL", "valueFrom": "arn:aws:secretsmanager:us-east-1:ACCOUNT:secret:oca-etl:DATABASE_URL::" }, + { "name": "AWS_ACCESS_KEY_ID", "valueFrom": "..." } +] +``` + +On ECS, prefer IAM task roles for S3 instead of long-lived access keys when RDS `aws_s3` integration allows it. + +## Validation checklist + +- [ ] `docker compose run --rm app python -m unittest discover -s tests -p "test_*.py"` +- [ ] CronJob or ECS task memory limit ≤ 2 GiB; geocode workers tuned if OOM +- [ ] Secrets not stored in git-tracked manifests (use K8s Secret / Secrets Manager) +- [ ] Refactor runs use `S3_PREFIX=refactor/` (or dedicated prefix) + +## Related files + +- [`k8s/k8s-cron-job.yaml`](../../k8s/k8s-cron-job.yaml) — CronJob, resources, env +- [`k8s/oca-etl-secret.example.yaml`](../../k8s/oca-etl-secret.example.yaml) — secret template +- [`README.md`](../../README.md) — local setup and runtime controls diff --git a/k8s/k8s-cron-job.yaml b/k8s/k8s-cron-job.yaml new file mode 100644 index 0000000..43eb52d --- /dev/null +++ b/k8s/k8s-cron-job.yaml @@ -0,0 +1,52 @@ +# Weekly OCA ETL (Saturday 12:00 US/Eastern). 
See docs/operations/weekly-etl-scheduling.md +apiVersion: batch/v1 +kind: CronJob +metadata: + name: oca-etl +spec: + schedule: "0 12 * * 6" + timeZone: America/New_York + failedJobsHistoryLimit: 3 + jobTemplate: + spec: + backoffLimit: 0 + template: + spec: + restartPolicy: Never + containers: + - name: oca-etl + image: justfixnyc/oca:latest + imagePullPolicy: Always + command: + - python + - oca_update.py + # ~2GB class: Geosupport + DuckDB staging + geocode pools + resources: + requests: + memory: "1536Mi" + cpu: "500m" + limits: + memory: "2Gi" + cpu: "2" + envFrom: + - secretRef: + name: oca-etl-secrets + env: + - name: MODE + value: "2" + # Tune down if geocode OOMs near the 2Gi limit + - name: GEOCODE_WORKERS + value: "2" + - name: CENSUS_BATCH_CHUNK_SIZE + value: "2500" + - name: CSV_ROW_CHECK_CHUNK_SIZE + value: "1000" + # Production: leave empty. Refactor/E2E: set e.g. refactor/ + - name: S3_PREFIX + value: "" + - name: DB_SCHEMA + value: "" + - name: REPROCESS_GLOB + value: "" + - name: FORCE_REPROCESS + value: "false" diff --git a/k8s/oca-etl-secret.example.yaml b/k8s/oca-etl-secret.example.yaml new file mode 100644 index 0000000..b2d2da7 --- /dev/null +++ b/k8s/oca-etl-secret.example.yaml @@ -0,0 +1,27 @@ +# Example Secret for oca-etl CronJob. Do not commit real credentials. +# +# kubectl create secret generic oca-etl-secrets \ +# --from-literal=DATABASE_URL='postgresql://...' \ +# --from-literal=AWS_ACCESS_KEY_ID='...' \ +# --from-literal=AWS_SECRET_ACCESS_KEY='...' \ +# --from-literal=AWS_S3_BUCKET_NAME='oca-2-dev' \ +# --from-literal=SFTP_HOST='sftp.nycourts.gov' \ +# --from-literal=SFTP_USER='...' \ +# --from-literal=SFTP_PSWD='...' 
\ +# --from-literal=SFTP_DIR='139' +# +# Or apply this template after replacing placeholders: +apiVersion: v1 +kind: Secret +metadata: + name: oca-etl-secrets +type: Opaque +stringData: + DATABASE_URL: "postgresql://USER:PASSWORD@HOST:5432/DATABASE" + AWS_ACCESS_KEY_ID: "REPLACE_ME" + AWS_SECRET_ACCESS_KEY: "REPLACE_ME" + AWS_S3_BUCKET_NAME: "oca-2-dev" + SFTP_HOST: "sftp.nycourts.gov" + SFTP_USER: "REPLACE_ME" + SFTP_PSWD: "REPLACE_ME" + SFTP_DIR: "139" diff --git a/lib/etl.py b/lib/etl.py index dad7312..55d348c 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -119,11 +119,14 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=csv_row_check_chunk_size, ) - import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection) - publish_core_tables(manifest, db, s3_args, s3_prefix) + staging_tables_with_data = import_and_promote_staging( + manifest, db, pub_dir, s3_args, s3_prefix, selection + ) + published_core_keys = publish_core_tables(manifest, db, s3_args, s3_prefix) geocode_and_publish_addresses( manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, - geocode_workers, census_batch_chunk_size + geocode_workers, census_batch_chunk_size, + staging_tables_with_data, published_core_keys, ) manifest.mark_run_completed( diff --git a/lib/etl_publish.py b/lib/etl_publish.py new file mode 100644 index 0000000..e806c61 --- /dev/null +++ b/lib/etl_publish.py @@ -0,0 +1,64 @@ +"""S3 publish helpers: selective exports and targeted post-publish encryption.""" + +import os + +from .etl_constants import OCA_TABLES, S3_PUBLIC_FOLDER +from .etl_helpers import csv_has_rows, s3_key + + +def staging_tables_with_rows(pub_dir): + """Main table names whose staging CSV had at least one data row this run.""" + tables = [] + for table in OCA_TABLES: + csv_path = os.path.join(pub_dir, f"{table}_staging.csv") + if os.path.isfile(csv_path) and 
csv_has_rows(csv_path): + tables.append(table) + return set(tables) + + +def should_publish_address_exports(staging_tables_with_data, geocode_candidate_count): + """ + Whether address CSVs and derived views need S3 export this run. + + Skips full-table address exports when incremental geocode has no work and + promotion did not load new address staging rows. Core table exports are not + skipped when oca_index_staging has rows: promotion deletes child rows for the + batch even when a child staging CSV was empty. + """ + if geocode_candidate_count > 0: + return True + return 'oca_addresses' in staging_tables_with_data + + +def export_table_to_s3(db, table, s3_filename, s3_args, s3_prefix): + """Export one main table via aws_s3.query_export_to_s3; return the object key.""" + object_key = s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix) + db.sql(f""" + SELECT * from aws_s3.query_export_to_s3( + 'SELECT * from {table}', + aws_commons.create_s3_uri( + '{s3_args["aws_bucket_name"]}', + '{object_key}', + 'us-east-1' + ), + options :='FORMAT CSV, HEADER'); + """) + return object_key + + +ADDRESS_VIEW_EXPORTS = ( + ('oca_addresses_with_bbl', 'oca_addresses_with_bbl.csv'), + ('oca_addresses_with_ct', 'oca_addresses_with_ct.csv'), + ('oca_addresses_public', 'oca_addresses.csv'), +) + + +def normalize_published_s3_encryption(s3, s3_prefix, object_keys): + """Re-encrypt only objects written during this publish pass (SSE-S3).""" + keys = sorted({k for k in object_keys if k}) + if not keys: + return + print(f'Updating server-side encryption for {len(keys)} published S3 object(s)') + for object_key in keys: + print('-', object_key) + s3.update_encryption(object_key) diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 2d2790a..83e7248 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -20,6 +20,13 @@ upload_public_file, ) from .etl_promotion import promote_staging_to_main +from .etl_publish import ( + ADDRESS_VIEW_EXPORTS, + export_table_to_s3, + 
normalize_published_s3_encryption, + should_publish_address_exports, + staging_tables_with_rows, +) from .etl_geocode import ( fetch_addresses_needing_geocode, geocode_candidate_records, @@ -157,6 +164,7 @@ def preprocess_and_upload_staging_csvs( def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection): + imported_staging_tables = staging_tables_with_rows(pub_dir) staging_tables = [t + '_staging' for t in OCA_TABLES] manifest.upsert_step('promote_staging', 'running') db.execute_sql_file('create_tables_staging.sql') @@ -182,26 +190,33 @@ def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, select source = 'sftp' if selected_name in selection.new_file_set else 's3_private' manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') manifest.upsert_step('promote_staging', 'completed') + return imported_staging_tables def publish_core_tables(manifest, db, s3_args, s3_prefix): + """ + Export all core tables after promotion. + + When oca_index_staging has rows, promotion deletes child rows for the batch + even if a child staging CSV was empty, so per-table skip is unsafe. 
+ """ manifest.upsert_step('publish_tables', 'running') + published_keys = [] for t in OCA_TABLES: s3_filename = t + '.csv' if t == "oca_addresses": s3_filename = "oca_addresses_private.csv" - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from {t}', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) + published_keys.append( + export_table_to_s3(db, t, s3_filename, s3_args, s3_prefix) + ) manifest.upsert_step('publish_tables', 'completed') + return published_keys def geocode_and_publish_addresses( manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, - geocode_workers, census_batch_chunk_size + geocode_workers, census_batch_chunk_size, staging_tables_with_data, + published_core_keys ): manifest.upsert_step('geocode_refresh', 'running') candidates = fetch_addresses_needing_geocode(db) @@ -214,42 +229,40 @@ def geocode_and_publish_addresses( ) upsert_geocoded_addresses(db, geocoded_rows) - csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") - db.export_csv('oca_addresses', csv_filepath) + publish_addresses = should_publish_address_exports( + staging_tables_with_data, len(candidates) + ) + published_keys = list(published_core_keys or []) + if publish_addresses: + csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") + db.export_csv('oca_addresses', csv_filepath) + db.execute_sql_file('create_addresses_views.sql') + for view_name, s3_filename in ADDRESS_VIEW_EXPORTS: + published_keys.append( + export_table_to_s3(db, view_name, s3_filename, s3_args, s3_prefix) + ) + published_keys.append( + s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix) + ) + else: + print( + 'Skipping address CSV/view publish: no geocode candidates and ' + 'no oca_addresses_staging rows this run' + ) + create_date_files(s3, selection.selected_zip_files[-1], pub_dir) - public_files = [i for i in 
os.listdir(pub_dir) if i in ('last-updated-shield.png', 'last-updated-date.txt', 'oca_addresses_private.csv')] + public_files = ['last-updated-shield.png', 'last-updated-date.txt'] + if publish_addresses: + public_files.append('oca_addresses_private.csv') with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) + for date_file in public_files: + published_keys.append(s3_key(f"{S3_PUBLIC_FOLDER}/{date_file}", s3_prefix)) + for f in os.listdir(priv_dir): if f != '.DS_Store': s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - db.execute_sql_file('create_addresses_views.sql') - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_bbl', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_bbl.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_with_ct', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_with_ct.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - db.sql(f""" - SELECT * from aws_s3.query_export_to_s3( - 'SELECT * from oca_addresses_public', - aws_commons.create_s3_uri('{s3_args["aws_bucket_name"]}', '{s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses.csv", s3_prefix)}', 'us-east-1'), - options :='FORMAT CSV, HEADER'); - """) - print('Updating server-side encryption for S3 files') - public_folder = s3_key(S3_PUBLIC_FOLDER, s3_prefix) - public_files_to_encrypt = [ - f for f in s3.list_files('', public_folder) - if not f.endswith('_staging.csv') and 'oca_addresses_private' not in f - ] - for f in public_files_to_encrypt: - print('-', f) - 
s3.update_encryption(s3_key(f"{S3_PUBLIC_FOLDER}/{f}", s3_prefix)) + + normalize_published_s3_encryption(s3, s3_prefix, published_keys) manifest.upsert_step('geocode_refresh', 'completed') diff --git a/tests/test_etl_publish.py b/tests/test_etl_publish.py new file mode 100644 index 0000000..6bd97ba --- /dev/null +++ b/tests/test_etl_publish.py @@ -0,0 +1,87 @@ +import os +import tempfile +import unittest +from unittest import mock + +from lib.etl_publish import ( + should_publish_address_exports, + staging_tables_with_rows, +) +from lib.etl_stages import geocode_and_publish_addresses + + +class StagingTablesWithRowsTests(unittest.TestCase): + def test_detects_non_empty_staging_csv(self): + with tempfile.TemporaryDirectory() as pub_dir: + path = os.path.join(pub_dir, 'oca_addresses_staging.csv') + with open(path, 'w', encoding='utf-8') as handle: + handle.write('indexnumberid\n') + handle.write('case-1\n') + found = staging_tables_with_rows(pub_dir) + self.assertEqual(found, {'oca_addresses'}) + + def test_empty_staging_csv_excluded(self): + with tempfile.TemporaryDirectory() as pub_dir: + path = os.path.join(pub_dir, 'oca_index_staging.csv') + with open(path, 'w', encoding='utf-8') as handle: + handle.write('indexnumberid\n') + found = staging_tables_with_rows(pub_dir) + self.assertEqual(found, set()) + + +class ShouldPublishAddressExportsTests(unittest.TestCase): + def test_publish_when_geocode_candidates(self): + self.assertTrue(should_publish_address_exports(set(), 3)) + + def test_publish_when_addresses_staging_had_rows(self): + self.assertTrue(should_publish_address_exports({'oca_addresses'}, 0)) + + def test_skip_when_no_address_changes(self): + self.assertFalse(should_publish_address_exports({'oca_index'}, 0)) + self.assertFalse(should_publish_address_exports(set(), 0)) + + +class GeocodePublishSkipTests(unittest.TestCase): + def test_skips_address_exports_when_unchanged(self): + fake_db = mock.Mock() + fake_db.sql_fetch_all_from_file.return_value = [] + 
fake_manifest = mock.Mock() + fake_s3 = mock.Mock() + selection = mock.Mock(selected_zip_files=['file.zip']) + + with mock.patch('lib.etl_stages.create_date_files'), \ + mock.patch('lib.etl_stages.upload_public_file'), \ + mock.patch('lib.etl_stages.multiprocessing.Pool') as pool_mock, \ + mock.patch('lib.etl_stages.normalize_published_s3_encryption') as encrypt_mock, \ + mock.patch('os.listdir', return_value=[]): + pool_mock.return_value.__enter__.return_value.starmap.return_value = [] + geocode_and_publish_addresses( + fake_manifest, + fake_db, + fake_s3, + '/tmp/priv', + '/tmp/pub', + {'aws_bucket_name': 'bucket', 'aws_id': 'id', 'aws_key': 'key'}, + 'refactor/', + '2', + selection, + geocode_workers=1, + census_batch_chunk_size=2500, + staging_tables_with_data={'oca_index'}, + published_core_keys=['refactor/public/oca_index.csv'], + ) + + fake_db.export_csv.assert_not_called() + view_exports = [ + call for call in fake_db.sql.call_args_list + if call.args and 'query_export_to_s3' in call.args[0] + ] + self.assertEqual(view_exports, []) + encrypt_mock.assert_called_once() + encrypted_keys = encrypt_mock.call_args[0][2] + self.assertIn('refactor/public/oca_index.csv', encrypted_keys) + self.assertNotIn('refactor/public/oca_addresses.csv', encrypted_keys) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py index 1bcde09..7682304 100644 --- a/tests/test_incremental_geocode.py +++ b/tests/test_incremental_geocode.py @@ -249,7 +249,10 @@ def test_geocode_stage_skips_reset_and_s3_import(self): with mock.patch('lib.etl_stages.create_date_files'), \ mock.patch('lib.etl_stages.upload_public_file'), \ + mock.patch('lib.etl_stages.multiprocessing.Pool') as pool_mock, \ + mock.patch('lib.etl_stages.normalize_published_s3_encryption'), \ mock.patch('os.listdir', return_value=[]): + pool_mock.return_value.__enter__.return_value.starmap.return_value = [] geocode_and_publish_addresses( 
fake_manifest, fake_db, @@ -262,6 +265,8 @@ def test_geocode_stage_skips_reset_and_s3_import(self): selection, geocode_workers=1, census_batch_chunk_size=2500, + staging_tables_with_data=set(), + published_core_keys=[], ) executed_files = [ @@ -292,7 +297,10 @@ def test_geocode_stage_delta_path_invoked(self): mock.patch('lib.etl_stages.upsert_geocoded_addresses', return_value=1) as upsert_mock, \ mock.patch('lib.etl_stages.create_date_files'), \ mock.patch('lib.etl_stages.upload_public_file'), \ + mock.patch('lib.etl_stages.multiprocessing.Pool') as pool_mock, \ + mock.patch('lib.etl_stages.normalize_published_s3_encryption'), \ mock.patch('os.listdir', return_value=[]): + pool_mock.return_value.__enter__.return_value.starmap.return_value = [] geocode_and_publish_addresses( fake_manifest, fake_db, @@ -305,6 +313,8 @@ def test_geocode_stage_delta_path_invoked(self): selection, geocode_workers=2, census_batch_chunk_size=1000, + staging_tables_with_data={'oca_addresses'}, + published_core_keys=[], ) geocode_mock.assert_called_once() diff --git a/tests/test_k8s_cron_job.py b/tests/test_k8s_cron_job.py new file mode 100644 index 0000000..5adcf95 --- /dev/null +++ b/tests/test_k8s_cron_job.py @@ -0,0 +1,45 @@ +import os +import re +import unittest + +K8S_CRON_JOB = os.path.join( + os.path.dirname(__file__), '..', 'k8s', 'k8s-cron-job.yaml' +) + + +class K8sCronJobManifestTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + with open(K8S_CRON_JOB, encoding='utf-8') as handle: + cls.raw = handle.read() + + def test_memory_limit_two_gib_class(self): + self.assertIn('memory: "2Gi"', self.raw) + self.assertIn('memory: "1536Mi"', self.raw) + + def test_credentials_from_secret_not_plaintext(self): + self.assertIn('secretRef:', self.raw) + self.assertIn('name: oca-etl-secrets', self.raw) + self.assertNotIn('name: DATABASE_URL', self.raw) + self.assertNotIn('name: SFTP_PSWD', self.raw) + + def test_runtime_knobs_documented_in_env(self): + for name in ( + 'MODE', + 
'GEOCODE_WORKERS', + 'CENSUS_BATCH_CHUNK_SIZE', + 'CSV_ROW_CHECK_CHUNK_SIZE', + 'S3_PREFIX', + 'DB_SCHEMA', + 'REPROCESS_GLOB', + 'FORCE_REPROCESS', + ): + self.assertIn(f'name: {name}', self.raw) + + def test_no_embedded_aws_keys_or_passwords(self): + self.assertNotRegex(self.raw, r'AKIA[0-9A-Z]{16}') + self.assertNotRegex(self.raw, r'postgresql://[^:]+:[^@]+@') + + +if __name__ == '__main__': + unittest.main() From 570ed86c01ee5d5a94e30d9f71539a47438a4293 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 28 May 2026 09:00:31 -0400 Subject: [PATCH 12/30] Schema-safe table bootstrap before staging import --- lib/etl.py | 8 ++++- lib/etl_helpers.py | 6 ++-- lib/etl_stages.py | 33 ++++++++++++++++-- lib/sql/create_tables.sql | 63 +++++++++++++--------------------- tests/test_schema_bootstrap.py | 59 +++++++++++++++++++++++++++++++ 5 files changed, 124 insertions(+), 45 deletions(-) create mode 100644 tests/test_schema_bootstrap.py diff --git a/lib/etl.py b/lib/etl.py index 55d348c..ed7f48c 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -120,7 +120,13 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None csv_preprocess_chunk_size=csv_row_check_chunk_size, ) staging_tables_with_data = import_and_promote_staging( - manifest, db, pub_dir, s3_args, s3_prefix, selection + manifest, + db, + pub_dir, + s3_args, + s3_prefix, + selection, + runtime_args.get('db_schema') or db_args.get('schema') or 'public', ) published_core_keys = publish_core_tables(manifest, db, s3_args, s3_prefix) geocode_and_publish_addresses( diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py index 6decc7b..6570eca 100644 --- a/lib/etl_helpers.py +++ b/lib/etl_helpers.py @@ -64,13 +64,13 @@ def prep_db(s3, db, local_dir): db.execute_sql_file('create_tables.sql') -def create_date_files(s3, data_file, local_dir): +def create_date_files(data_file, local_dir): """ Create a text file and a custom shield image with date the data was - last updated and add them to the 
public S3 folder. + last updated. - :param s3: S3 object :param data_file: file path for data being processed + :param local_dir: path for local directory to save date files """ date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 83e7248..e9fd99a 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -163,10 +163,39 @@ def preprocess_and_upload_staging_csvs( pool.starmap(upload_public_file, files_zip) -def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection): +def _assert_schema_bootstrap_context(db, expected_schema): + schema_name = (expected_schema or '').strip() + if not schema_name: + raise RuntimeError('DB schema must be set before running core table bootstrap.') + + schema_row = db.sql_fetch_one( + "SELECT current_schema(), current_setting('search_path')" + ) + current_schema, search_path = schema_row if schema_row else (None, '') + if not current_schema: + raise RuntimeError('Unable to resolve active schema before core table bootstrap.') + + if current_schema != schema_name: + raise RuntimeError( + f"Schema bootstrap guard failed: expected current_schema '{schema_name}', got '{current_schema}'." + ) + + if schema_name not in (search_path or ''): + raise RuntimeError( + f"Schema bootstrap guard failed: search_path '{search_path}' does not include '{schema_name}'." 
+ ) + + +def ensure_core_tables_exist(db, expected_schema): + _assert_schema_bootstrap_context(db, expected_schema) + db.execute_sql_file('create_tables.sql') + + +def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, selection, expected_schema): imported_staging_tables = staging_tables_with_rows(pub_dir) staging_tables = [t + '_staging' for t in OCA_TABLES] manifest.upsert_step('promote_staging', 'running') + ensure_core_tables_exist(db, expected_schema) db.execute_sql_file('create_tables_staging.sql') for t in staging_tables: csv_filepath = os.path.join(pub_dir, f"{t}.csv") @@ -250,7 +279,7 @@ def geocode_and_publish_addresses( 'no oca_addresses_staging rows this run' ) - create_date_files(s3, selection.selected_zip_files[-1], pub_dir) + create_date_files(selection.selected_zip_files[-1], pub_dir) public_files = ['last-updated-shield.png', 'last-updated-date.txt'] if publish_addresses: public_files.append('oca_addresses_private.csv') diff --git a/lib/sql/create_tables.sql b/lib/sql/create_tables.sql index 3bd8ebe..e31e294 100644 --- a/lib/sql/create_tables.sql +++ b/lib/sql/create_tables.sql @@ -1,8 +1,4 @@ -DROP VIEW IF EXISTS oca_addresses_with_bbl CASCADE; -DROP VIEW IF EXISTS oca_addresses_with_ct CASCADE; - -DROP TABLE IF EXISTS oca_index CASCADE; -CREATE TABLE oca_index ( +CREATE TABLE IF NOT EXISTS oca_index ( indexnumberid text PRIMARY KEY, court text, fileddate date, @@ -17,16 +13,14 @@ CREATE TABLE oca_index ( dateofjurydemand date ); -DROP TABLE IF EXISTS oca_causes CASCADE; -CREATE TABLE oca_causes ( +CREATE TABLE IF NOT EXISTS oca_causes ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, causeofactiontype text, interestfromdate date, amount numeric ); -DROP TABLE IF EXISTS oca_addresses CASCADE; -CREATE TABLE oca_addresses ( +CREATE TABLE IF NOT EXISTS oca_addresses ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, street1 text, street2 text, @@ -55,8 +49,7 @@ CREATE TABLE oca_addresses ( zip_code text ); 
-DROP TABLE IF EXISTS oca_parties CASCADE; -CREATE TABLE oca_parties ( +CREATE TABLE IF NOT EXISTS oca_parties ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, role text, partytype text, @@ -64,8 +57,7 @@ CREATE TABLE oca_parties ( undertenant text ); -DROP TABLE IF EXISTS oca_events CASCADE; -CREATE TABLE oca_events ( +CREATE TABLE IF NOT EXISTS oca_events ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, eventname text, fileddate date, @@ -74,8 +66,7 @@ CREATE TABLE oca_events ( answertype text ); -DROP TABLE IF EXISTS oca_appearances CASCADE; -CREATE TABLE oca_appearances ( +CREATE TABLE IF NOT EXISTS oca_appearances ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, appearanceid bigserial, appearancedatetime timestamp, @@ -85,16 +76,14 @@ CREATE TABLE oca_appearances ( motionsequence int ); -DROP TABLE IF EXISTS oca_appearance_outcomes CASCADE; -CREATE TABLE oca_appearance_outcomes ( +CREATE TABLE IF NOT EXISTS oca_appearance_outcomes ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, appearanceid bigint, appearanceoutcometype text, outcomebasedontype text ); -DROP TABLE IF EXISTS oca_motions CASCADE; -CREATE TABLE oca_motions ( +CREATE TABLE IF NOT EXISTS oca_motions ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, motiontype text, @@ -106,8 +95,7 @@ CREATE TABLE oca_motions ( ); -DROP TABLE IF EXISTS oca_decisions CASCADE; -CREATE TABLE oca_decisions ( +CREATE TABLE IF NOT EXISTS oca_decisions ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, resultof text, @@ -115,8 +103,7 @@ CREATE TABLE oca_decisions ( ); -DROP TABLE IF EXISTS oca_judgments CASCADE; -CREATE TABLE oca_judgments ( +CREATE TABLE IF NOT EXISTS oca_judgments ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, sequence int, amendedfromjudgmentsequence int, @@ -131,8 +118,7 @@ CREATE TABLE oca_judgments ( debtorsroles text[] ); -DROP TABLE IF EXISTS oca_warrants CASCADE; -CREATE TABLE 
oca_warrants ( +CREATE TABLE IF NOT EXISTS oca_warrants ( indexnumberid text REFERENCES oca_index ON DELETE CASCADE, judgmentsequence int, sequence text, @@ -160,8 +146,7 @@ CREATE TABLE oca_warrants ( executiondate date ); -DROP TABLE IF EXISTS oca_metadata CASCADE; -CREATE TABLE oca_metadata ( +CREATE TABLE IF NOT EXISTS oca_metadata ( -- we don't want to delete records here when deleted from others indexnumberid text PRIMARY KEY, initialdate date, @@ -170,15 +155,15 @@ CREATE TABLE oca_metadata ( ); -CREATE INDEX ON oca_causes (indexnumberid); -CREATE INDEX ON oca_addresses (indexnumberid); -CREATE INDEX ON oca_addresses (bbl); -CREATE INDEX ON oca_parties (indexnumberid); -CREATE INDEX ON oca_events (indexnumberid); -CREATE INDEX ON oca_appearances (indexnumberid); -CREATE INDEX ON oca_appearance_outcomes (indexnumberid); -CREATE INDEX ON oca_motions (indexnumberid); -CREATE INDEX ON oca_decisions (indexnumberid); -CREATE INDEX ON oca_judgments (indexnumberid); -CREATE INDEX ON oca_warrants (indexnumberid); -CREATE INDEX ON oca_metadata (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_causes_indexnumberid_idx ON oca_causes (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_addresses_indexnumberid_idx ON oca_addresses (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_addresses_bbl_idx ON oca_addresses (bbl); +CREATE INDEX IF NOT EXISTS oca_parties_indexnumberid_idx ON oca_parties (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_events_indexnumberid_idx ON oca_events (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_appearances_indexnumberid_idx ON oca_appearances (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_appearance_outcomes_indexnumberid_idx ON oca_appearance_outcomes (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_motions_indexnumberid_idx ON oca_motions (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_decisions_indexnumberid_idx ON oca_decisions (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_judgments_indexnumberid_idx ON oca_judgments (indexnumberid); 
+CREATE INDEX IF NOT EXISTS oca_warrants_indexnumberid_idx ON oca_warrants (indexnumberid); +CREATE INDEX IF NOT EXISTS oca_metadata_indexnumberid_idx ON oca_metadata (indexnumberid); diff --git a/tests/test_schema_bootstrap.py b/tests/test_schema_bootstrap.py new file mode 100644 index 0000000..7ab2ffe --- /dev/null +++ b/tests/test_schema_bootstrap.py @@ -0,0 +1,59 @@ +import os +import unittest +from unittest import mock + +from lib.etl_stages import ensure_core_tables_exist + + +class CreateTablesSqlContractTests(unittest.TestCase): + @classmethod + def setUpClass(cls): + sql_path = os.path.join( + os.path.dirname(__file__), + '..', + 'lib', + 'sql', + 'create_tables.sql', + ) + with open(sql_path, encoding='utf-8') as f: + cls.sql = f.read() + + def test_bootstrap_sql_is_non_destructive(self): + self.assertNotIn('DROP TABLE', self.sql.upper()) + self.assertNotIn('DROP VIEW', self.sql.upper()) + + def test_bootstrap_sql_uses_idempotent_create_patterns(self): + self.assertIn('CREATE TABLE IF NOT EXISTS oca_index', self.sql) + self.assertIn('CREATE TABLE IF NOT EXISTS oca_metadata', self.sql) + self.assertIn('CREATE INDEX IF NOT EXISTS oca_addresses_bbl_idx', self.sql) + + +class EnsureCoreTablesExistTests(unittest.TestCase): + def test_bootstrap_runs_when_schema_context_matches(self): + db = mock.Mock() + db.sql_fetch_one.return_value = ('oca_refactor', '"oca_refactor", public') + + ensure_core_tables_exist(db, 'oca_refactor') + + db.execute_sql_file.assert_called_once_with('create_tables.sql') + + def test_bootstrap_fails_when_expected_schema_missing(self): + db = mock.Mock() + + with self.assertRaisesRegex(RuntimeError, 'DB schema must be set'): + ensure_core_tables_exist(db, '') + + db.execute_sql_file.assert_not_called() + + def test_bootstrap_fails_when_current_schema_does_not_match_expected(self): + db = mock.Mock() + db.sql_fetch_one.return_value = ('public', 'public') + + with self.assertRaisesRegex(RuntimeError, 'expected current_schema'): + 
ensure_core_tables_exist(db, 'oca_refactor') + + db.execute_sql_file.assert_not_called() + + +if __name__ == '__main__': + unittest.main() From 285a4c1eb236e6608c448af956a2de244494b654 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 28 May 2026 10:23:15 -0400 Subject: [PATCH 13/30] remove unused prep_db --- lib/README.md | 3 --- lib/etl.py | 2 -- lib/etl_helpers.py | 19 ------------------- 3 files changed, 24 deletions(-) diff --git a/lib/README.md b/lib/README.md index 9ae6a27..15cedf6 100644 --- a/lib/README.md +++ b/lib/README.md @@ -26,9 +26,6 @@ A few basic helper functions: * `list_new_data_files` * List files that are in the SFTP but not yet in S3 -* `prep_db` - * Prepare the Postgres database (either from scratch with SQL scripts or from a `pg_dump` file) - * `promote_staging_to_main` (`etl_promotion.py`) * Move newly parsed records in the database over from staging tables to the main ones (single transaction) diff --git a/lib/etl.py b/lib/etl.py index ed7f48c..61cb310 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -20,7 +20,6 @@ csv_has_rows, download_pluto, make_dir, - prep_db, s3_key, upload_public_file, ) @@ -58,7 +57,6 @@ 'EtlRunManifest', 'manifest_step', 'csv_has_rows', - 'prep_db', 'promote_staging_to_main', 'promotion_table_counts', 'promotion_counts_checksum', diff --git a/lib/etl_helpers.py b/lib/etl_helpers.py index 6570eca..9a31266 100644 --- a/lib/etl_helpers.py +++ b/lib/etl_helpers.py @@ -45,25 +45,6 @@ def csv_has_rows(csv_filepath): return False -def prep_db(s3, db, local_dir): - """ - Create a new directory in the same folder as this file, - deleting everything in the folder if it already exists - - :param s3: S3 object - :param db: Database object - :param local_dir: Path for local directory to save database dump file - """ - if s3.list_files('oca.dump', S3_PRIVATE_FOLDER): - print('Rebuilding tables from SQL dump') - s3.download_file(f"{S3_PRIVATE_FOLDER}/oca.dump", os.path.join(local_dir, 'oca.dump')) - 
db.execute_sql_file('create_tables.sql') - db.restore_from(os.path.join(local_dir, 'oca.dump')) - else: - print('Creating tables from scratch') - db.execute_sql_file('create_tables.sql') - - def create_date_files(data_file, local_dir): """ Create a text file and a custom shield image with date the data was From 884a6329871b9cce2b4ce8b0aaa35ee614348e80 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 28 May 2026 18:10:56 -0400 Subject: [PATCH 14/30] Batch parse & staging export (#20) The refactor work thus far has only touched the process after the initial parsing into DuckDB and export to csv. This PR adds on some improvements to the parsing and duckdb -> csv export. Previously during parsing each row written to each table in the staging DuckDB was committed separately, and this changes to use a batching approach. The INSERT & DELETE statements are collected in a buffer and then written to the DuckDB in a single transaction for multiple cases (configurable). On a typical weekly update this makes the parsing stage about 80% faster. Previously we exported CSVs from the DuckDB and then did some preprocessing of them to adjust from differences in how DuckDB and Postgres represent some data types so that the COPY (via s3-to-rds) works. This changes to incorporate the preprocessing changes directly into the COPY export from DuckDB so those CSVs are ready for upload to s3/rds without any extra reading of CSVs. During the process of working out these changes I did a bunch of benchmarking and testing to make sure all the behavior wasn't changed at all. 
I've since stripped that out to keep the code more readable, but the version with benchmarking and other checks is preserved on this branch: https://github.com/housing-data-coalition/oca/tree/batch-parsing-eval --- lib/duckdb_database.py | 98 +++++++--- lib/etl_csv.py | 19 +- lib/etl_stages.py | 39 +++- lib/parse_write_buffer.py | 128 +++++++++++++ lib/parsers.py | 72 ++++---- lib/staging_csv_export.py | 136 ++++++++++++++ oca_update.py | 7 + tests/csv_checksums.py | 21 +++ tests/parse_pipeline_helpers.py | 50 ++++++ tests/parser_xml_fixtures.py | 157 ++++++++++++++++ tests/test_parser_batching.py | 191 ++++++++++++++++++++ tests/test_parser_regression_safety.py | 238 +++++++++++++++++++++++++ tests/test_staging_csv_export.py | 208 +++++++++++++++++++++ 13 files changed, 1299 insertions(+), 65 deletions(-) create mode 100644 lib/parse_write_buffer.py create mode 100644 lib/staging_csv_export.py create mode 100644 tests/csv_checksums.py create mode 100644 tests/parse_pipeline_helpers.py create mode 100644 tests/parser_xml_fixtures.py create mode 100644 tests/test_parser_batching.py create mode 100644 tests/test_parser_regression_safety.py create mode 100644 tests/test_staging_csv_export.py diff --git a/lib/duckdb_database.py b/lib/duckdb_database.py index fdc5cbc..350d218 100644 --- a/lib/duckdb_database.py +++ b/lib/duckdb_database.py @@ -1,24 +1,54 @@ import duckdb import os import threading +from contextlib import contextmanager + +from .staging_csv_export import build_staging_copy_sql + +STAGING_TABLE_FAMILIES = ( + 'oca_index_staging', + 'oca_causes_staging', + 'oca_addresses_staging', + 'oca_parties_staging', + 'oca_events_staging', + 'oca_appearances_staging', + 'oca_motions_staging', + 'oca_decisions_staging', + 'oca_judgments_staging', + 'oca_warrants_staging', + 'oca_metadata_staging', +) + + +def fetch_staging_row_counts(db) -> dict[str, int]: + """Return row counts for known staging tables (missing tables -> 0).""" + counts = {} + with db._lock: + for 
table_name in STAGING_TABLE_FAMILIES: + try: + row = db.conn.execute(f'SELECT COUNT(*) FROM {table_name}').fetchone() + counts[table_name] = int(row[0]) if row else 0 + except Exception: + counts[table_name] = 0 + return counts + class DuckDB: - """DuckDB database helper with methods for + """DuckDB database helper with methods for exporting to csv, and running sql files and commands with thread safety""" - + def __init__(self, dbname): self.dbname = dbname self.conn = duckdb.connect(dbname) self._lock = threading.Lock() - + def execute_sql_file(self, sql_file_path): """Execute SQL commands from a file""" with open(sql_file_path, 'r') as f: sql_content = f.read() - - # Split by semicolon and execute each statement + statements = [stmt.strip() for stmt in sql_content.split(';') if stmt.strip()] - + with self._lock: for statement in statements: try: @@ -27,38 +57,56 @@ def execute_sql_file(self, sql_file_path): print(f"Error executing statement: {statement[:100]}...") print(f"Error: {e}") raise - + def execute(self, sql, params=None): """Execute a single SQL statement""" with self._lock: - if params: - return self.conn.execute(sql, params) - return self.conn.execute(sql) - + return self._execute_unlocked(sql, params) + + def _execute_unlocked(self, sql, params=None): + if params: + return self.conn.execute(sql, params) + return self.conn.execute(sql) + def executemany(self, sql, params_list): """Execute SQL with multiple parameter sets""" with self._lock: - return self.conn.executemany(sql, params_list) - + return self._executemany_unlocked(sql, params_list) + + def _executemany_unlocked(self, sql, params_list): + return self.conn.executemany(sql, params_list) + + @contextmanager + def transaction(self): + """Run a block in one DuckDB transaction (caller should not nest locks).""" + with self._lock: + self.conn.execute('BEGIN TRANSACTION') + try: + yield self + self.conn.execute('COMMIT') + except Exception: + self.conn.execute('ROLLBACK') + raise + def close(self): 
- if self.conn: self.conn.close() - + if self.conn: + self.conn.close() + def export_tables_to_csv(self, output_dir): """Export all tables to CSV files""" os.makedirs(output_dir, exist_ok=True) - + with self._lock: - # Get list of all tables tables = self.conn.execute("SHOW TABLES").fetchall() - + for table_row in tables: table_name = table_row[0] csv_path = os.path.join(output_dir, f"{table_name}.csv") - - # Export to CSV - self.conn.execute(f"COPY {table_name} TO '{csv_path}' (HEADER, DELIMITER ',')") - print(f"Exported {table_name} to {csv_path}") - # TODO: before exporting covert arrays to the postgres format, but ignore json objects - # Transform arrays: [1,2,3] -> {1,2,3} - # Ignore JSON objects: {[key: value]} -> [{key: value}] \ No newline at end of file + describe_rows = self.conn.execute( + f'DESCRIBE {table_name}' + ).fetchall() + columns = [(row[0], row[1]) for row in describe_rows] + copy_sql = build_staging_copy_sql(table_name, csv_path, columns) + self.conn.execute(copy_sql) + print(f"Exported {table_name} to {csv_path}") diff --git a/lib/etl_csv.py b/lib/etl_csv.py index 1a4a3dc..2dea8c1 100644 --- a/lib/etl_csv.py +++ b/lib/etl_csv.py @@ -3,6 +3,8 @@ import csv import os +from .staging_csv_export import staging_csv_needs_preprocess + _APPEARANCES_PREFIX = 'oca_appearances_staging' _JUDGMENTS_PREFIX = 'oca_judgments_staging' _WARRANTS_PREFIX = 'oca_warrants_staging' @@ -70,10 +72,11 @@ def preprocess_csv_file(file_path, chunk_size=1000): """ filename = os.path.basename(file_path) if not filename.endswith('.csv'): - return + return 0 drop_columns, int_columns = _file_preprocess_rules(filename) tmp_path = f'{file_path}.tmp' + rows_touched = 0 with open(file_path, newline='', encoding='utf-8') as infile, open( tmp_path, 'w', newline='', encoding='utf-8' @@ -81,7 +84,7 @@ def preprocess_csv_file(file_path, chunk_size=1000): reader = csv.DictReader(infile) if not reader.fieldnames: os.remove(tmp_path) - return + return 0 fieldnames = [name for name in 
reader.fieldnames if name not in drop_columns] writer = csv.DictWriter(outfile, fieldnames=fieldnames, lineterminator='\n') @@ -90,6 +93,7 @@ def preprocess_csv_file(file_path, chunk_size=1000): batch = [] for row in reader: batch.append(_preprocess_row(filename, fieldnames, int_columns, row)) + rows_touched += 1 if len(batch) >= chunk_size: writer.writerows(batch) batch.clear() @@ -97,9 +101,16 @@ def preprocess_csv_file(file_path, chunk_size=1000): writer.writerows(batch) os.replace(tmp_path, file_path) + return rows_touched def preprocess_staging_csv_dir(target_dir, chunk_size=1000): for filename in sorted(os.listdir(target_dir)): - if filename.endswith('.csv'): - preprocess_csv_file(os.path.join(target_dir, filename), chunk_size=chunk_size) + if not filename.endswith('.csv'): + continue + if not staging_csv_needs_preprocess(filename): + continue + preprocess_csv_file( + os.path.join(target_dir, filename), + chunk_size=chunk_size, + ) diff --git a/lib/etl_stages.py b/lib/etl_stages.py index e9fd99a..d3d44a7 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -121,7 +121,7 @@ def download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection): manifest.upsert_step('download_files', 'completed') -def parse_xml_to_staging(manifest, staging_db, priv_dir): +def parse_xml_to_staging(manifest, staging_db, priv_dir, parse_num_threads=8): def sort_by_date(file): r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.', ' ') return r @@ -144,7 +144,12 @@ def sort_by_date(file): extract_date = elem.text break with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - parse_file(xml_file, staging_db, extract_date) + parse_file( + xml_file, + staging_db, + extract_date, + num_threads=parse_num_threads, + ) manifest.upsert_file( file_name, source='local', status='parsed', stage='parse', details={'extract_date': extract_date} @@ -152,17 +157,43 @@ def sort_by_date(file): manifest.upsert_step('parse_xml', 'completed') -def 
preprocess_and_upload_staging_csvs( - staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=1000 +def export_staging_to_csv( + staging_db, + pub_dir, + *, + csv_preprocess_chunk_size=1000, + upload=True, + mode=None, + s3_args=None, + s3_prefix=None, ): + """Export DuckDB staging tables to CSV and optionally preprocess + upload.""" staging_db.export_tables_to_csv(output_dir=pub_dir) preprocess_staging_csv_dir(pub_dir, chunk_size=csv_preprocess_chunk_size) + + if not upload: + return + public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) +def preprocess_and_upload_staging_csvs( + staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=1000 +): + export_staging_to_csv( + staging_db, + pub_dir, + csv_preprocess_chunk_size=csv_preprocess_chunk_size, + upload=True, + mode=mode, + s3_args=s3_args, + s3_prefix=s3_prefix, + ) + + def _assert_schema_bootstrap_context(db, expected_schema): schema_name = (expected_schema or '').strip() if not schema_name: diff --git a/lib/parse_write_buffer.py b/lib/parse_write_buffer.py new file mode 100644 index 0000000..86a6cf3 --- /dev/null +++ b/lib/parse_write_buffer.py @@ -0,0 +1,128 @@ +"""Buffered DuckDB staging writes with explicit transaction windows for parse hot paths.""" + +from __future__ import annotations + +import os +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any + +if TYPE_CHECKING: + from .duckdb_database import DuckDB + + +def _env_bool(name: str, default: bool) -> bool: + raw = os.environ.get(name) + if raw is None or raw.strip() == '': + return default + return raw.strip().lower() in ('1', 'true', 'yes', 'y', 'on') + + +def _env_int(name: str, default: int) -> int: + raw = os.environ.get(name) + if raw is None or 
raw.strip() == '': + return default + return max(1, int(raw)) + + +@dataclass(frozen=True) +class ParseWriteConfig: + """Runtime knobs for parser-to-DuckDB batching (safe production defaults).""" + + enabled: bool = True + batch_size: int = 128 + flush_every_n_cases: int = 16 + + @classmethod + def from_env(cls) -> ParseWriteConfig: + return cls( + enabled=_env_bool('PARSE_WRITE_BATCH_ENABLED', True), + batch_size=_env_int('PARSE_WRITE_BATCH_SIZE', 128), + flush_every_n_cases=_env_int('PARSE_WRITE_FLUSH_EVERY_N_CASES', 16), + ) + + @classmethod + def legacy(cls) -> ParseWriteConfig: + """Per-row flush semantics (batching disabled).""" + return cls(enabled=False, batch_size=1, flush_every_n_cases=1) + + +class StagingWriteBuffer: + """ + Buffers DELETE + INSERT statements and flushes in transaction windows. + + Flush order preserves per-case child replacement: all queued DELETEs run + before any queued INSERTs in the same transaction. + """ + + def __init__(self, db: DuckDB, config: ParseWriteConfig): + self.db = db + self.config = config + self._deletes: list[tuple[str, tuple | None]] = [] + self._inserts: dict[str, list[tuple | None]] = {} + self._cases_in_window = 0 + self._flush_count = 0 + + def _pending_insert_count(self) -> int: + return sum(len(rows) for rows in self._inserts.values()) + + def queue_delete(self, sql: str, params: tuple | None) -> None: + self._deletes.append((sql, params)) + + def queue_insert(self, sql: str, params: tuple | None) -> None: + self._inserts.setdefault(sql, []).append(params) + if self._pending_insert_count() >= self.config.batch_size: + self.flush(reason='batch_size') + + def on_case_complete(self) -> None: + self._cases_in_window += 1 + if self._cases_in_window >= self.config.flush_every_n_cases: + self.flush(reason='case_cadence') + + def flush(self, reason: str = 'explicit') -> None: + if not self._deletes and not self._inserts: + self._cases_in_window = 0 + return + + with self.db.transaction(): + for sql, params in 
self._deletes: + self.db._execute_unlocked(sql, params) + for sql, params_list in self._inserts.items(): + if not params_list: + continue + if len(params_list) == 1: + self.db._execute_unlocked(sql, params_list[0]) + else: + self.db._executemany_unlocked(sql, params_list) + + self._flush_count += 1 + self._deletes.clear() + self._inserts.clear() + self._cases_in_window = 0 + + +def attach_write_buffer(db: DuckDB) -> StagingWriteBuffer | None: + config = ParseWriteConfig.from_env() + if not config.enabled: + return None + buffer = StagingWriteBuffer(db, config) + db.write_buffer = buffer + return buffer + + +def staging_execute(db: DuckDB, sql: str, params: tuple | None = None) -> Any: + """Route a staging write through the optional per-connection write buffer.""" + buffer = getattr(db, 'write_buffer', None) + if buffer is None: + return db.execute(sql, params) + sql_upper = sql.lstrip().upper() + if sql_upper.startswith('DELETE'): + buffer.queue_delete(sql, params) + return None + buffer.queue_insert(sql, params) + return None + + +def flush_write_buffer(db: DuckDB, reason: str = 'shutdown') -> None: + buffer = getattr(db, 'write_buffer', None) + if buffer is not None: + buffer.flush(reason=reason) diff --git a/lib/parsers.py b/lib/parsers.py index 8b2c0f6..0234a9c 100644 --- a/lib/parsers.py +++ b/lib/parsers.py @@ -3,6 +3,8 @@ import threading import queue +from .parse_write_buffer import attach_write_buffer, flush_write_buffer, staging_execute + NAMESPACE = '{http://www.example.org/LandlordTenantExtractSchema}' def oca_tag(tag): @@ -107,7 +109,7 @@ def parse_index(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT OR REPLACE INTO oca_index_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_causes(case, db): @@ -120,7 +122,7 @@ def parse_causes(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # Eelete existing records for this case to handle multiple causes - db.execute("DELETE FROM oca_causes_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_causes_staging WHERE indexnumberid = ?", (IndexNumberId,)) causes = case.find(oca_tag('PrimaryClaimCauseOfActions')) @@ -138,7 +140,7 @@ def parse_causes(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_causes_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_addresses(case, db): @@ -151,7 +153,7 @@ def parse_addresses(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple addresses - db.execute("DELETE FROM oca_addresses_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_addresses_staging WHERE indexnumberid = ?", (IndexNumberId,)) addresses = case.find(oca_tag('PropertyAddresses')) @@ -171,7 +173,7 @@ def parse_addresses(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_addresses_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_parties(case, db): @@ -184,7 +186,7 @@ def parse_parties(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple parties - db.execute("DELETE FROM oca_parties_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_parties_staging WHERE indexnumberid = ?", (IndexNumberId,)) parties = case.find(oca_tag('Parties')) @@ -203,7 +205,7 @@ def parse_parties(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_parties_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_events(case, db): @@ -216,7 +218,7 @@ def parse_events(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple events - db.execute("DELETE FROM oca_events_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_events_staging WHERE indexnumberid = ?", (IndexNumberId,)) events = case.find(oca_tag('Events')) @@ -236,7 +238,7 @@ def parse_events(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_events_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def appearance_outcome_to_dict(elem): @@ -267,7 +269,7 @@ def parse_appearances(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple appearances - db.execute("DELETE FROM oca_appearances_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_appearances_staging WHERE indexnumberid = ?", (IndexNumberId,)) appearances = case.find(oca_tag('Appearances')) @@ -297,7 +299,7 @@ def parse_appearances(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_appearances_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_motions(case, db): @@ -310,7 +312,7 @@ def parse_motions(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple motions - db.execute("DELETE FROM oca_motions_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_motions_staging WHERE indexnumberid = ?", (IndexNumberId,)) motions = case.find(oca_tag('Motions')) @@ -332,7 +334,7 @@ def parse_motions(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_motions_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_decisions(case, db): @@ -349,7 +351,7 @@ def parse_decisions(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple decisions - db.execute("DELETE FROM oca_decisions_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_decisions_staging WHERE indexnumberid = ?", (IndexNumberId,)) decisions = case.find(oca_tag('Decisions')) @@ -367,7 +369,7 @@ def parse_decisions(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_decisions_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_judgments(case, db): @@ -381,7 +383,7 @@ def parse_judgments(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple judgments - db.execute("DELETE FROM oca_judgments_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_judgments_staging WHERE indexnumberid = ?", (IndexNumberId,)) judgments = case.find(oca_tag('Judgments')) @@ -407,7 +409,7 @@ def parse_judgments(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT INTO oca_judgments_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_warrants(case, db): @@ -421,7 +423,7 @@ def parse_warrants(case, db): IndexNumberId = case.find(INDEX_NUMBER_ID_TAG).text # First, delete existing records for this case to handle multiple warrants - db.execute("DELETE FROM oca_warrants_staging WHERE indexnumberid = ?", (IndexNumberId,)) + staging_execute(db, "DELETE FROM oca_warrants_staging WHERE indexnumberid = ?", (IndexNumberId,)) judgments = case.find(oca_tag('Judgments')) @@ -469,7 +471,7 @@ def parse_warrants(case, db): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' for _ in columns]) insert_sql = f"INSERT INTO oca_warrants_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def update_metadata(case, db, extract_date): @@ -496,7 +498,7 @@ def update_metadata(case, db, extract_date): values = tuple(row.get(col) for col in columns) placeholders = ', '.join(['?' 
for _ in columns]) insert_sql = f"INSERT OR REPLACE INTO oca_metadata_staging ({', '.join(columns)}) VALUES ({placeholders})" - db.execute(insert_sql, values) + staging_execute(db, insert_sql, values) def parse_case(case, db, extract_date): @@ -511,6 +513,9 @@ def parse_case(case, db, extract_date): # If this case is flagged for removal, skip the parsing steps if is_case_to_delete(case): + buffer = getattr(db, 'write_buffer', None) + if buffer is not None: + buffer.on_case_complete() return parse_index(case, db) @@ -524,6 +529,10 @@ def parse_case(case, db, extract_date): parse_judgments(case, db) parse_warrants(case, db) + buffer = getattr(db, 'write_buffer', None) + if buffer is not None: + buffer.on_case_complete() + def _worker_thread(case_queue, db_queue, extract_date, thread_id): """Worker thread that processes cases from the queue""" @@ -532,13 +541,14 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id): case = case_queue.get(timeout=1) if case is None: # Sentinel value to stop thread break - + # Each thread needs its own database connection thread_db = db_queue.get() try: parse_case(case, thread_db, extract_date) except Exception as e: print(f"Thread {thread_id}: Error parsing case: {e}") + flush_write_buffer(thread_db, reason='parse_error') finally: # Clear the case copy from memory case.clear() @@ -553,21 +563,20 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id): def parse_file(xml_file, staging_db, extract_date, num_threads=8): """ Parse XML file with multiple threads - + :param xml_file: file-like object or path to XML file :param staging_db: DuckDB database object :param extract_date: date of extract :param num_threads: number of worker threads (increasing this doesn't speed up much, bottleneck is the database writes) """ from .duckdb_database import DuckDB - - # Create queues + case_queue = queue.Queue(maxsize=num_threads * 10) db_queue = queue.Queue() - - # Create database connections for each thread + for _ in 
range(num_threads): thread_db = DuckDB(staging_db.dbname) + attach_write_buffer(thread_db) db_queue.put(thread_db) # Start worker threads @@ -586,9 +595,7 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): total_cases = 0 for _, case in frogress.bar(context): - # Make a deep copy since we'll be clearing the original case_copy = etree.fromstring(etree.tostring(case)) - case_queue.put(case_copy) total_cases += 1 @@ -597,17 +604,18 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): while case.getprevious() is not None: del case.getparent()[0] - # Signal threads to stop + # Signal threads to stop (workers flush remaining buffer on sentinel) for _ in range(num_threads): case_queue.put(None) - + # Wait for all threads to complete for t in threads: t.join() - - # Close thread database connections + + # Close thread database connections (final flush for any stragglers) while not db_queue.empty(): thread_db = db_queue.get() + flush_write_buffer(thread_db) thread_db.close() print(f"Processed {total_cases} cases with {num_threads} threads") diff --git a/lib/staging_csv_export.py b/lib/staging_csv_export.py new file mode 100644 index 0000000..bc1fe68 --- /dev/null +++ b/lib/staging_csv_export.py @@ -0,0 +1,136 @@ +"""DuckDB COPY export shaping for RDS-compatible staging CSVs.""" + +from __future__ import annotations + +from dataclasses import dataclass, field + +# Match Task 1 raw DuckDB COPY + Python preprocess CSV shape (unquoted empty fields). 
+DUCKDB_CSV_COPY_OPTIONS = "HEADER, DELIMITER ','" + +_EMPTY_INT_MARKERS_SQL = "('', 'nan', 'NaN', 'None', '')" + + +@dataclass(frozen=True) +class StagingExportSpec: + """Per-table transforms applied during DuckDB export (replaces CSV preprocess).""" + + drop_columns: frozenset[str] = field(default_factory=frozenset) + array_columns: frozenset[str] = field(default_factory=frozenset) + int_columns: frozenset[str] = field(default_factory=frozenset) + + +def postgres_array_brackets_sql(column_expr: str) -> str: + """ + SQL expression matching ``replace_postgres_array_brackets`` in etl_csv.py. + + Converts DuckDB list literals ``[a,b]`` to PostgreSQL ``{a,b}`` while leaving + JSON object arrays (inner ``{...}``) unchanged. + """ + text = f"trim(cast({column_expr} AS VARCHAR))" + inner = f"trim(substr({text}, 2, length({text}) - 2))" + return ( + f"CASE WHEN {column_expr} IS NULL THEN NULL " + f"WHEN NOT (starts_with({text}, '[') AND ends_with({text}, ']')) " + f"THEN cast({column_expr} AS VARCHAR) " + f"WHEN starts_with({inner}, '{{') AND ends_with({inner}, '}}') " + f"THEN cast({column_expr} AS VARCHAR) " + f"ELSE '{{' || substr({text}, 2, length({text}) - 2) || '}}' END" + ) + + +def nullable_int_csv_sql(column_expr: str) -> str: + """ + SQL expression matching nullable integer CSV normalization in etl_csv.py. + + Python writes an empty CSV field (not ``""``); DuckDB COPY does the same when + the cell is NULL rather than an empty string literal. 
+ """ + as_text = f"cast({column_expr} AS VARCHAR)" + return ( + f"CASE WHEN {column_expr} IS NULL THEN NULL " + f"WHEN {as_text} IN {_EMPTY_INT_MARKERS_SQL} THEN NULL " + f"ELSE {as_text} END" + ) + + +def _export_column_sql(column_name: str, column_type: str, spec: StagingExportSpec) -> str | None: + if column_name in spec.drop_columns: + return None + if column_name in spec.array_columns: + return f"{postgres_array_brackets_sql(column_name)} AS {column_name}" + if column_name in spec.int_columns: + return f"{nullable_int_csv_sql(column_name)} AS {column_name}" + if column_type.upper() in ('JSON',): + return f"cast({column_name} AS VARCHAR) AS {column_name}" + return column_name + + +STAGING_TABLE_EXPORT_SPECS: dict[str, StagingExportSpec] = { + 'oca_index_staging': StagingExportSpec( + array_columns=frozenset({'specialtydesignationtypes'}), + ), + 'oca_events_staging': StagingExportSpec( + array_columns=frozenset({'filingpartiesroles'}), + ), + 'oca_motions_staging': StagingExportSpec( + array_columns=frozenset({'filingpartiesroles'}), + ), + 'oca_judgments_staging': StagingExportSpec( + array_columns=frozenset({'creditorsroles', 'debtorsroles'}), + int_columns=frozenset({'amendedfromjudgmentsequence'}), + ), + 'oca_warrants_staging': StagingExportSpec( + array_columns=frozenset({ + 'propertiesonwarrantcities', + 'propertiesonwarrantstates', + 'propertiesonwarrantpostalcodes', + }), + int_columns=frozenset({'executionstayeddays', 'issuancestayeddays'}), + ), + 'oca_appearances_staging': StagingExportSpec( + drop_columns=frozenset({'appearanceid'}), + int_columns=frozenset({'motionsequence'}), + ), +} + +# Staging tables with no export-time transforms (no second-pass CSV rewrite). 
+STAGING_TABLES_PASSTHROUGH_EXPORT = frozenset({ + 'oca_addresses_staging', + 'oca_causes_staging', + 'oca_decisions_staging', + 'oca_metadata_staging', + 'oca_parties_staging', + 'oca_appearance_outcomes_staging', +}) + + +def staging_csv_needs_preprocess(filename: str) -> bool: + """Return True when a staging CSV still requires the Python preprocess pass.""" + if not filename.endswith('.csv'): + return False + table_name = filename[:-4] + if table_name in STAGING_TABLE_EXPORT_SPECS: + return False + if table_name in STAGING_TABLES_PASSTHROUGH_EXPORT: + return False + return True + + +def build_staging_copy_sql(table_name: str, csv_path: str, columns: list[tuple[str, str]]) -> str: + """ + Build COPY SQL for a staging table. + + ``columns`` is a list of (name, type) from DESCRIBE. + """ + spec = STAGING_TABLE_EXPORT_SPECS.get(table_name) + options = DUCKDB_CSV_COPY_OPTIONS + if spec is None: + return f"COPY {table_name} TO '{csv_path}' ({options})" + + select_cols = [] + for name, col_type in columns: + expr = _export_column_sql(name, col_type, spec) + if expr is not None: + select_cols.append(expr) + select_sql = ', '.join(select_cols) + return f"COPY (SELECT {select_sql} FROM {table_name}) TO '{csv_path}' ({options})" diff --git a/oca_update.py b/oca_update.py index 3628094..16966eb 100644 --- a/oca_update.py +++ b/oca_update.py @@ -28,6 +28,9 @@ def parse_args(): parser.add_argument('--geocode-workers', type=int, default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), help='Worker process count for geocode pool') parser.add_argument('--census-batch-chunk-size', type=int, default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), help='Chunk size for census batch geocoder input') parser.add_argument('--csv-row-check-chunk-size', type=int, default=int(os.environ.get('CSV_ROW_CHECK_CHUNK_SIZE', '1000')), help='Chunk size used for constant-memory CSV non-empty checks') + parser.add_argument('--parse-write-batch-enabled', action='store_true', 
default=parse_bool(os.environ.get('PARSE_WRITE_BATCH_ENABLED', '1')), help='Buffer parser DuckDB writes and flush in transaction windows') + parser.add_argument('--parse-write-batch-size', type=int, default=int(os.environ.get('PARSE_WRITE_BATCH_SIZE', '128')), help='Max buffered INSERT statements before flush') + parser.add_argument('--parse-write-flush-every-n-cases', type=int, default=int(os.environ.get('PARSE_WRITE_FLUSH_EVERY_N_CASES', '16')), help='Flush buffered writes after this many cases per worker') return parser.parse_args() def main(): @@ -57,6 +60,10 @@ def main(): 'db_url': os.environ.get('CLONED_DATABASE_URL', '') } + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '1' if args.parse_write_batch_enabled else '0' + os.environ['PARSE_WRITE_BATCH_SIZE'] = str(args.parse_write_batch_size) + os.environ['PARSE_WRITE_FLUSH_EVERY_N_CASES'] = str(args.parse_write_flush_every_n_cases) + runtime_args = { 'db_schema': args.db_schema, 's3_prefix': args.s3_prefix, diff --git a/tests/csv_checksums.py b/tests/csv_checksums.py new file mode 100644 index 0000000..debbc80 --- /dev/null +++ b/tests/csv_checksums.py @@ -0,0 +1,21 @@ +"""CSV checksum helpers for parse/export regression tests.""" + +from __future__ import annotations + +import hashlib +import os + + +def md5_dir_csvs(pub_dir: str) -> dict[str, str]: + """MD5 hex digest per CSV in pub_dir (stable parity fingerprint).""" + digests = {} + for name in sorted(os.listdir(pub_dir)): + if not name.endswith('.csv'): + continue + path = os.path.join(pub_dir, name) + h = hashlib.md5() + with open(path, 'rb') as f: + for chunk in iter(lambda: f.read(1 << 20), b''): + h.update(chunk) + digests[name] = h.hexdigest() + return digests diff --git a/tests/parse_pipeline_helpers.py b/tests/parse_pipeline_helpers.py new file mode 100644 index 0000000..1a463ec --- /dev/null +++ b/tests/parse_pipeline_helpers.py @@ -0,0 +1,50 @@ +"""Test helpers for parse -> DuckDB -> export without the evaluation harness.""" + +from __future__ import 
annotations + +import os + +from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.etl_stages import export_staging_to_csv, parse_xml_to_staging + +from csv_checksums import md5_dir_csvs + + +class _NoopManifest: + def upsert_step(self, *args, **kwargs): + pass + + def upsert_file(self, *args, **kwargs): + pass + + +def run_parse_export_in_dir( + priv_dir: str, + *, + parse_num_threads: int = 1, +) -> tuple[dict[str, int], dict[str, str]]: + """ + Run parse -> export -> preprocess on zips in priv_dir (no upload). + + Returns (staging_row_counts, csv_checksums). + """ + staging_path = os.path.join(priv_dir, 'staging.duckdb') + pub_dir = os.path.join(priv_dir, 'public') + os.makedirs(pub_dir, exist_ok=True) + + if os.path.exists(staging_path): + os.remove(staging_path) + + staging_db = DuckDB(staging_path) + try: + parse_xml_to_staging( + _NoopManifest(), + staging_db, + priv_dir, + parse_num_threads=parse_num_threads, + ) + row_counts = fetch_staging_row_counts(staging_db) + export_staging_to_csv(staging_db, pub_dir, upload=False) + return row_counts, md5_dir_csvs(pub_dir) + finally: + staging_db.close() diff --git a/tests/parser_xml_fixtures.py b/tests/parser_xml_fixtures.py new file mode 100644 index 0000000..686011d --- /dev/null +++ b/tests/parser_xml_fixtures.py @@ -0,0 +1,157 @@ +"""Synthetic OCA XML fixtures for parser and export tests.""" + +from __future__ import annotations + +import zipfile +from typing import Literal + +from lib.etl_constants import DATA_FILENAME + +NS = 'http://www.example.org/LandlordTenantExtractSchema' + + +def _tag(local: str) -> str: + return f'{{{NS}}}{local}' + + +def _el(parent, local: str, text: str | None = None): + elem = parent.makeelement(_tag(local)) + if text is not None: + elem.text = text + parent.append(elem) + return elem + + +def build_case_xml( + index_id: str, + *, + with_delete: bool = False, + num_parties: int = 2, + num_events: int = 2, + num_appearances: int = 1, + num_judgments: int = 1, + 
num_warrants_per_judgment: int = 1, +) -> str: + """Return one Index element as an XML string.""" + from lxml import etree + + case = etree.Element(_tag('Index')) + _el(case, 'IndexNumberId', index_id) + _el(case, 'Court', 'Housing Part') + _el(case, 'FiledDate', '2024-01-15') + _el(case, 'PropertyType', 'Residential') + _el(case, 'Classification', 'Nonpayment') + _el(case, 'Status', 'Active') + _el(case, 'FirstPaper', 'Petition by Attorney') + + causes = etree.SubElement(case, _tag('PrimaryClaimCauseOfActions')) + cause = etree.SubElement(causes, _tag('PrimaryClaimCauseOfAction')) + _el(cause, 'CauseOfActionType', 'Rent Arrears') + _el(cause, 'Amount', '5000.00') + + addresses = etree.SubElement(case, _tag('PropertyAddresses')) + address = etree.SubElement(addresses, _tag('PropertyAddress')) + _el(address, 'Street1', '123 Main St') + _el(address, 'City', 'New York') + _el(address, 'State', 'NY') + _el(address, 'PostalCode', '10001') + + parties_parent = etree.SubElement(case, _tag('Parties')) + for i in range(num_parties): + party = etree.SubElement(parties_parent, _tag('Party')) + _el(party, 'Role', 'Petitioner' if i == 0 else 'Respondent') + _el(party, 'PartyType', 'Individual') + + events_parent = etree.SubElement(case, _tag('Events')) + for i in range(num_events): + event = etree.SubElement(events_parent, _tag('Event')) + _el(event, 'EventName', f'Event {i}') + _el(event, 'FiledDate', '2024-02-01') + + appearances_parent = etree.SubElement(case, _tag('Appearances')) + for i in range(num_appearances): + appearance = etree.SubElement(appearances_parent, _tag('Appearance')) + _el(appearance, 'AppearanceDateTime', '2024-02-15T10:00:00') + _el(appearance, 'AppearancePurpose', 'Conference') + outcomes = etree.SubElement(appearance, _tag('AppearanceOutcomes')) + outcome = etree.SubElement(outcomes, _tag('AppearanceOutcome')) + _el(outcome, 'AppearanceOutcomeType', 'Adjourned') + + motions_parent = etree.SubElement(case, _tag('Motions')) + motion = 
etree.SubElement(motions_parent, _tag('Motion')) + _el(motion, 'Sequence', '1') + _el(motion, 'MotionType', 'Default') + + decisions_parent = etree.SubElement(case, _tag('Decisions')) + decision = etree.SubElement(decisions_parent, _tag('Decision')) + _el(decision, 'Sequence', '1') + _el(decision, 'ResultOf', 'Motion') + + judgments_parent = etree.SubElement(case, _tag('Judgments')) + for j in range(num_judgments): + judgment = etree.SubElement(judgments_parent, _tag('Judgment')) + seq = str(j + 1) + _el(judgment, 'Sequence', seq) + _el(judgment, 'JudgmentType', 'Money') + _el(judgment, 'FiledDate', '2024-03-01') + warrants_parent = etree.SubElement(judgment, _tag('Warrants')) + for w in range(num_warrants_per_judgment): + warrant = etree.SubElement(warrants_parent, _tag('Warrant')) + _el(warrant, 'Sequence', str(w + 1)) + _el(warrant, 'CreatedReason', 'Nonpayment') + + if with_delete: + etree.SubElement(case, _tag('Delete')) + + return etree.tostring(case, encoding='unicode') + + +def build_extract_xml( + case_count: int, + *, + extract_date: str = '2024-03-08', + delete_every: int | None = None, + child_profile: Literal['weekly', 'heavy'] = 'weekly', +) -> bytes: + """Build a full LandlordTenantExtract XML document.""" + if child_profile == 'weekly': + parties, events, appearances = 2, 2, 1 + judgments, warrants = 1, 1 + else: + parties, events, appearances = 5, 5, 3 + judgments, warrants = 2, 2 + + parts = [ + '', + f'', + f'{extract_date}', + ] + for i in range(case_count): + index_id = f'LT-BENCH-{i:06d}' + with_delete = delete_every is not None and delete_every > 0 and i % delete_every == 0 + parts.append( + build_case_xml( + index_id, + with_delete=with_delete, + num_parties=parties, + num_events=events, + num_appearances=appearances, + num_judgments=judgments, + num_warrants_per_judgment=warrants, + ) + ) + parts.append('') + return ''.join(parts).encode('utf-8') + + +def write_test_zip( + zip_path: str, + case_count: int, + *, + child_profile: 
Literal['weekly', 'heavy'] = 'weekly', +) -> str: + """Write a zip containing LandlordTenantExtract.xml; return zip path.""" + xml_bytes = build_extract_xml(case_count, child_profile=child_profile) + with zipfile.ZipFile(zip_path, 'w', compression=zipfile.ZIP_DEFLATED) as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + return zip_path diff --git a/tests/test_parser_batching.py b/tests/test_parser_batching.py new file mode 100644 index 0000000..cd7e7f1 --- /dev/null +++ b/tests/test_parser_batching.py @@ -0,0 +1,191 @@ +import io +import os +import tempfile +import unittest +import zipfile + +from lxml import etree + +from parser_xml_fixtures import build_case_xml, build_extract_xml, write_test_zip +from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.etl_constants import DATA_FILENAME +from lib.parse_write_buffer import ParseWriteConfig, attach_write_buffer, flush_write_buffer +from lib.parsers import parse_file, parse_case + + +def _init_staging_db(path: str) -> DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(db) + return db + + +def _case_from_xml(case_xml: str): + return etree.fromstring(case_xml.encode('utf-8')) + + +class ParserBatchingSemanticsTests(unittest.TestCase): + def test_repeated_case_update_replaces_child_rows(self): + case_id = 'LT-REPEAT-001' + first = build_case_xml(case_id, num_parties=2, num_events=1) + second = build_case_xml(case_id, num_parties=4, num_events=3) + + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + parse_case(_case_from_xml(first), db, '2024-03-01') + flush_write_buffer(db) + parse_case(_case_from_xml(second), db, '2024-03-02') + flush_write_buffer(db) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(counts['oca_index_staging'], 1) + self.assertEqual(counts['oca_parties_staging'], 4) + 
self.assertEqual(counts['oca_events_staging'], 3) + + def test_delete_short_circuit_keeps_metadata_only(self): + case_id = 'LT-DELETE-001' + live = build_case_xml(case_id, num_parties=2) + deleted = build_case_xml(case_id, with_delete=True, num_parties=2) + + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + parse_case(_case_from_xml(live), db, '2024-03-01') + flush_write_buffer(db) + parse_case(_case_from_xml(deleted), db, '2024-03-02') + flush_write_buffer(db) + meta = db.execute( + 'SELECT updatedate, deletedate FROM oca_metadata_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone() + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertIsNone(meta[0]) + self.assertEqual(str(meta[1]), '2024-03-02') + self.assertEqual(counts['oca_index_staging'], 1) + self.assertEqual(counts['oca_parties_staging'], 2) + + def test_batched_path_matches_legacy_row_counts(self): + xml_bytes = build_extract_xml(25, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + legacy_path = os.path.join(tmp, 'legacy.duckdb') + batched_path = os.path.join(tmp, 'batched.duckdb') + + legacy_db = DuckDB(legacy_path) + legacy_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + batched_db = DuckDB(batched_path) + batched_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(batched_db) + + xml_io = io.BytesIO(xml_bytes) + try: + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + parse_file(xml_io, legacy_db, '2024-03-08', num_threads=1) + xml_io.seek(0) + parse_file(xml_io, batched_db, '2024-03-08', num_threads=1) + flush_write_buffer(batched_db) + legacy_counts = fetch_staging_row_counts(legacy_db) + batched_counts = fetch_staging_row_counts(batched_db) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + legacy_db.close() + batched_db.close() + + self.assertEqual(legacy_counts, batched_counts) + + def 
test_parse_file_end_to_end_zip(self): + with tempfile.TemporaryDirectory() as tmp: + zip_path = os.path.join(tmp, 'incr.zip') + with zipfile.ZipFile(zip_path, 'w') as zf: + zf.writestr(DATA_FILENAME, build_extract_xml(10, child_profile='weekly')) + + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + with zipfile.ZipFile(zip_path, 'r').open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + flush_write_buffer(db) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(counts['oca_index_staging'], 10) + self.assertGreater(counts['oca_parties_staging'], 10) + + +class ParseWriteBufferTests(unittest.TestCase): + def test_flush_order_deletes_before_inserts(self): + from lib.parse_write_buffer import StagingWriteBuffer + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'buf.duckdb')) + db.execute('CREATE TABLE t (id INTEGER, v VARCHAR)') + db.execute('INSERT INTO t VALUES (1, ?)', ('old',)) + buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=100, flush_every_n_cases=10), + ) + buffer.queue_delete('DELETE FROM t WHERE id = ?', (1,)) + buffer.queue_insert('INSERT INTO t VALUES (?, ?)', (1, 'new')) + buffer.flush() + row = db.execute('SELECT v FROM t WHERE id = 1').fetchone() + db.close() + self.assertEqual(row[0], 'new') + + def test_config_from_env(self): + os.environ['PARSE_WRITE_BATCH_SIZE'] = '128' + os.environ['PARSE_WRITE_FLUSH_EVERY_N_CASES'] = '8' + try: + cfg = ParseWriteConfig.from_env() + self.assertEqual(cfg.batch_size, 128) + self.assertEqual(cfg.flush_every_n_cases, 8) + finally: + os.environ.pop('PARSE_WRITE_BATCH_SIZE', None) + os.environ.pop('PARSE_WRITE_FLUSH_EVERY_N_CASES', None) + + +class ParserBatchingParityExportTests(unittest.TestCase): + def test_checksum_stable_with_batching_enabled(self): + from parse_pipeline_helpers import run_parse_export_in_dir + + with tempfile.TemporaryDirectory() as 
tmp: + priv = os.path.join(tmp, 'private') + os.makedirs(priv) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 20, + child_profile='weekly', + ) + + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '1' + try: + rows_on, checksums_on = run_parse_export_in_dir(priv, parse_num_threads=1) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + + priv2 = os.path.join(tmp, 'private2') + os.makedirs(priv2) + write_test_zip( + os.path.join(priv2, 'LandlordTenant.Incr.2024-03-08.zip'), + 20, + child_profile='weekly', + ) + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + try: + rows_off, checksums_off = run_parse_export_in_dir(priv2, parse_num_threads=1) + finally: + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + + self.assertEqual(checksums_on, checksums_off) + self.assertEqual(rows_on, rows_off) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_parser_regression_safety.py b/tests/test_parser_regression_safety.py new file mode 100644 index 0000000..f818290 --- /dev/null +++ b/tests/test_parser_regression_safety.py @@ -0,0 +1,238 @@ +"""Regression and failure-safety tests for Option A parser batching and export parity.""" + +from __future__ import annotations + +import io +import os +import tempfile +import unittest +import zipfile +from unittest.mock import patch + +from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.etl_constants import DATA_FILENAME +from lib.etl_stages import export_staging_to_csv +from lib.parse_write_buffer import ParseWriteConfig, StagingWriteBuffer, attach_write_buffer, flush_write_buffer +from lib.parsers import parse_case, parse_file + +from csv_checksums import md5_dir_csvs +from parse_pipeline_helpers import run_parse_export_in_dir +from parser_xml_fixtures import build_case_xml, build_extract_xml, write_test_zip + + +def _init_staging_db(path: str) -> DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + 
attach_write_buffer(db) + return db + + +def _parse_zip_bytes( + xml_bytes: bytes, + db: DuckDB, + extract_date: str = '2024-03-08', +) -> None: + buf = io.BytesIO() + with zipfile.ZipFile(buf, 'w') as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + buf.seek(0) + with zipfile.ZipFile(buf, 'r') as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, extract_date, num_threads=1) + + +class WarmParseIdempotencyTests(unittest.TestCase): + def test_double_parse_same_staging_db_stable_row_counts(self): + xml_bytes = build_extract_xml(40, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + db = _init_staging_db(db_path) + try: + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + first_counts = fetch_staging_row_counts(db) + + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + second_counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(first_counts, second_counts) + self.assertEqual(first_counts['oca_index_staging'], 40) + + def test_double_parse_export_checksums_unchanged(self): + with tempfile.TemporaryDirectory() as tmp: + priv = os.path.join(tmp, 'private') + pub = os.path.join(priv, 'public') + os.makedirs(pub) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 25, + child_profile='weekly', + ) + + staging_path = os.path.join(priv, 'staging.duckdb') + db = _init_staging_db(staging_path) + try: + with zipfile.ZipFile( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), 'r' + ) as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + flush_write_buffer(db) + export_staging_to_csv(db, pub, upload=False) + checksums_first = md5_dir_csvs(pub) + + with zipfile.ZipFile( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), 'r' + ) as zf: + with zf.open(DATA_FILENAME) as xml_file: + parse_file(xml_file, db, '2024-03-08', num_threads=1) + 
flush_write_buffer(db) + export_staging_to_csv(db, pub, upload=False) + checksums_second = md5_dir_csvs(pub) + finally: + db.close() + + self.assertEqual(checksums_first, checksums_second) + + +class ColdRerunIdempotencyTests(unittest.TestCase): + def test_cold_rerun_parity(self): + """Two cold parse+export runs must match row counts and export checksums.""" + with tempfile.TemporaryDirectory() as tmp: + priv1 = os.path.join(tmp, 'run1') + priv2 = os.path.join(tmp, 'run2') + os.makedirs(priv1) + os.makedirs(priv2) + write_test_zip( + os.path.join(priv1, 'LandlordTenant.Incr.2024-03-08.zip'), + 30, + child_profile='weekly', + ) + write_test_zip( + os.path.join(priv2, 'LandlordTenant.Incr.2024-03-08.zip'), + 30, + child_profile='weekly', + ) + + rows1, checksums1 = run_parse_export_in_dir(priv1, parse_num_threads=1) + rows2, checksums2 = run_parse_export_in_dir(priv2, parse_num_threads=1) + + self.assertEqual(checksums1, checksums2) + self.assertEqual(rows1, rows2) + + +class ParserFailureRerunTests(unittest.TestCase): + def test_mid_file_failure_flush_then_rerun_matches_clean_parse(self): + xml_bytes = build_extract_xml(20, child_profile='weekly') + fail_on_case = 8 + seen = {'n': 0} + + def parse_case_maybe_fail(case, db, extract_date): + seen['n'] += 1 + if seen['n'] == fail_on_case: + raise RuntimeError('injected parse failure') + parse_case(case, db, extract_date) + + with tempfile.TemporaryDirectory() as tmp: + clean_path = os.path.join(tmp, 'clean.duckdb') + dirty_path = os.path.join(tmp, 'dirty.duckdb') + + clean_db = _init_staging_db(clean_path) + dirty_db = _init_staging_db(dirty_path) + try: + _parse_zip_bytes(xml_bytes, clean_db) + flush_write_buffer(clean_db) + clean_counts = fetch_staging_row_counts(clean_db) + + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + _parse_zip_bytes(xml_bytes, dirty_db) + flush_write_buffer(dirty_db) + + _parse_zip_bytes(xml_bytes, dirty_db) + flush_write_buffer(dirty_db) + recovery_counts = 
fetch_staging_row_counts(dirty_db) + finally: + clean_db.close() + dirty_db.close() + + self.assertEqual(clean_counts, recovery_counts) + + +class BatchBoundaryCorrectnessTests(unittest.TestCase): + def test_aggressive_batching_matches_legacy_counts(self): + xml_bytes = build_extract_xml(48, child_profile='weekly') + env = { + 'PARSE_WRITE_BATCH_ENABLED': '1', + 'PARSE_WRITE_BATCH_SIZE': '4', + 'PARSE_WRITE_FLUSH_EVERY_N_CASES': '3', + } + with tempfile.TemporaryDirectory() as tmp: + legacy_path = os.path.join(tmp, 'legacy.duckdb') + batched_path = os.path.join(tmp, 'batched.duckdb') + legacy_db = DuckDB(legacy_path) + legacy_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + batched_db = DuckDB(batched_path) + batched_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(batched_db) + + try: + os.environ['PARSE_WRITE_BATCH_ENABLED'] = '0' + _parse_zip_bytes(xml_bytes, legacy_db) + for key, value in env.items(): + os.environ[key] = value + _parse_zip_bytes(xml_bytes, batched_db) + flush_write_buffer(batched_db) + legacy_counts = fetch_staging_row_counts(legacy_db) + batched_counts = fetch_staging_row_counts(batched_db) + finally: + for key in env: + os.environ.pop(key, None) + os.environ.pop('PARSE_WRITE_BATCH_ENABLED', None) + legacy_db.close() + batched_db.close() + + self.assertEqual(legacy_counts, batched_counts) + + def test_no_duplicate_child_rows_across_flush_windows(self): + from lxml import etree + + case_id = 'LT-BOUNDARY-001' + cases = [ + build_case_xml(case_id, num_parties=2, num_events=1), + build_case_xml(case_id, num_parties=4, num_events=2), + build_case_xml(case_id, num_parties=3, num_events=3), + ] + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'b.duckdb')) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + db.write_buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=2, flush_every_n_cases=1), + ) + try: + for case_xml in 
cases: + case = etree.fromstring(case_xml.encode('utf-8')) + parse_case(case, db, '2024-03-01') + flush_write_buffer(db) + parties = db.execute( + 'SELECT COUNT(*) FROM oca_parties_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone()[0] + events = db.execute( + 'SELECT COUNT(*) FROM oca_events_staging WHERE indexnumberid = ?', + (case_id,), + ).fetchone()[0] + self.assertGreater(db.write_buffer._flush_count, 2) + finally: + db.close() + + self.assertEqual(parties, 3) + self.assertEqual(events, 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_staging_csv_export.py b/tests/test_staging_csv_export.py new file mode 100644 index 0000000..5287353 --- /dev/null +++ b/tests/test_staging_csv_export.py @@ -0,0 +1,208 @@ +import csv +import os +import tempfile +import unittest + +import duckdb + +from lib.duckdb_database import DuckDB +from lib.etl_csv import preprocess_csv_file, replace_postgres_array_brackets +from lib.staging_csv_export import ( + nullable_int_csv_sql, + postgres_array_brackets_sql, + staging_csv_needs_preprocess, +) + + +class PostgresArrayBracketsSqlTests(unittest.TestCase): + def _eval_sql(self, value: str | None) -> str | None: + conn = duckdb.connect(':memory:') + literal = 'NULL' if value is None else f"'{value.replace(chr(39), chr(39)*2)}'" + row = conn.execute( + f"SELECT {postgres_array_brackets_sql(literal)}" + ).fetchone() + conn.close() + return row[0] + + def test_matches_python_simple_array(self): + self.assertEqual(self._eval_sql('[a,b]'), replace_postgres_array_brackets('[a,b]')) + + def test_matches_python_json_object_array(self): + value = '[{"appearanceoutcometype":"Hearing"}]' + self.assertEqual(self._eval_sql(value), replace_postgres_array_brackets(value)) + + def test_matches_python_plain_text(self): + self.assertEqual(self._eval_sql('plain'), replace_postgres_array_brackets('plain')) + + def test_null_unchanged(self): + self.assertIsNone(self._eval_sql(None)) + + +class 
NullableIntSqlTests(unittest.TestCase): + def _eval_int(self, value) -> str: + conn = duckdb.connect(':memory:') + if value is None: + row = conn.execute(f"SELECT {nullable_int_csv_sql('NULL::INTEGER')}").fetchone() + else: + row = conn.execute( + f"SELECT {nullable_int_csv_sql(str(int(value)))}" + ).fetchone() + conn.close() + return row[0] + + def test_null_becomes_sql_null(self): + conn = duckdb.connect(':memory:') + row = conn.execute( + f"SELECT {nullable_int_csv_sql('motionsequence')} FROM (SELECT NULL::INTEGER AS motionsequence) t" + ).fetchone() + self.assertIsNone(row[0]) + + def test_nan_marker_becomes_sql_null(self): + conn = duckdb.connect(':memory:') + row = conn.execute( + f"SELECT {nullable_int_csv_sql('v')} FROM (SELECT 'NaN' AS v) t" + ).fetchone() + self.assertIsNone(row[0]) + + +class StagingExportIntegrationTests(unittest.TestCase): + def test_appearances_export_drops_appearanceid(self): + with tempfile.TemporaryDirectory() as tmp: + db_path = os.path.join(tmp, 'staging.duckdb') + pub = os.path.join(tmp, 'public') + os.makedirs(pub) + db = DuckDB(db_path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + db.execute( + """ + INSERT INTO oca_appearances_staging ( + indexnumberid, appearanceid, appearancedatetime, + appearancepurpose, motionsequence, appearanceoutcomes + ) VALUES ( + 'LT-1', 99, '2024-02-15 10:00:00', 'Conference', NULL, + '[{"appearanceoutcometype":"Adjourned"}]' + ) + """ + ) + db.export_tables_to_csv(pub) + db.close() + + path = os.path.join(pub, 'oca_appearances_staging.csv') + with open(path, newline='', encoding='utf-8') as f: + row = next(csv.DictReader(f)) + self.assertNotIn('appearanceid', row) + self.assertEqual(row['motionsequence'], '') + self.assertEqual( + row['appearanceoutcomes'], + '[{"appearanceoutcometype":"Adjourned"}]', + ) + + def test_index_array_export_matches_preprocess(self): + with tempfile.TemporaryDirectory() as tmp: + raw_path = os.path.join(tmp, 'raw.csv') + export_path = 
os.path.join(tmp, 'export.csv') + conn = duckdb.connect(':memory:') + conn.execute('CREATE TABLE t (specialtydesignationtypes VARCHAR[])') + conn.execute("INSERT INTO t VALUES (['HP', 'RTC'])") + conn.execute(f"COPY t TO '{raw_path}' (HEADER, DELIMITER ',')") + conn.close() + + with open(raw_path, newline='', encoding='utf-8') as f: + reader = csv.DictReader(f) + row = next(reader) + with open(export_path, 'w', newline='', encoding='utf-8') as f: + writer = csv.DictWriter(f, fieldnames=['specialtydesignationtypes']) + writer.writeheader() + writer.writerow(row) + + preprocess_csv_file(export_path) + + db_path = os.path.join(tmp, 'staging.duckdb') + pub = os.path.join(tmp, 'public') + os.makedirs(pub) + db = DuckDB(db_path) + db.execute('CREATE TABLE oca_index_staging (specialtydesignationtypes VARCHAR[])') + db.execute("INSERT INTO oca_index_staging VALUES (['HP', 'RTC'])") + db.export_tables_to_csv(pub) + db.close() + + with open(export_path, newline='', encoding='utf-8') as f: + preprocessed = next(csv.DictReader(f))['specialtydesignationtypes'] + with open(os.path.join(pub, 'oca_index_staging.csv'), newline='', encoding='utf-8') as f: + exported = next(csv.DictReader(f))['specialtydesignationtypes'] + self.assertEqual(exported, preprocessed) + + +class ExportMatchesLegacyPreprocessTests(unittest.TestCase): + def test_all_staging_csvs_match_raw_copy_plus_preprocess(self): + import lib.etl_csv as etl_csv_mod + from parser_xml_fixtures import write_test_zip + from lib.etl_stages import parse_xml_to_staging + + with tempfile.TemporaryDirectory() as tmp: + priv = os.path.join(tmp, 'priv') + os.makedirs(priv) + write_test_zip( + os.path.join(priv, 'LandlordTenant.Incr.2024-03-08.zip'), + 10, + child_profile='weekly', + ) + pub_legacy = os.path.join(tmp, 'legacy') + pub_export = os.path.join(tmp, 'export') + os.makedirs(pub_legacy) + os.makedirs(pub_export) + + class _Manifest: + def upsert_step(self, *args, **kwargs): + pass + + def upsert_file(self, *args, **kwargs): 
+ pass + + db_legacy = DuckDB(os.path.join(priv, 'legacy.duckdb')) + db_legacy.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + parse_xml_to_staging(_Manifest(), db_legacy, priv, parse_num_threads=1) + with db_legacy._lock: + for table_row in db_legacy.conn.execute('SHOW TABLES').fetchall(): + table_name = table_row[0] + path = os.path.join(pub_legacy, f'{table_name}.csv') + db_legacy.conn.execute( + f"COPY {table_name} TO '{path}' (HEADER, DELIMITER ',')" + ) + db_legacy.close() + + orig = etl_csv_mod.staging_csv_needs_preprocess + etl_csv_mod.staging_csv_needs_preprocess = lambda _f: True + try: + etl_csv_mod.preprocess_staging_csv_dir(pub_legacy) + finally: + etl_csv_mod.staging_csv_needs_preprocess = orig + + db_export = DuckDB(os.path.join(priv, 'export.duckdb')) + db_export.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + parse_xml_to_staging(_Manifest(), db_export, priv, parse_num_threads=1) + db_export.export_tables_to_csv(pub_export) + db_export.close() + + for name in sorted(os.listdir(pub_legacy)): + if not name.endswith('.csv'): + continue + with open(os.path.join(pub_legacy, name), 'rb') as f: + legacy = f.read() + with open(os.path.join(pub_export, name), 'rb') as f: + exported = f.read() + self.assertEqual(exported, legacy, name) + + +class StagingCsvNeedsPreprocessTests(unittest.TestCase): + def test_staging_tables_skip_second_pass(self): + self.assertFalse(staging_csv_needs_preprocess('oca_index_staging.csv')) + self.assertFalse(staging_csv_needs_preprocess('oca_addresses_staging.csv')) + self.assertFalse(staging_csv_needs_preprocess('oca_appearances_staging.csv')) + + def test_unknown_table_still_preprocessed(self): + self.assertTrue(staging_csv_needs_preprocess('custom_table.csv')) + + +if __name__ == '__main__': + unittest.main() From 5909343124c7cbbae872af94466063cc19262842 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 28 May 2026 21:41:36 -0400 Subject: [PATCH 15/30] update readmes --- .env.example | 
36 +++++++++++----- README.md | 80 ++++++++++++++++++++---------------- lib/README.md | 111 ++++++++++++++++++++++++++++++++++++++------------ 3 files changed, 157 insertions(+), 70 deletions(-) diff --git a/.env.example b/.env.example index 558bff5..701bcc0 100644 --- a/.env.example +++ b/.env.example @@ -1,44 +1,58 @@ # Mode (level 1 or 2) +# 2 = full S3 publish (production default) MODE=2 -# Optional runtime controls (safe defaults preserve current behavior) +# PostgreSQL target schema (optional) +# Sets session search_path for the ETL run. Leave empty for public. DB_SCHEMA= + +# Optional S3 key prefix for private/ and public/ paths (e.g. refactor/ for isolated runs) S3_PREFIX= + +# Reprocess zip files from S3 private/ backups by filename glob +# Example: LandlordTenant.Incr.2024-*.zip REPROCESS_GLOB= + +# When true, replay REPROCESS_GLOB matches even if already completed in etl_files manifest FORCE_REPROCESS=false + +# Geocoding and CSV tuning (optional; safe defaults preserve current behavior) GEOCODE_WORKERS= CENSUS_BATCH_CHUNK_SIZE=2500 CSV_ROW_CHECK_CHUNK_SIZE=1000 +# Parser → DuckDB write batching (Option A; enabled by default) +PARSE_WRITE_BATCH_ENABLED=1 +PARSE_WRITE_BATCH_SIZE=128 +PARSE_WRITE_FLUSH_EVERY_N_CASES=16 + # The database URL # ---------------- # -# This is the postgres instance the parsed cases will -# load data into. -# -# If you use the Dockerfile you don't need to change this. Otherwise make this the remote +# PostgreSQL instance where parsed cases are loaded and promoted. +# With Docker Compose, point this at your RDS instance (or local db service). 
DATABASE_URL= +# Optional clone/sync target (legacy maintenance path) +# CLONED_DATABASE_URL= # Amazon Web Services (AWS) configuration # --------------------------------------- # -# These are used to move data to/from S3 bucket -# If you are using AWS Lambda you do not need to include AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY -# please configure https://docs.aws.amazon.com/lambda/latest/dg/lambda-intro-execution-role.html +# Used to move data to/from the S3 bucket and for RDS aws_s3 import/export. +# On ECS/Lambda you can omit keys and use an IAM role instead. AWS_ACCESS_KEY_ID= AWS_SECRET_ACCESS_KEY= AWS_S3_BUCKET_NAME= - # OCA SFTP credentials # --------------------------------------- # -# These are used to download raw XML and CSV files +# Used to download new raw XML zip files from OCA SFTP_HOST= SFTP_USER= SFTP_PSWD= -SFTP_DIR= \ No newline at end of file +SFTP_DIR= diff --git a/README.md b/README.md index 583b8e4..8857597 100644 --- a/README.md +++ b/README.md @@ -43,27 +43,44 @@ The data we receive from OCA is an extract of all landlord and tenant cases in N ## About the code -For information about the details of various components, see [`/lib`](/lib) +The ETL pipeline lives under [`lib/`](lib/). See [`lib/README.md`](lib/README.md) for stage-by-stage architecture, module map, and SQL script roles. -### Local Setup +### Local setup -First, you will only be able to run this yourself if you have HDC's credentials to access to the SFTP to get the raw data transfered from OCA and access to the private AWS S3 where those files are stored. +You need credentials for OCA SFTP and the BetaNYC AWS for S3 (file storage) and RDS (PostgreSQL database), plus Docker and Docker Compose. -You will need Docker and Docker Compose. 
+Copy the example env file and fill in credentials: -First, you'll want to create an `.env` file by copying the example one: - -``` +```bash cp .env.example .env # Or 'copy .env.example .env' on Windows ``` -Take a look at the `.env` file and fill in the AWS S3 credentials. +Required variables: `DATABASE_URL`, `AWS_*`, `SFTP_*`, and `MODE=2` for full publish. Optional runtime controls are documented in [`.env.example`](.env.example). + +**Typical weekly run** (process new SFTP files only): +```bash +docker compose run --rm app python oca_update.py +``` -To run the whole process in the docker container run: +**Refactor / replay run** (isolated schema and S3 prefix, force replay from S3 private backups): +```bash +docker compose run --rm app env \ + DB_SCHEMA=refactor \ + S3_PREFIX=refactor/ \ + REPROCESS_GLOB='LandlordTenant.Incr.2024-*.zip' \ + FORCE_REPROCESS=true \ + GEOCODE_WORKERS=2 \ + python oca_update.py ``` -docker-compose up + +Compose reads `.env` from the repo root for `DATABASE_URL`, AWS, and SFTP. Override any variable inline with `env VAR=value ...` as above. + +Run the test suite in Docker: + +```bash +docker compose run --rm app python -m unittest discover -s tests -p "test_*.py" ``` ### Weekly scheduling and Kubernetes @@ -76,37 +93,32 @@ See [`docs/operations/weekly-etl-scheduling.md`](docs/operations/weekly-etl-sche Create cluster secrets from [`k8s/oca-etl-secret.example.yaml`](k8s/oca-etl-secret.example.yaml); do not commit real credentials. -### Runtime controls (Step 1 refactor) - -These optional variables let operators isolate schema/data paths and tune memory-sensitive parts of the run. If omitted, behavior remains the same as before (new files only, default schema/search path, default worker/chunk values). +### Runtime controls -- `DB_SCHEMA`: set PostgreSQL `search_path` target schema for the ETL session. -- `S3_PREFIX`: optional namespace prefix for S3 object keys (applies to `private/` and `public/` paths). 
-- `REPROCESS_GLOB`: filename glob against S3 `private/` zip backups (example: `LandlordTenant.Incr.2024-*.zip`). -- `FORCE_REPROCESS`: when `true`, include `REPROCESS_GLOB` matches for replay; otherwise matches are logged and skipped. -- `GEOCODE_WORKERS`: max workers for the Geosupport multiprocessing pool. -- `CENSUS_BATCH_CHUNK_SIZE`: chunk size for Census batch geocoder requests (default `2500`). -- `CSV_ROW_CHECK_CHUNK_SIZE`: chunk size for CSV non-empty checks before S3 import (default `1000`). +Optional env vars (and matching `oca_update.py` CLI flags) tune isolation, replay, memory, and parse throughput. When unset, defaults preserve standard weekly behavior: new SFTP files only, `public` schema, CPU-count geocode workers. -Example Docker run with non-default schema and forced replay: +| Variable | Purpose | Default | +|----------|---------|---------| +| `DB_SCHEMA` | PostgreSQL `search_path` target | `public` | +| `S3_PREFIX` | Prefix for `private/` and `public/` S3 keys | none | +| `REPROCESS_GLOB` | Filename glob for S3 private zip replay | none | +| `FORCE_REPROCESS` | Replay manifest-completed glob matches | `false` | +| `GEOCODE_WORKERS` | Geosupport multiprocessing pool size | CPU count | +| `CENSUS_BATCH_CHUNK_SIZE` | Census batch geocoder chunk | `2500` | +| `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess / row-check chunk | `1000` | +| `PARSE_WRITE_BATCH_ENABLED` | Buffer parser DuckDB writes in txn windows | `1` (on) | +| `PARSE_WRITE_BATCH_SIZE` | Max buffered INSERTs before flush | `128` | +| `PARSE_WRITE_FLUSH_EVERY_N_CASES` | Flush cadence per parse worker | `16` | -```bash -DB_SCHEMA=oca_refactor \ -S3_PREFIX=refactor/dev \ -REPROCESS_GLOB='LandlordTenant.Incr.2024-*.zip' \ -FORCE_REPROCESS=true \ -GEOCODE_WORKERS=4 \ -CENSUS_BATCH_CHUNK_SIZE=2000 \ -docker-compose run --rm app python oca_update.py -``` +Use an isolated `S3_PREFIX` (e.g. 
`refactor/`) for refactor and end-to-end test runs so reads and writes stay out of production public paths. Memory target per job is **≤ 2 GiB**; lower `GEOCODE_WORKERS` if geocoding approaches the limit. ### Jupyter notebook for maintenance -Comment out `CMD ["python", "oca_update.py"]` in the Dockerfile +Comment out `CMD ["python", "oca_update.py"]` in the Dockerfile, then: -``` -docker-compose up -d -docker-compose exec app /bin/bash +```bash +docker compose up -d +docker compose exec app /bin/bash jupyter notebook --allow-root --ip 0.0.0.0 --no-browser ``` diff --git a/lib/README.md b/lib/README.md index 15cedf6..eba6d56 100644 --- a/lib/README.md +++ b/lib/README.md @@ -1,46 +1,107 @@ -# Code +# OCA ETL pipeline -### `sftp.py` +This directory contains the Extract–Transform–Load pipeline that ingests NY State housing court XML from OCA, parses it into relational tables, loads PostgreSQL on RDS, geocodes addresses, and publishes CSVs to S3. This process works with the protected address-level data ("level 2") but maintains public exports of the deidentified (zip code only, "level 1") version with the full address data kept only in secure S3 and RDS for organization under the legal agreement with OCA. -This class provides a connection to the SFTP maintained by OCA and allows us to list the available files and download selected files. +Entry point: [`oca_update.py`](../oca_update.py) loads `.env` and calls `oca_etl()` in [`etl.py`](etl.py). -### `s3.py` +## Pipeline flow -This class provides a connection to our Amazon S3 account where both the private raw files and public csv files are stored, and allows us to list the available files and upload new files. 
+```mermaid +flowchart TD + sftp[SFTP new zip files] --> select[Select files to process] + s3backup[S3 private backups] --> select + select --> download[Download selected zips] + download --> parse[Stream parse XML → DuckDB staging] + parse --> export[Export staging CSVs] + export --> preprocess[Normalize CSVs for RDS import] + preprocess --> s3upload[Upload staging CSVs to S3] + s3upload --> import[RDS import staging tables] + import --> normalize[SQL normalize + appearance outcomes] + normalize --> promote[Atomic promote staging → main] + promote --> publish[Export core tables to S3 public/] + publish --> geocode[Incremental geocode delta] + geocode --> addrpub[Publish addresses + views] +``` -### `database.py` +Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_stages.py) for stage implementations. -This class is adapted from [NYCDB](https://github.com/nycdb/nycdb/blob/master/src/nycdb/database.py), and provides a connection to the PostgreSQL database where the parsed files are stored. It includes methods to insert new rows, execute SQL files, export tables to CSV, and to create and restore from [pg_dump](https://www.postgresql.org/docs/12/app-pgdump.html) files. +## Stages -### `parsers.py` +| Stage | Module | What it does | +|-------|--------|--------------| +| Select files | `etl_file_selection.py`, `etl_stages.select_input_files` | Picks new SFTP zips and/or S3 private replays (`REPROCESS_GLOB`); skips manifest-completed files unless `FORCE_REPROCESS=true`. | +| Download | `etl_stages.download_selected_files` | New files from SFTP; replay files from S3 private backup. | +| Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. | +| Export + preprocess | `duckdb_database.py`, `staging_csv_export.py`, `etl_csv.py` | DuckDB `COPY` with Postgres-compatible transforms; minimal second-pass CSV rewrite. 
| +| Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, then single-transaction promotion. | +| Publish core | `etl_publish.py`, `etl_stages.publish_core_tables` | Export `OCA_TABLES` to S3 public prefix; SSE-S3 normalization on objects written this run. | +| Geocode + publish addresses | `etl_geocode.py`, `geocode_record.py` | Delta-select rows missing lat/lon; Geosupport + Census batch; upsert by natural address key; skip address export when unchanged. | -The final function `parse_file` takes an XML file and database connection from `database.py` and iterates over each case, parsing all the data into the various tables. +## Key modules -### `utils.py` +### Connectivity -A few basic helper functions: +- [`sftp.py`](sftp.py) — list and download raw XML zip files from OCA SFTP. +- [`s3.py`](s3.py) — S3 upload/download, encryption normalization (`update_encryption`). +- [`database.py`](database.py) — PostgreSQL connection (NYCDB-derived), schema `search_path`, transactions, `aws_s3` import/export helpers. -* `make_dir` - * Create new local directories +### Parse and local staging -* `list_new_data_files` - * List files that are in the SFTP but not yet in S3 +- [`parsers.py`](parsers.py) — stream `` nodes from XML; per-case child table replace semantics; delete-event handling. +- [`duckdb_database.py`](duckdb_database.py) — local DuckDB staging DB; export to CSV with contract transforms. +- [`parse_write_buffer.py`](parse_write_buffer.py) — buffer INSERTs and flush in transaction windows (`PARSE_WRITE_*` env knobs). +- [`staging_csv_export.py`](staging_csv_export.py) — per-table export specs (Postgres array literals, nullable ints, appearances column rules). 
-* `promote_staging_to_main` (`etl_promotion.py`) - * Move newly parsed records in the database over from staging tables to the main ones (single transaction) +### ETL orchestration -* `create_date_files` - * Create plain text and image files for the most recent date of the data extracts for display in this repo +- [`etl.py`](etl.py) — run orchestrator, advisory lock, manifest lifecycle, stage sequencing. +- [`etl_constants.py`](etl_constants.py) — table list, zip patterns, S3 folder constants. +- [`etl_run_manifest.py`](etl_run_manifest.py) — `etl_runs` / `etl_files` / `etl_steps` bookkeeping; schema-scoped advisory lock. +- [`etl_helpers.py`](etl_helpers.py) — paths, CSV row checks, PLUTO download, date badge files. +- [`etl_csv.py`](etl_csv.py) — streaming CSV normalization for tables not handled at export time. +- [`etl_promotion.py`](etl_promotion.py) — atomic `promote_staging_to_main()`; count/checksum hooks for validation. +- [`etl_publish.py`](etl_publish.py) — targeted S3 publish and address-export skip logic. +- [`etl_geocode.py`](etl_geocode.py) — incremental geocode candidate fetch, chunked geocoding, natural-key upsert. -### `etl.py` +### Geocoding -This is the main script that does the full process. +- [`geocode_record.py`](geocode_record.py) — address normalization (usaddress), NYC Geosupport, Census batch geocoder. +## SQL scripts (`lib/sql/`) -### `oca_update.py` +Scripts run against the active session schema (`DB_SCHEMA` / `search_path`). -Finally, this file (in the top level of this repo) simply pulls environment variables from the `.env` file and runs `etl.py` to process an update to the data. 
+| Script | Role | +|--------|------| +| `create_tables.sql` | Non-destructive bootstrap of core tables and indexes | +| `create_tables_staging.sql` | Per-run RDS staging tables | +| `create_tables_staging_duckdb.sql` | Local DuckDB staging DDL | +| `normalize_staging_after_import.sql` | Nullable int coercion after S3 import | +| `update_appearance_outcomes.sql` | Assign `appearanceid`, expand outcomes JSON | +| `promote_staging_to_main.sql` | Single-transaction staging → main promotion | +| `ensure_promotion_indexes.sql` | Indexes for promotion and address natural keys | +| `select_addresses_needing_geocode.sql` | Delta rows for geocoding | +| `create_geocode_staging_table.sql`, `upsert_geocoded_addresses.sql` | Geocode staging merge | +| `create_addresses_views.sql` | PostGIS views after geocode | +| `create_etl_manifest_tables.sql` | Run manifest DDL | -### `geocode_record.py` +Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. -Uses usaddress to normalize addresses before sending it off to NYC's Geosupport to get bin, bbl, community districts, census tracts, council districts, and status messages. \ No newline at end of file +## Idempotency and run control + +- **Manifest** — each run records status in `etl_runs`, per-file progress in `etl_files`, and stage checkpoints in `etl_steps`. +- **Advisory lock** — one concurrent writer per schema (`pg_try_advisory_lock`). +- **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips completed files unless `FORCE_REPROCESS=true`. +- **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. +- **Promotion** — scoped delete + insert / upsert in one transaction; safe to retry after import failure. +- **Geocode** — only rows with `lat IS NULL` and a house number; upsert matches on address line columns, not `indexnumberid` alone. 
+ +## Output tables + +Core tables (also published as public CSVs): `oca_index`, `oca_causes`, `oca_addresses`, `oca_parties`, `oca_events`, `oca_appearances`, `oca_appearance_outcomes`, `oca_motions`, `oca_decisions`, `oca_judgments`, `oca_warrants`. Defined in [`etl_constants.py`](etl_constants.py). + +## Further reading + +- Root [README](../README.md) — setup, env vars, Docker invocation +- [`docs/operations/weekly-etl-scheduling.md`](../docs/operations/weekly-etl-scheduling.md) — cron, Kubernetes, EventBridge/ECS +- [`docs/`](../docs/) — data dictionary links and raw XML notes From 35c7edabfe1d50cdd6ed5cba806772ee67c71ad1 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 28 May 2026 21:48:27 -0400 Subject: [PATCH 16/30] prevent dropped remote db connections for long processes --- .env.example | 6 ++ README.md | 3 + lib/README.md | 8 +-- lib/database.py | 107 +++++++++++++++++---------- lib/etl.py | 5 +- lib/etl_run_manifest.py | 17 ----- tests/test_database_connection.py | 115 ++++++++++++++++++++++++++++++ tests/test_run_manifest.py | 14 ---- 8 files changed, 201 insertions(+), 74 deletions(-) create mode 100644 tests/test_database_connection.py diff --git a/.env.example b/.env.example index 701bcc0..a14090a 100644 --- a/.env.example +++ b/.env.example @@ -26,6 +26,12 @@ PARSE_WRITE_BATCH_ENABLED=1 PARSE_WRITE_BATCH_SIZE=128 PARSE_WRITE_FLUSH_EVERY_N_CASES=16 +# PostgreSQL TCP keepalives (optional; reduce idle disconnects on long ETL runs) +# DB_KEEPALIVES=1 +# DB_KEEPALIVES_IDLE=60 +# DB_KEEPALIVES_INTERVAL=10 +# DB_KEEPALIVES_COUNT=5 + # The database URL # ---------------- # diff --git a/README.md b/README.md index 8857597..4314b09 100644 --- a/README.md +++ b/README.md @@ -109,6 +109,9 @@ Optional env vars (and matching `oca_update.py` CLI flags) tune isolation, repla | `PARSE_WRITE_BATCH_ENABLED` | Buffer parser DuckDB writes in txn windows | `1` (on) | | `PARSE_WRITE_BATCH_SIZE` | Max buffered INSERTs before flush | `128` | | 
`PARSE_WRITE_FLUSH_EVERY_N_CASES` | Flush cadence per parse worker | `16` | +| `DB_KEEPALIVES_*` | PostgreSQL TCP keepalive tuning (see `.env.example`) | RDS-friendly defaults | + +Long runs (multi-hour XML parse, S3 upload, geocoding) may idle the RDS connection; the pipeline uses TCP keepalives and automatic reconnect (`ensure_connection`) before RDS-heavy stages. Optional `DB_KEEPALIVES_IDLE` / `DB_KEEPALIVES_INTERVAL` / `DB_KEEPALIVES_COUNT` override libpq defaults. Use an isolated `S3_PREFIX` (e.g. `refactor/`) for refactor and end-to-end test runs so reads and writes stay out of production public paths. Memory target per job is **≤ 2 GiB**; lower `GEOCODE_WORKERS` if geocoding approaches the limit. diff --git a/lib/README.md b/lib/README.md index eba6d56..d70e9e2 100644 --- a/lib/README.md +++ b/lib/README.md @@ -43,7 +43,7 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ - [`sftp.py`](sftp.py) — list and download raw XML zip files from OCA SFTP. - [`s3.py`](s3.py) — S3 upload/download, encryption normalization (`update_encryption`). -- [`database.py`](database.py) — PostgreSQL connection (NYCDB-derived), schema `search_path`, transactions, `aws_s3` import/export helpers. +- [`database.py`](database.py) — PostgreSQL connection (NYCDB-derived), schema `search_path`, TCP keepalives, automatic reconnect after idle drops, transactions, `aws_s3` import/export helpers. ### Parse and local staging @@ -54,9 +54,9 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ ### ETL orchestration -- [`etl.py`](etl.py) — run orchestrator, advisory lock, manifest lifecycle, stage sequencing. +- [`etl.py`](etl.py) — run orchestrator, manifest lifecycle, stage sequencing. - [`etl_constants.py`](etl_constants.py) — table list, zip patterns, S3 folder constants. -- [`etl_run_manifest.py`](etl_run_manifest.py) — `etl_runs` / `etl_files` / `etl_steps` bookkeeping; schema-scoped advisory lock. 
+- [`etl_run_manifest.py`](etl_run_manifest.py) — `etl_runs` / `etl_files` / `etl_steps` bookkeeping. - [`etl_helpers.py`](etl_helpers.py) — paths, CSV row checks, PLUTO download, date badge files. - [`etl_csv.py`](etl_csv.py) — streaming CSV normalization for tables not handled at export time. - [`etl_promotion.py`](etl_promotion.py) — atomic `promote_staging_to_main()`; count/checksum hooks for validation. @@ -90,7 +90,7 @@ Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. ## Idempotency and run control - **Manifest** — each run records status in `etl_runs`, per-file progress in `etl_files`, and stage checkpoints in `etl_steps`. -- **Advisory lock** — one concurrent writer per schema (`pg_try_advisory_lock`). +- **Connection resilience** — TCP keepalives and `ensure_connection()` reconnect before RDS-heavy stages after long parse/upload/geocode gaps. - **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips completed files unless `FORCE_REPROCESS=true`. - **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. - **Promotion** — scoped delete + insert / upsert in one transaction; safe to retry after import failure. 
diff --git a/lib/database.py b/lib/database.py index 582b403..583bb3b 100644 --- a/lib/database.py +++ b/lib/database.py @@ -1,9 +1,28 @@ +import os import urllib.parse +from contextlib import contextmanager + import psycopg2 import psycopg2.extras -from contextlib import contextmanager -from psycopg2 import sql -import os +from psycopg2 import InterfaceError, OperationalError, sql + + +def _env_int(name, default): + raw = os.environ.get(name) + if raw is None or str(raw).strip() == '': + return default + return int(raw) + + +def _connect_params(): + """libpq TCP keepalive settings (override via DB_KEEPALIVES_* env).""" + return { + 'keepalives': _env_int('DB_KEEPALIVES', 1), + 'keepalives_idle': _env_int('DB_KEEPALIVES_IDLE', 60), + 'keepalives_interval': _env_int('DB_KEEPALIVES_INTERVAL', 10), + 'keepalives_count': _env_int('DB_KEEPALIVES_COUNT', 5), + } + # https://github.com/nycdb/nycdb/blob/master/src/nycdb/sql.py def insert_many(table_name, rows): @@ -23,24 +42,50 @@ def insert_many(table_name, rows): fields = ', '.join(field_names) placeholders = ', '.join(["%({})s".format(k) for k in field_names]) template = f"({placeholders})" - sql = f"INSERT INTO {table_name} ({fields}) VALUES %s" + sql_str = f"INSERT INTO {table_name} ({fields}) VALUES %s" - return sql, template + return sql_str, template # https://github.com/nycdb/nycdb/blob/master/src/nycdb/database.py class Database: """Database connection to OCA database""" - def __init__(self, db_url, schema = '', autocommit = False): + def __init__(self, db_url, schema='', autocommit=False): self.db_url = db_url self.schema = schema - self.conn = psycopg2.connect(db_url) + self.conn = None + self._connect() + + def _connect(self): + self.conn = psycopg2.connect(self.db_url, **_connect_params()) if self.schema: self.set_search_path(self.schema) + def _close_connection(self): + if self.conn is not None: + try: + self.conn.close() + except Exception: + pass + self.conn = None + + def ensure_connection(self): + 
"""Ping the connection; reconnect if the server closed an idle session. + + Returns True if a new connection was opened, False if the existing one is healthy. + """ + try: + with self.conn.cursor() as curs: + curs.execute('SELECT 1') + return False + except (OperationalError, InterfaceError, AttributeError): + self._close_connection() + self._connect() + return True + def __exit__(self, exc_type, exc_value, traceback): - self.conn.close() + self._close_connection() def set_search_path(self, schema): with self.conn.cursor() as curs: @@ -49,6 +94,7 @@ def set_search_path(self, schema): def execute(self, SQL, autocommit=False): """Execute SQL without committing (for use inside transaction blocks).""" + self.ensure_connection() if autocommit: self.conn.set_session(autocommit=True) @@ -70,6 +116,7 @@ def sql(self, SQL, autocommit=False): @contextmanager def transaction(self): """Run a block in one DB transaction; rollback on any exception.""" + self.ensure_connection() try: yield self self.conn.commit() @@ -78,11 +125,13 @@ def transaction(self): raise def sql_fetch_one(self, SQL): + self.ensure_connection() with self.conn.cursor() as curs: curs.execute(SQL) return curs.fetchone() def sql_fetch_all(self, SQL): + self.ensure_connection() with self.conn.cursor() as curs: curs.execute(SQL) return curs.fetchall() @@ -96,7 +145,7 @@ def insert_rows(self, rows, table_name): """ Inserts many rows, all in the same transaction. """ - + self.ensure_connection() with self.conn.cursor() as curs: sql_str, template = insert_many(table_name, rows) try: @@ -108,11 +157,10 @@ def insert_rows(self, rows, table_name): page_size=len(rows) ) except psycopg2.DataError: - print(rows) # useful for debugging + print(rows) # useful for debugging raise self.conn.commit() - def execute_sql_file(self, sql_file, commit=True): """ Executes the provided sql file. 
@@ -127,48 +175,35 @@ def execute_sql_file(self, sql_file, commit=True): else: self.execute(sql_text) - def export_csv(self, table_name, file_path): """ Exports tables to CSV files """ - - f = open(file_path, 'w') - - with self.conn.cursor() as curs: - curs.copy_expert(f"COPY {table_name} TO STDOUT WITH CSV HEADER", f) - - f.close() + self.ensure_connection() + with open(file_path, 'w', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f"COPY {table_name} TO STDOUT WITH CSV HEADER", f) def import_csv(self, table_name, file_path): """ Imports a CSV file to existing table """ - - f = open(file_path, 'r') - - with self.conn.cursor() as curs: - curs.copy_expert(f'COPY {table_name} FROM STDIN WITH CSV HEADER', f) + self.ensure_connection() + with open(file_path, 'r', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f'COPY {table_name} FROM STDIN WITH CSV HEADER', f) self.conn.commit() - f.close() - - def export_view_as_csv(self, table_name, file_path): """ Exports tables to CSV files """ - - f = open(file_path, 'w') - - with self.conn.cursor() as curs: - curs.copy_expert(f"COPY (SELECT * FROM {table_name}) TO STDOUT WITH CSV HEADER", f) - - f.close() + self.ensure_connection() + with open(file_path, 'w', encoding='utf-8') as f: + with self.conn.cursor() as curs: + curs.copy_expert(f"COPY (SELECT * FROM {table_name}) TO STDOUT WITH CSV HEADER", f) def dump_to(self, file_path): """ pg_dump the database to file """ cmd = f"pg_dump {self.db_url} -Fc > {file_path}" os.system(cmd) - def restore_from(self, file_path): """ pg_restore the database from file """ cmd = f"pg_restore -d {self.db_url} -c {file_path}" os.system(cmd) - diff --git a/lib/etl.py b/lib/etl.py index 61cb310..dcd1c12 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -91,7 +91,6 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None force_reprocess=force_reprocess ) manifest.setup_tables() - manifest.acquire_lock() 
manifest.create_run() Path('staging.duckdb').unlink(missing_ok=True) @@ -117,6 +116,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=csv_row_check_chunk_size, ) + db.ensure_connection() staging_tables_with_data = import_and_promote_staging( manifest, db, @@ -127,6 +127,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None runtime_args.get('db_schema') or db_args.get('schema') or 'public', ) published_core_keys = publish_core_tables(manifest, db, s3_args, s3_prefix) + db.ensure_connection() geocode_and_publish_addresses( manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, geocode_workers, census_batch_chunk_size, @@ -146,5 +147,3 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) manifest.mark_run_failed(exc) raise - finally: - manifest.release_lock() diff --git a/lib/etl_run_manifest.py b/lib/etl_run_manifest.py index 6c01c07..2cce7e8 100644 --- a/lib/etl_run_manifest.py +++ b/lib/etl_run_manifest.py @@ -23,27 +23,10 @@ def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_repro self.reprocess_glob = reprocess_glob or '' self.force_reprocess = force_reprocess self.run_id = str(uuid.uuid4()) - self.lock_key = None - self.lock_acquired = False def setup_tables(self): self.db.execute_sql_file('create_etl_manifest_tables.sql') - def acquire_lock(self): - row = self.db.sql_fetch_one( - f"SELECT hashtext('oca_etl:' || {self._literal(self.schema_name)})::bigint" - ) - self.lock_key = row[0] - locked = self.db.sql_fetch_one(f"SELECT pg_try_advisory_lock({self.lock_key})") - self.lock_acquired = bool(locked and locked[0]) - if not self.lock_acquired: - raise RuntimeError(f"Another ETL run is already active for schema '{self.schema_name}'.") - - def release_lock(self): - 
if self.lock_acquired and self.lock_key is not None: - self.db.sql_fetch_one(f"SELECT pg_advisory_unlock({self.lock_key})") - self.lock_acquired = False - def create_run(self): payload = { "mode": self.mode, diff --git a/tests/test_database_connection.py b/tests/test_database_connection.py new file mode 100644 index 0000000..55ce5c7 --- /dev/null +++ b/tests/test_database_connection.py @@ -0,0 +1,115 @@ +import unittest +from unittest import mock + +from psycopg2 import OperationalError + +from lib.database import Database, _connect_params + + +class ConnectParamsTests(unittest.TestCase): + def test_default_keepalive_params(self): + with mock.patch.dict('os.environ', {}, clear=True): + params = _connect_params() + self.assertEqual(params['keepalives'], 1) + self.assertEqual(params['keepalives_idle'], 60) + self.assertEqual(params['keepalives_interval'], 10) + self.assertEqual(params['keepalives_count'], 5) + + def test_keepalive_env_overrides(self): + with mock.patch.dict('os.environ', { + 'DB_KEEPALIVES_IDLE': '120', + 'DB_KEEPALIVES_INTERVAL': '20', + }, clear=True): + params = _connect_params() + self.assertEqual(params['keepalives_idle'], 120) + self.assertEqual(params['keepalives_interval'], 20) + + +class EnsureConnectionTests(unittest.TestCase): + @staticmethod + def _mock_connection(): + conn = mock.Mock() + cursor = mock.MagicMock() + cursor.__enter__.return_value = cursor + cursor.__exit__.return_value = False + conn.cursor.return_value = cursor + return conn + + @mock.patch('lib.database.psycopg2.connect') + def test_ensure_connection_returns_false_when_healthy(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + connect_mock.reset_mock() + + reconnected = db.ensure_connection() + + self.assertFalse(reconnected) + connect_mock.assert_not_called() + conn.cursor.assert_called() + + @mock.patch('lib.database.psycopg2.connect') + def 
test_ensure_connection_reconnects_on_operational_error(self, connect_mock): + dead_conn = self._mock_connection() + dead_cursor = dead_conn.cursor.return_value.__enter__.return_value + dead_cursor.execute.side_effect = OperationalError('SSL SYSCALL error: EOF detected') + + live_conn = self._mock_connection() + connect_mock.side_effect = [dead_conn, live_conn] + + db = Database(db_url='postgres://example') + connect_mock.reset_mock() + connect_mock.side_effect = [live_conn] + + reconnected = db.ensure_connection() + + self.assertTrue(reconnected) + self.assertIs(db.conn, live_conn) + connect_mock.assert_called_once() + self.assertEqual( + connect_mock.call_args, + mock.call('postgres://example', **_connect_params()), + ) + + @mock.patch('lib.database.psycopg2.connect') + def test_ensure_connection_restores_search_path_after_reconnect(self, connect_mock): + initial_conn = self._mock_connection() + connect_mock.return_value = initial_conn + + db = Database(db_url='postgres://example', schema='oca_refactor') + + dead_cursor = initial_conn.cursor.return_value.__enter__.return_value + dead_cursor.execute.side_effect = OperationalError('connection closed') + + live_conn = self._mock_connection() + connect_mock.side_effect = [live_conn] + + db.ensure_connection() + + live_cursor = live_conn.cursor.return_value.__enter__.return_value + execute_calls = [str(call.args[0]) for call in live_cursor.execute.call_args_list] + self.assertTrue(any('search_path' in call for call in execute_calls)) + + @mock.patch('lib.database.psycopg2.connect') + def test_sql_uses_reconnected_connection(self, connect_mock): + dead_conn = self._mock_connection() + dead_cursor = dead_conn.cursor.return_value.__enter__.return_value + dead_cursor.execute.side_effect = [ + OperationalError('EOF detected'), + ] + + live_conn = self._mock_connection() + connect_mock.side_effect = [dead_conn, live_conn] + + db = Database(db_url='postgres://example') + connect_mock.reset_mock() + connect_mock.side_effect = 
[live_conn] + + db.ensure_connection() + db.sql('SELECT 2') + + live_conn.commit.assert_called() + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_run_manifest.py b/tests/test_run_manifest.py index 1f7d05c..fca3ce6 100644 --- a/tests/test_run_manifest.py +++ b/tests/test_run_manifest.py @@ -47,20 +47,6 @@ def test_reprocess_without_force_skips_completed_files(self): ) self.assertEqual(selected, ["LandlordTenant.Incr.2024-03-01.zip"]) - def test_advisory_lock_failure_raises(self): - fake_db = FakeDb() - fake_db.fetch_one_queue = [(12345,), (False,)] - manifest = EtlRunManifest( - db=fake_db, - schema_name="oca_refactor", - s3_prefix="refactor/dev", - mode="2", - reprocess_glob="", - force_reprocess=False, - ) - with self.assertRaises(RuntimeError): - manifest.acquire_lock() - if __name__ == "__main__": unittest.main() From 92a5dc2fff90a30912cfad44d69f21d9fbb752d9 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 09:36:32 -0400 Subject: [PATCH 17/30] add DB connection issue protections, suppress geosupport logs --- .env.example | 3 + lib/database.py | 96 +++++++++++++++----- lib/etl_geocode.py | 26 +++++- lib/etl_stages.py | 4 +- lib/geocode_record.py | 11 +++ lib/parsers.py | 2 +- lib/sql/select_addresses_needing_geocode.sql | 4 +- tests/test_database_connection.py | 41 ++++++++- tests/test_incremental_geocode.py | 1 + 9 files changed, 154 insertions(+), 34 deletions(-) diff --git a/.env.example b/.env.example index a14090a..71c758d 100644 --- a/.env.example +++ b/.env.example @@ -32,6 +32,9 @@ PARSE_WRITE_FLUSH_EVERY_N_CASES=16 # DB_KEEPALIVES_INTERVAL=10 # DB_KEEPALIVES_COUNT=5 +# Long-running SQL timeout in milliseconds (default 3600000 = 1 hour) +# DB_STATEMENT_TIMEOUT_MS=3600000 + # The database URL # ---------------- # diff --git a/lib/database.py b/lib/database.py index 583bb3b..90243b9 100644 --- a/lib/database.py +++ b/lib/database.py @@ -1,10 +1,9 @@ import os -import urllib.parse from contextlib import 
contextmanager import psycopg2 import psycopg2.extras -from psycopg2 import InterfaceError, OperationalError, sql +from psycopg2 import Error, InterfaceError, OperationalError, sql def _env_int(name, default): @@ -14,6 +13,11 @@ def _env_int(name, default): return int(raw) +def default_statement_timeout_ms(): + """RDS statement_timeout for long-running ETL SQL (override via DB_STATEMENT_TIMEOUT_MS).""" + return _env_int('DB_STATEMENT_TIMEOUT_MS', 3_600_000) + + def _connect_params(): """libpq TCP keepalive settings (override via DB_KEEPALIVES_* env).""" return { @@ -70,19 +74,51 @@ def _close_connection(self): pass self.conn = None + def _connection_is_closed(self): + if self.conn is None: + return True + closed = getattr(self.conn, 'closed', None) + if isinstance(closed, bool): + return closed + if isinstance(closed, int): + return closed != 0 + return False + + def _safe_rollback(self): + if self._connection_is_closed(): + self._close_connection() + return + try: + self.conn.rollback() + except (InterfaceError, OperationalError, Error, AttributeError): + self._close_connection() + + def set_statement_timeout(self, timeout_ms=None): + timeout = timeout_ms if timeout_ms is not None else default_statement_timeout_ms() + self.sql(f"SET statement_timeout = '{timeout}'") + def ensure_connection(self): """Ping the connection; reconnect if the server closed an idle session. Returns True if a new connection was opened, False if the existing one is healthy. 
""" + if self._connection_is_closed(): + self._connect() + return True try: with self.conn.cursor() as curs: curs.execute('SELECT 1') return False - except (OperationalError, InterfaceError, AttributeError): - self._close_connection() - self._connect() - return True + except Error: + try: + self.conn.rollback() + with self.conn.cursor() as curs: + curs.execute('SELECT 1') + return False + except (OperationalError, InterfaceError, Error, AttributeError): + self._close_connection() + self._connect() + return True def __exit__(self, exc_type, exc_value, traceback): self._close_connection() @@ -110,8 +146,13 @@ def sql(self, SQL, autocommit=False): Set autocommit to run queries like VACUUM FULL [1] [1]: https://til.codeinthehole.com/posts/about-a-gotcha-with-psycopg2s-autocommit-handling/ """ - self.execute(SQL, autocommit=autocommit) - self.conn.commit() + self.ensure_connection() + try: + self.execute(SQL, autocommit=autocommit) + self.conn.commit() + except Exception: + self._safe_rollback() + raise @contextmanager def transaction(self): @@ -121,7 +162,7 @@ def transaction(self): yield self self.conn.commit() except Exception: - self.conn.rollback() + self._safe_rollback() raise def sql_fetch_one(self, SQL): @@ -141,25 +182,32 @@ def sql_fetch_all_from_file(self, sql_file): with open(file_path, 'r', encoding='utf-8') as f: return self.sql_fetch_all(f.read()) - def insert_rows(self, rows, table_name): + def insert_rows(self, rows, table_name, page_size=1000): """ Inserts many rows, all in the same transaction. 
""" + if not rows: + return + self.ensure_connection() - with self.conn.cursor() as curs: - sql_str, template = insert_many(table_name, rows) - try: - psycopg2.extras.execute_values( - curs, - sql_str, - rows, - template=template, - page_size=len(rows) - ) - except psycopg2.DataError: - print(rows) # useful for debugging - raise - self.conn.commit() + try: + with self.conn.cursor() as curs: + sql_str, template = insert_many(table_name, rows) + try: + psycopg2.extras.execute_values( + curs, + sql_str, + rows, + template=template, + page_size=min(page_size, len(rows)), + ) + except psycopg2.DataError: + print(rows) # useful for debugging + raise + self.conn.commit() + except Exception: + self._safe_rollback() + raise def execute_sql_file(self, sql_file, commit=True): """ diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py index b45e863..cec37e5 100644 --- a/lib/etl_geocode.py +++ b/lib/etl_geocode.py @@ -1,12 +1,11 @@ import functools import multiprocessing -import os from itertools import repeat import numpy as np import pandas as pd -from .geocode_record import geocode_record, geocode_using_census_batch +from .geocode_record import geocode_record, geocode_using_census_batch, suppress_geosupport_logging ADDRESS_ROW_KEY_COLUMNS = [ 'indexnumberid', 'street1', 'street2', 'city', 'state', 'postalcode', @@ -57,7 +56,8 @@ def address_row_key(row): def row_needs_geocode(row): """Mirror select_addresses_needing_geocode.sql for unit tests.""" - return not _has_lat(row.get('lat')) and str(row.get('house_number') or '').strip() != '' + # return not _has_lat(row.get('lat')) and str(row.get('house_number') or '').strip() != '' + return not _has_lat(row.get('lat')) def _rows_from_fetchall(rows): @@ -86,6 +86,14 @@ def _prepare_rows_for_db(rows): return prepared +def _init_geosupport_worker(): + suppress_geosupport_logging() + + +def _geosupport_worker(record): + return geocode_record(record, addr_cols=['street1', 'city', 'postalcode']) + + def _run_geosupport(records, 
geocode_workers, geocode_record_fn): geocode_one = functools.partial( geocode_record_fn, @@ -95,9 +103,13 @@ def _run_geosupport(records, geocode_workers, geocode_record_fn): if not use_pool: return [geocode_one(record) for record in records] + suppress_geosupport_logging() worker_count = min(geocode_workers, multiprocessing.cpu_count()) - with multiprocessing.Pool(processes=worker_count) as pool: - return pool.map(geocode_one, records, 10000) + with multiprocessing.Pool( + processes=worker_count, + initializer=_init_geosupport_worker, + ) as pool: + return pool.map(_geosupport_worker, records, 10000) def _run_census_batch(still_missing, census_batch_chunk_size, pub_dir, geocode_using_census_batch_fn): @@ -129,12 +141,14 @@ def geocode_candidate_records( if not records: return [] + print(f'Geocoding {len(records)} addresses using Geosupport') geosupport_results = _run_geosupport(records, geocode_workers, geocode_record_fn) still_missing = [row for row in geosupport_results if not _has_lat(row.get('lat'))] if not still_missing: return geosupport_results + print(f'Geocoding {len(still_missing)} addresses using Census batch') census_chunks = _run_census_batch( still_missing, census_batch_chunk_size, @@ -156,6 +170,8 @@ def upsert_geocoded_addresses(db, rows): if not rows: return 0 + # Large backfills can exceed default RDS statement_timeout on staging insert + merge. 
+ db.set_statement_timeout() db.execute_sql_file('create_geocode_staging_table.sql') db.insert_rows(_prepare_rows_for_db(rows), 'oca_addresses_geocode_staging') db.execute_sql_file('upsert_geocoded_addresses.sql') diff --git a/lib/etl_stages.py b/lib/etl_stages.py index d3d44a7..83c4bb6 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -226,6 +226,7 @@ def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, select imported_staging_tables = staging_tables_with_rows(pub_dir) staging_tables = [t + '_staging' for t in OCA_TABLES] manifest.upsert_step('promote_staging', 'running') + db.set_statement_timeout() ensure_core_tables_exist(db, expected_schema) db.execute_sql_file('create_tables_staging.sql') for t in staging_tables: @@ -280,13 +281,13 @@ def geocode_and_publish_addresses( ): manifest.upsert_step('geocode_refresh', 'running') candidates = fetch_addresses_needing_geocode(db) - print(f'Geocoding {len(candidates)} addresses missing lat/lon') geocoded_rows = geocode_candidate_records( candidates, geocode_workers, census_batch_chunk_size, pub_dir, ) + print(f'Upserting {len(geocoded_rows)} geocoded addresses') upsert_geocoded_addresses(db, geocoded_rows) publish_addresses = should_publish_address_exports( @@ -294,6 +295,7 @@ def geocode_and_publish_addresses( ) published_keys = list(published_core_keys or []) if publish_addresses: + print('Publishing address CSV/views') csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") db.export_csv('oca_addresses', csv_filepath) db.execute_sql_file('create_addresses_views.sql') diff --git a/lib/geocode_record.py b/lib/geocode_record.py index 7896dfb..58d2c4a 100644 --- a/lib/geocode_record.py +++ b/lib/geocode_record.py @@ -1,3 +1,4 @@ +import logging import os import pandas as pd from pandas.util import hash_pandas_object @@ -8,8 +9,18 @@ from .placename_to_borocode import placename_to_borocode + +def suppress_geosupport_logging(): + """Geosupport logs expected geocode failures at ERROR; 
row status captures outcomes instead.""" + gs_logger = logging.getLogger('geosupport.geosupport') + gs_logger.handlers.clear() + gs_logger.setLevel(logging.CRITICAL) + gs_logger.propagate = False + + # initialize geosupport g = Geosupport() +suppress_geosupport_logging() def parse_address(addr): """parses full address string and returns dict of address components needed for geocoding diff --git a/lib/parsers.py b/lib/parsers.py index 0234a9c..52b0f5c 100644 --- a/lib/parsers.py +++ b/lib/parsers.py @@ -618,5 +618,5 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): flush_write_buffer(thread_db) thread_db.close() - print(f"Processed {total_cases} cases with {num_threads} threads") + print(f"\nProcessed {total_cases} cases with {num_threads} threads") diff --git a/lib/sql/select_addresses_needing_geocode.sql b/lib/sql/select_addresses_needing_geocode.sql index b926691..534966a 100644 --- a/lib/sql/select_addresses_needing_geocode.sql +++ b/lib/sql/select_addresses_needing_geocode.sql @@ -27,5 +27,5 @@ SELECT lon, zip_code FROM oca_addresses -WHERE lat IS NULL - AND COALESCE(house_number, '') <> ''; +WHERE lat IS NULL; + -- AND COALESCE(house_number, '') <> ''; diff --git a/tests/test_database_connection.py b/tests/test_database_connection.py index 55ce5c7..eadd4d9 100644 --- a/tests/test_database_connection.py +++ b/tests/test_database_connection.py @@ -1,7 +1,8 @@ import unittest from unittest import mock -from psycopg2 import OperationalError +import psycopg2 +from psycopg2 import InterfaceError, OperationalError from lib.database import Database, _connect_params @@ -29,6 +30,7 @@ class EnsureConnectionTests(unittest.TestCase): @staticmethod def _mock_connection(): conn = mock.Mock() + conn.closed = 0 cursor = mock.MagicMock() cursor.__enter__.return_value = cursor cursor.__exit__.return_value = False @@ -96,6 +98,7 @@ def test_sql_uses_reconnected_connection(self, connect_mock): dead_cursor = dead_conn.cursor.return_value.__enter__.return_value 
dead_cursor.execute.side_effect = [ OperationalError('EOF detected'), + OperationalError('EOF detected'), ] live_conn = self._mock_connection() @@ -110,6 +113,42 @@ def test_sql_uses_reconnected_connection(self, connect_mock): live_conn.commit.assert_called() + @mock.patch('lib.database.psycopg2.connect') + def test_ensure_connection_recovers_from_aborted_transaction(self, connect_mock): + conn = self._mock_connection() + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + + cursor = conn.cursor.return_value.__enter__.return_value + cursor.execute.side_effect = [ + psycopg2.errors.InFailedSqlTransaction('current transaction is aborted'), + None, + ] + + reconnected = db.ensure_connection() + + self.assertFalse(reconnected) + conn.rollback.assert_called_once() + self.assertEqual(cursor.execute.call_count, 2) + + @mock.patch('lib.database.psycopg2.connect') + def test_sql_reraises_query_error_when_rollback_fails_on_closed_connection(self, connect_mock): + conn = self._mock_connection() + conn.closed = 0 + connect_mock.return_value = conn + db = Database(db_url='postgres://example') + + cursor = conn.cursor.return_value.__enter__.return_value + query_error = OperationalError('server closed the connection unexpectedly') + cursor.execute.side_effect = query_error + conn.rollback.side_effect = InterfaceError('connection already closed') + + with self.assertRaises(OperationalError) as ctx: + db.sql('SELECT 1') + + self.assertIs(ctx.exception, query_error) + self.assertIsNone(db.conn) + if __name__ == '__main__': unittest.main() diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py index 7682304..cfee782 100644 --- a/tests/test_incremental_geocode.py +++ b/tests/test_incremental_geocode.py @@ -227,6 +227,7 @@ def test_upsert_writes_staging_and_merges(self): count = upsert_geocoded_addresses(fake_db, rows) self.assertEqual(count, 1) + fake_db.set_statement_timeout.assert_called_once() 
fake_db.execute_sql_file.assert_any_call('create_geocode_staging_table.sql') fake_db.insert_rows.assert_called_once() fake_db.execute_sql_file.assert_any_call('upsert_geocoded_addresses.sql') From 0674af21d2875ca67128ae569ef7163812864353 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 09:36:50 -0400 Subject: [PATCH 18/30] adjust geocode candidate selection --- lib/etl_geocode.py | 1 - lib/sql/select_addresses_needing_geocode.sql | 1 - 2 files changed, 2 deletions(-) diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py index cec37e5..92ce672 100644 --- a/lib/etl_geocode.py +++ b/lib/etl_geocode.py @@ -56,7 +56,6 @@ def address_row_key(row): def row_needs_geocode(row): """Mirror select_addresses_needing_geocode.sql for unit tests.""" - # return not _has_lat(row.get('lat')) and str(row.get('house_number') or '').strip() != '' return not _has_lat(row.get('lat')) diff --git a/lib/sql/select_addresses_needing_geocode.sql b/lib/sql/select_addresses_needing_geocode.sql index 534966a..67cc97b 100644 --- a/lib/sql/select_addresses_needing_geocode.sql +++ b/lib/sql/select_addresses_needing_geocode.sql @@ -28,4 +28,3 @@ SELECT zip_code FROM oca_addresses WHERE lat IS NULL; - -- AND COALESCE(house_number, '') <> ''; From ae51651c8f12c10857b7c6442b42019fccb3d5aa Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 09:52:28 -0400 Subject: [PATCH 19/30] reorder steps for better grouping (publish all files together) and avoid duplicate address export --- README.md | 2 +- lib/README.md | 25 +++-- lib/etl.py | 21 +++-- lib/etl_helpers.py | 24 +++-- lib/etl_publish.py | 35 +++---- lib/etl_stages.py | 86 +++++++++-------- lib/s3.py | 2 +- tests/test_create_date_files.py | 31 ++++++ tests/test_etl_publish.py | 150 ++++++++++++++++++++++-------- tests/test_incremental_geocode.py | 46 ++------- 10 files changed, 266 insertions(+), 156 deletions(-) create mode 100644 tests/test_create_date_files.py diff --git a/README.md b/README.md index 
4314b09..2fef197 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ This work is licensed under a [Creative Commons Attribution-NonCommercial-ShareA ## CSV Files -[![Date Last Updated](https://oca-2-dev.s3.amazonaws.com/public/last-updated-shield.png)](https://oca-2-dev.s3.amazonaws.com/public/last-updated-date.txt) +[![Date Last Updated](https://oca-2-dev.s3.amazonaws.com/public/last-updated-shield.svg)](https://oca-2-dev.s3.amazonaws.com/public/last-updated-date.txt) * [`oca_index`](https://oca-2-dev.s3.amazonaws.com/public/oca_index.csv) * [`oca_causes`](https://oca-2-dev.s3.amazonaws.com/public/oca_causes.csv) diff --git a/lib/README.md b/lib/README.md index d70e9e2..3286157 100644 --- a/lib/README.md +++ b/lib/README.md @@ -18,9 +18,11 @@ flowchart TD s3upload --> import[RDS import staging tables] import --> normalize[SQL normalize + appearance outcomes] normalize --> promote[Atomic promote staging → main] - promote --> publish[Export core tables to S3 public/] - publish --> geocode[Incremental geocode delta] - geocode --> addrpub[Publish addresses + views] + promote --> geocode[Incremental geocode + upsert] + geocode --> views[create_addresses_views.sql] + views --> publish[Publish all public CSVs + date files] + publish --> enc[SSE normalize except private address CSV] + enc --> priv[Upload private XML zips] ``` Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_stages.py) for stage implementations. @@ -34,8 +36,10 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ | Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. | | Export + preprocess | `duckdb_database.py`, `staging_csv_export.py`, `etl_csv.py` | DuckDB `COPY` with Postgres-compatible transforms; minimal second-pass CSV rewrite. 
| | Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, then single-transaction promotion. | -| Publish core | `etl_publish.py`, `etl_stages.publish_core_tables` | Export `OCA_TABLES` to S3 public prefix; SSE-S3 normalization on objects written this run. | -| Geocode + publish addresses | `etl_geocode.py`, `geocode_record.py` | Delta-select rows missing lat/lon; Geosupport + Census batch; upsert by natural address key; skip address export when unchanged. | +| Geocode | `etl_geocode.py`, `etl_stages.geocode_addresses` | Delta-select rows missing lat/lon; Geosupport + Census batch; upsert by natural address key. | +| Publish public | `etl_stages.publish_public_artifacts` | Rebuild address views; export all `OCA_TABLES` and address views; upload date badge files. | +| Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. | +| Upload private | `etl_stages.upload_private_source_files` | Back up raw XML zips to S3 `private/`. | ## Key modules @@ -57,10 +61,10 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ - [`etl.py`](etl.py) — run orchestrator, manifest lifecycle, stage sequencing. - [`etl_constants.py`](etl_constants.py) — table list, zip patterns, S3 folder constants. - [`etl_run_manifest.py`](etl_run_manifest.py) — `etl_runs` / `etl_files` / `etl_steps` bookkeeping. -- [`etl_helpers.py`](etl_helpers.py) — paths, CSV row checks, PLUTO download, date badge files. +- [`etl_helpers.py`](etl_helpers.py) — paths, CSV row checks, PLUTO download, local SVG date badge files. - [`etl_csv.py`](etl_csv.py) — streaming CSV normalization for tables not handled at export time. - [`etl_promotion.py`](etl_promotion.py) — atomic `promote_staging_to_main()`; count/checksum hooks for validation. 
-- [`etl_publish.py`](etl_publish.py) — targeted S3 publish and address-export skip logic. +- [`etl_publish.py`](etl_publish.py) — S3 export helpers and encryption key filtering. - [`etl_geocode.py`](etl_geocode.py) — incremental geocode candidate fetch, chunked geocoding, natural-key upsert. ### Geocoding @@ -82,19 +86,20 @@ Scripts run against the active session schema (`DB_SCHEMA` / `search_path`). | `ensure_promotion_indexes.sql` | Indexes for promotion and address natural keys | | `select_addresses_needing_geocode.sql` | Delta rows for geocoding | | `create_geocode_staging_table.sql`, `upsert_geocoded_addresses.sql` | Geocode staging merge | -| `create_addresses_views.sql` | PostGIS views after geocode | +| `create_addresses_views.sql` | PostGIS views after geocode (before S3 export) | | `create_etl_manifest_tables.sql` | Run manifest DDL | Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. ## Idempotency and run control -- **Manifest** — each run records status in `etl_runs`, per-file progress in `etl_files`, and stage checkpoints in `etl_steps`. -- **Connection resilience** — TCP keepalives and `ensure_connection()` reconnect before RDS-heavy stages after long parse/upload/geocode gaps. +- **Manifest** — each run records status in `etl_runs`, per-file progress in `etl_files`, and stage checkpoints in `etl_steps` (including `geocode_refresh`, `publish_public`, `normalize_s3_encryption`, `upload_private`). +- **Connection resilience** — TCP keepalives and `ensure_connection()` before geocode and before publish. - **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips completed files unless `FORCE_REPROCESS=true`. - **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. - **Promotion** — scoped delete + insert / upsert in one transaction; safe to retry after import failure. 
- **Geocode** — only rows with `lat IS NULL` and a house number; upsert matches on address line columns, not `indexnumberid` alone. +- **Publish** — every successful run exports the full public snapshot (all core tables and address views). ## Output tables diff --git a/lib/etl.py b/lib/etl.py index dcd1c12..4b26d93 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -32,12 +32,14 @@ from .etl_stages import ( FileSelection, download_selected_files, - geocode_and_publish_addresses, + geocode_addresses, import_and_promote_staging, + normalize_public_s3_encryption, parse_xml_to_staging, preprocess_and_upload_staging_csvs, - publish_core_tables, + publish_public_artifacts, select_input_files, + upload_private_source_files, ) from .s3 import S3 from .sftp import Sftp @@ -117,7 +119,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None csv_preprocess_chunk_size=csv_row_check_chunk_size, ) db.ensure_connection() - staging_tables_with_data = import_and_promote_staging( + import_and_promote_staging( manifest, db, pub_dir, @@ -126,13 +128,16 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None selection, runtime_args.get('db_schema') or db_args.get('schema') or 'public', ) - published_core_keys = publish_core_tables(manifest, db, s3_args, s3_prefix) db.ensure_connection() - geocode_and_publish_addresses( - manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, - geocode_workers, census_batch_chunk_size, - staging_tables_with_data, published_core_keys, + geocode_addresses( + manifest, db, pub_dir, geocode_workers, census_batch_chunk_size, ) + db.ensure_connection() + published_keys = publish_public_artifacts( + manifest, db, s3_args, s3_prefix, mode, selection, pub_dir, + ) + normalize_public_s3_encryption(manifest, s3, published_keys) + upload_private_source_files(manifest, s3, priv_dir, s3_prefix) manifest.mark_run_completed( len(selection.selected_zip_files), diff --git a/lib/etl_helpers.py 
b/lib/etl_helpers.py index 9a31266..61fc485 100644 --- a/lib/etl_helpers.py +++ b/lib/etl_helpers.py @@ -45,10 +45,20 @@ def csv_has_rows(csv_filepath): return False +def _last_updated_badge_svg(date): + label = f'Last Updated: {date}' + return f''' + + + {label} + +''' + + def create_date_files(data_file, local_dir): """ - Create a text file and a custom shield image with date the data was - last updated. + Create a text file and a local SVG badge with the data last-updated date. :param data_file: file path for data being processed :param local_dir: path for local directory to save date files @@ -56,12 +66,12 @@ def create_date_files(data_file, local_dir): date = re.search(r'(\d{4}-\d{2}-\d{2})', data_file).group(1) txt_file = os.path.join(local_dir, 'last-updated-date.txt') - open(txt_file, 'w').write(date) + with open(txt_file, 'w', encoding='utf-8') as handle: + handle.write(date) - url = f"https://raster.shields.io/badge/Last%20Updated-{date.replace('-', '--')}-yellow" - r = requests.get(url) - img_file = os.path.join(local_dir, 'last-updated-shield.png') - open(img_file, 'wb').write(r.content) + svg_file = os.path.join(local_dir, 'last-updated-shield.svg') + with open(svg_file, 'w', encoding='utf-8') as handle: + handle.write(_last_updated_badge_svg(date)) def download_pluto(output_dir): diff --git a/lib/etl_publish.py b/lib/etl_publish.py index e806c61..33b251f 100644 --- a/lib/etl_publish.py +++ b/lib/etl_publish.py @@ -1,10 +1,12 @@ -"""S3 publish helpers: selective exports and targeted post-publish encryption.""" +"""S3 publish helpers: exports and targeted post-publish encryption.""" import os from .etl_constants import OCA_TABLES, S3_PUBLIC_FOLDER from .etl_helpers import csv_has_rows, s3_key +PRIVATE_ADDRESS_CSV = 'oca_addresses_private.csv' + def staging_tables_with_rows(pub_dir): """Main table names whose staging CSV had at least one data row this run.""" @@ -16,20 +18,6 @@ def staging_tables_with_rows(pub_dir): return set(tables) -def 
should_publish_address_exports(staging_tables_with_data, geocode_candidate_count): - """ - Whether address CSVs and derived views need S3 export this run. - - Skips full-table address exports when incremental geocode has no work and - promotion did not load new address staging rows. Core table exports are not - skipped when oca_index_staging has rows: promotion deletes child rows for the - batch even when a child staging CSV was empty. - """ - if geocode_candidate_count > 0: - return True - return 'oca_addresses' in staging_tables_with_data - - def export_table_to_s3(db, table, s3_filename, s3_args, s3_prefix): """Export one main table via aws_s3.query_export_to_s3; return the object key.""" object_key = s3_key(f"{S3_PUBLIC_FOLDER}/{s3_filename}", s3_prefix) @@ -53,9 +41,22 @@ def export_table_to_s3(db, table, s3_filename, s3_args, s3_prefix): ) -def normalize_published_s3_encryption(s3, s3_prefix, object_keys): +def published_keys_for_encryption(object_keys): + """Object keys to re-encrypt with SSE-S3; excludes the private address CSV.""" + keys = [] + for key in object_keys: + if not key: + continue + normalized = key.rstrip('/') + if normalized.endswith(PRIVATE_ADDRESS_CSV): + continue + keys.append(key) + return sorted(set(keys)) + + +def normalize_published_s3_encryption(s3, object_keys): """Re-encrypt only objects written during this publish pass (SSE-S3).""" - keys = sorted({k for k in object_keys if k}) + keys = published_keys_for_encryption(object_keys) if not keys: return print(f'Updating server-side encryption for {len(keys)} published S3 object(s)') diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 83c4bb6..44bd298 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -24,7 +24,6 @@ ADDRESS_VIEW_EXPORTS, export_table_to_s3, normalize_published_s3_encryption, - should_publish_address_exports, staging_tables_with_rows, ) from .etl_geocode import ( @@ -254,14 +253,13 @@ def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, 
select return imported_staging_tables -def publish_core_tables(manifest, db, s3_args, s3_prefix): +def publish_core_tables(db, s3_args, s3_prefix): """ - Export all core tables after promotion. + Export all core tables via aws_s3.query_export_to_s3. When oca_index_staging has rows, promotion deletes child rows for the batch even if a child staging CSV was empty, so per-table skip is unsafe. """ - manifest.upsert_step('publish_tables', 'running') published_keys = [] for t in OCA_TABLES: s3_filename = t + '.csv' @@ -270,15 +268,11 @@ def publish_core_tables(manifest, db, s3_args, s3_prefix): published_keys.append( export_table_to_s3(db, t, s3_filename, s3_args, s3_prefix) ) - manifest.upsert_step('publish_tables', 'completed') return published_keys -def geocode_and_publish_addresses( - manifest, db, s3, priv_dir, pub_dir, s3_args, s3_prefix, mode, selection, - geocode_workers, census_batch_chunk_size, staging_tables_with_data, - published_core_keys -): +def geocode_addresses(manifest, db, pub_dir, geocode_workers, census_batch_chunk_size): + """Incremental geocode for addresses missing lat/lon; upsert into RDS.""" manifest.upsert_step('geocode_refresh', 'running') candidates = fetch_addresses_needing_geocode(db) geocoded_rows = geocode_candidate_records( @@ -289,42 +283,58 @@ def geocode_and_publish_addresses( ) print(f'Upserting {len(geocoded_rows)} geocoded addresses') upsert_geocoded_addresses(db, geocoded_rows) - - publish_addresses = should_publish_address_exports( - staging_tables_with_data, len(candidates) + manifest.upsert_step( + 'geocode_refresh', + 'completed', + details={ + 'geocode_candidate_count': len(candidates), + 'geocoded_row_count': len(geocoded_rows), + }, ) - published_keys = list(published_core_keys or []) - if publish_addresses: - print('Publishing address CSV/views') - csv_filepath = os.path.join(pub_dir, "oca_addresses_private.csv") - db.export_csv('oca_addresses', csv_filepath) - db.execute_sql_file('create_addresses_views.sql') - for 
view_name, s3_filename in ADDRESS_VIEW_EXPORTS: - published_keys.append( - export_table_to_s3(db, view_name, s3_filename, s3_args, s3_prefix) - ) + return len(candidates) + + +def publish_public_artifacts(manifest, db, s3_args, s3_prefix, mode, selection, pub_dir): + """Rebuild address views and export all public CSVs and date artifacts to S3.""" + manifest.upsert_step('publish_public', 'running') + print('Publishing address views and all public CSVs') + db.execute_sql_file('create_addresses_views.sql') + published_keys = publish_core_tables(db, s3_args, s3_prefix) + for view_name, s3_filename in ADDRESS_VIEW_EXPORTS: published_keys.append( - s3_key(f"{S3_PUBLIC_FOLDER}/oca_addresses_private.csv", s3_prefix) - ) - else: - print( - 'Skipping address CSV/view publish: no geocode candidates and ' - 'no oca_addresses_staging rows this run' + export_table_to_s3(db, view_name, s3_filename, s3_args, s3_prefix) ) create_date_files(selection.selected_zip_files[-1], pub_dir) - public_files = ['last-updated-shield.png', 'last-updated-date.txt'] - if publish_addresses: - public_files.append('oca_addresses_private.csv') + date_files = ['last-updated-shield.svg', 'last-updated-date.txt'] with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: - files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + files_zip = zip(date_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) - for date_file in public_files: + for date_file in date_files: published_keys.append(s3_key(f"{S3_PUBLIC_FOLDER}/{date_file}", s3_prefix)) + manifest.upsert_step( + 'publish_public', + 'completed', + details={'published_object_count': len(published_keys)}, + ) + return published_keys + + +def normalize_public_s3_encryption(manifest, s3, published_keys): + """SSE-S3 normalization for published public objects (except private address CSV).""" + 
manifest.upsert_step('normalize_s3_encryption', 'running') + normalize_published_s3_encryption(s3, published_keys) + manifest.upsert_step('normalize_s3_encryption', 'completed') + + +def upload_private_source_files(manifest, s3, priv_dir, s3_prefix): + """Upload raw XML zip backups to the S3 private folder.""" + manifest.upsert_step('upload_private', 'running') for f in os.listdir(priv_dir): if f != '.DS_Store': - s3.upload_file(s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), os.path.join(priv_dir, f)) - - normalize_published_s3_encryption(s3, s3_prefix, published_keys) - manifest.upsert_step('geocode_refresh', 'completed') + s3.upload_file( + s3_key(f"{S3_PRIVATE_FOLDER}/{f}", s3_prefix), + os.path.join(priv_dir, f), + ) + manifest.upsert_step('upload_private', 'completed') diff --git a/lib/s3.py b/lib/s3.py index 6d578e0..9a22b01 100644 --- a/lib/s3.py +++ b/lib/s3.py @@ -139,7 +139,7 @@ def upload_file(self, object_name, file_path): }[ext] # date-updated image needs to have no-cache to be used in github readme - cache_control = 'no-cache' if content_type == 'image/png' else '' + cache_control = 'no-cache' if content_type in ('image/png', 'image/svg+xml') else '' # Put the object into the bucket put_object(self.s3, self.bucket_name, object_name, file_path, content_type, cache_control) diff --git a/tests/test_create_date_files.py b/tests/test_create_date_files.py new file mode 100644 index 0000000..ee27b78 --- /dev/null +++ b/tests/test_create_date_files.py @@ -0,0 +1,31 @@ +import os +import tempfile +import unittest +from unittest import mock + +from lib.etl_helpers import create_date_files + + +class CreateDateFilesTests(unittest.TestCase): + def test_writes_txt_and_svg_without_network(self): + with tempfile.TemporaryDirectory() as local_dir: + with mock.patch('lib.etl_helpers.requests.get') as get_mock: + create_date_files('LandlordTenant.Incr.2024-03-08.zip', local_dir) + get_mock.assert_not_called() + + txt_path = os.path.join(local_dir, 
'last-updated-date.txt') + svg_path = os.path.join(local_dir, 'last-updated-shield.svg') + self.assertTrue(os.path.isfile(txt_path)) + self.assertTrue(os.path.isfile(svg_path)) + + with open(txt_path, encoding='utf-8') as handle: + self.assertEqual(handle.read(), '2024-03-08') + + with open(svg_path, encoding='utf-8') as handle: + svg = handle.read() + self.assertIn('Last Updated: 2024-03-08', svg) + self.assertIn(' Date: Fri, 29 May 2026 12:08:14 -0400 Subject: [PATCH 20/30] Geom SQL (schema, promotion, views, upsert) --- lib/sql/create_addresses_views.sql | 17 ----------------- lib/sql/create_tables.sql | 14 +++++++++++++- lib/sql/promote_staging_to_main.sql | 18 +++++++++++++++++- lib/sql/upsert_geocoded_addresses.sql | 3 ++- tests/test_etl_publish.py | 1 - tests/test_incremental_geocode.py | 1 + 6 files changed, 33 insertions(+), 21 deletions(-) diff --git a/lib/sql/create_addresses_views.sql b/lib/sql/create_addresses_views.sql index 4fb6070..ee42849 100644 --- a/lib/sql/create_addresses_views.sql +++ b/lib/sql/create_addresses_views.sql @@ -6,12 +6,6 @@ DROP VIEW IF EXISTS public.oca_addresses_with_bbl; DROP VIEW IF EXISTS public.oca_addresses_with_ct; DROP VIEW IF EXISTS public.oca_addresses_public; --- Drop geom column if it exists -ALTER TABLE oca_addresses -DROP COLUMN IF EXISTS geom; -DROP INDEX IF EXISTS oca_addresses_geom_idx; - --- Recreate views CREATE OR REPLACE VIEW public.oca_addresses_with_bbl AS SELECT indexnumberid, @@ -40,17 +34,6 @@ CREATE OR REPLACE VIEW public.oca_addresses_with_bbl AS FROM oca_addresses o LEFT JOIN pluto p ON LEFT(p.bbl, 10) = o.bbl; --- update oca_addresses with geom field -ALTER TABLE oca_addresses - ADD COLUMN geom Geometry(Point, 4326); - -UPDATE oca_addresses - SET geom = ST_SetSRID(ST_Point(lon, lat),4326); - -CREATE INDEX oca_addresses_geom_idx - ON oca_addresses - USING GIST (geom); - CREATE OR REPLACE VIEW public.oca_addresses_with_ct AS SELECT o.indexnumberid, t.geoid, diff --git a/lib/sql/create_tables.sql 
b/lib/sql/create_tables.sql index e31e294..6e81305 100644 --- a/lib/sql/create_tables.sql +++ b/lib/sql/create_tables.sql @@ -46,7 +46,8 @@ CREATE TABLE IF NOT EXISTS oca_addresses ( msg text, msg2 text, lon real, - zip_code text + zip_code text, + geom Geometry(Point, 4326) ); CREATE TABLE IF NOT EXISTS oca_parties ( @@ -158,6 +159,17 @@ CREATE TABLE IF NOT EXISTS oca_metadata ( CREATE INDEX IF NOT EXISTS oca_causes_indexnumberid_idx ON oca_causes (indexnumberid); CREATE INDEX IF NOT EXISTS oca_addresses_indexnumberid_idx ON oca_addresses (indexnumberid); CREATE INDEX IF NOT EXISTS oca_addresses_bbl_idx ON oca_addresses (bbl); +CREATE INDEX IF NOT EXISTS oca_addresses_geom_idx ON oca_addresses USING GIST (geom); + +-- Existing deployments: add geom column and index if missing. +ALTER TABLE oca_addresses ADD COLUMN IF NOT EXISTS geom Geometry(Point, 4326); +CREATE INDEX IF NOT EXISTS oca_addresses_geom_idx ON oca_addresses USING GIST (geom); + +UPDATE oca_addresses +SET geom = ST_SetSRID(ST_Point(lon, lat), 4326) +WHERE geom IS NULL + AND lat IS NOT NULL + AND lon IS NOT NULL; CREATE INDEX IF NOT EXISTS oca_parties_indexnumberid_idx ON oca_parties (indexnumberid); CREATE INDEX IF NOT EXISTS oca_events_indexnumberid_idx ON oca_events (indexnumberid); CREATE INDEX IF NOT EXISTS oca_appearances_indexnumberid_idx ON oca_appearances (indexnumberid); diff --git a/lib/sql/promote_staging_to_main.sql b/lib/sql/promote_staging_to_main.sql index 5f1c435..11ed867 100644 --- a/lib/sql/promote_staging_to_main.sql +++ b/lib/sql/promote_staging_to_main.sql @@ -79,7 +79,23 @@ ON CONFLICT (indexnumberid) DO UPDATE SET dateofjurydemand = EXCLUDED.dateofjurydemand; INSERT INTO oca_causes SELECT * FROM oca_causes_staging; -INSERT INTO oca_addresses SELECT * FROM oca_addresses_staging; +INSERT INTO oca_addresses ( + indexnumberid, street1, street2, city, state, postalcode, status, + house_number, street_name, borough_code, place_name, sname, hnum, boro, + lat, bin, bbl, cd, ct, 
council, grc, grc2, msg, msg2, lon, zip_code +) +SELECT + indexnumberid, street1, street2, city, state, postalcode, status, + house_number, street_name, borough_code, place_name, sname, hnum, boro, + lat, bin, bbl, cd, ct, council, grc, grc2, msg, msg2, lon, zip_code +FROM oca_addresses_staging; + +UPDATE oca_addresses AS o +SET geom = ST_SetSRID(ST_Point(o.lon, o.lat), 4326) +WHERE o.indexnumberid IN (SELECT indexnumberid FROM oca_index_staging) + AND o.lat IS NOT NULL + AND o.lon IS NOT NULL; + INSERT INTO oca_parties SELECT * FROM oca_parties_staging; INSERT INTO oca_events SELECT * FROM oca_events_staging; INSERT INTO oca_appearances SELECT * FROM oca_appearances_staging; diff --git a/lib/sql/upsert_geocoded_addresses.sql b/lib/sql/upsert_geocoded_addresses.sql index 3deeca3..63bf010 100644 --- a/lib/sql/upsert_geocoded_addresses.sql +++ b/lib/sql/upsert_geocoded_addresses.sql @@ -24,7 +24,8 @@ SET msg = s.msg, msg2 = s.msg2, lon = s.lon, - zip_code = s.zip_code + zip_code = s.zip_code, + geom = ST_SetSRID(ST_Point(s.lon, s.lat), 4326) FROM oca_addresses_geocode_staging AS s WHERE o.indexnumberid IS NOT DISTINCT FROM s.indexnumberid AND o.street1 IS NOT DISTINCT FROM s.street1 diff --git a/tests/test_etl_publish.py b/tests/test_etl_publish.py index 8dd5ff3..03138c7 100644 --- a/tests/test_etl_publish.py +++ b/tests/test_etl_publish.py @@ -154,7 +154,6 @@ def test_delegates_with_filtered_keys(self): encrypt_mock.assert_called_once_with( fake_s3, - 'refactor/', published, ) diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py index 3eef87c..0efd11f 100644 --- a/tests/test_incremental_geocode.py +++ b/tests/test_incremental_geocode.py @@ -217,6 +217,7 @@ def test_upsert_sql_matches_on_natural_address_key(self): r'WHERE\s+o\.indexnumberid\s*=\s*s\.indexnumberid\s*;', 'upsert must not join on indexnumberid alone', ) + self.assertIn('geom = ST_SetSRID(ST_Point(s.lon, s.lat), 4326)', sql) class 
UpsertGeocodedAddressesTests(unittest.TestCase): From 1a30fff275b2a8d48f7be4558c454f8d015a30f2 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 12:29:49 -0400 Subject: [PATCH 21/30] CSV geocode + weekly pipeline rewire --- lib/etl.py | 17 ++-- lib/etl_geocode.py | 74 ++++++++++++++++ lib/etl_stages.py | 47 ++++++++-- tests/test_etl_publish.py | 87 +++++++++++++++++++ tests/test_incremental_geocode.py | 140 +++++++++++++++++++++++++++++- 5 files changed, 349 insertions(+), 16 deletions(-) diff --git a/lib/etl.py b/lib/etl.py index 4b26d93..fa5634a 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -32,13 +32,14 @@ from .etl_stages import ( FileSelection, download_selected_files, - geocode_addresses, + export_staging_csvs, + geocode_staging_csvs, import_and_promote_staging, normalize_public_s3_encryption, parse_xml_to_staging, - preprocess_and_upload_staging_csvs, publish_public_artifacts, select_input_files, + upload_staging_csvs, upload_private_source_files, ) from .s3 import S3 @@ -114,10 +115,14 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection) parse_xml_to_staging(manifest, staging_db, priv_dir) - preprocess_and_upload_staging_csvs( - staging_db, pub_dir, mode, s3_args, s3_prefix, + export_staging_csvs( + manifest, staging_db, pub_dir, csv_preprocess_chunk_size=csv_row_check_chunk_size, ) + geocode_staging_csvs( + manifest, pub_dir, geocode_workers, census_batch_chunk_size, + ) + upload_staging_csvs(manifest, pub_dir, mode, s3_args, s3_prefix) db.ensure_connection() import_and_promote_staging( manifest, @@ -129,10 +134,6 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None runtime_args.get('db_schema') or db_args.get('schema') or 'public', ) db.ensure_connection() - geocode_addresses( - manifest, db, pub_dir, geocode_workers, census_batch_chunk_size, - ) - db.ensure_connection() published_keys = 
publish_public_artifacts( manifest, db, s3_args, s3_prefix, mode, selection, pub_dir, ) diff --git a/lib/etl_geocode.py b/lib/etl_geocode.py index 92ce672..12317b2 100644 --- a/lib/etl_geocode.py +++ b/lib/etl_geocode.py @@ -1,5 +1,8 @@ +import csv import functools import multiprocessing +import os +import shutil from itertools import repeat import numpy as np @@ -20,6 +23,9 @@ GEOCODE_EXPORT_COLUMNS = GEOCODE_ADDRESS_COLUMNS +STAGING_ADDRESSES_CSV = 'oca_addresses_staging.csv' +GEOCODED_STAGING_ADDRESSES_CSV = 'oca_addresses_staging_geocoded.csv' + def _stringify_row_values(row): normalized = {} @@ -175,3 +181,71 @@ def upsert_geocoded_addresses(db, rows): db.insert_rows(_prepare_rows_for_db(rows), 'oca_addresses_geocode_staging') db.execute_sql_file('upsert_geocoded_addresses.sql') return len(rows) + + +def read_staging_addresses_csv(pub_dir): + """Read ``oca_addresses_staging.csv`` rows as string-normalized dicts.""" + path = os.path.join(pub_dir, STAGING_ADDRESSES_CSV) + if not os.path.exists(path): + return [], [] + + with open(path, 'r', encoding='utf-8', newline='') as handle: + reader = csv.DictReader(handle) + fieldnames = list(reader.fieldnames or []) + rows = [_stringify_row_values(row) for row in reader] + return rows, fieldnames + + +def _merge_geocoded_row(original, geocoded): + merged = dict(original) + for col in GEOCODE_ADDRESS_COLUMNS: + if col in geocoded: + merged[col] = geocoded[col] + return merged + + +def write_geocoded_staging_csv(pub_dir, rows, fieldnames, dest_filename=GEOCODED_STAGING_ADDRESSES_CSV): + """Write geocoded address rows to a staging CSV (default: intermediate geocoded file).""" + path = os.path.join(pub_dir, dest_filename) + if not fieldnames: + fieldnames = list(GEOCODE_ADDRESS_COLUMNS) + + with open(path, 'w', encoding='utf-8', newline='') as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction='ignore') + writer.writeheader() + for row in rows: + writer.writerow({col: row.get(col, '') for col in 
fieldnames}) + + +def geocode_staging_addresses_csv( + pub_dir, + geocode_workers, + census_batch_chunk_size, + geocode_record_fn=geocode_record, + geocode_using_census_batch_fn=geocode_using_census_batch, +): + """ + Geocode every row in ``oca_addresses_staging.csv``, write + ``oca_addresses_staging_geocoded.csv``, then overwrite the staging file. + """ + rows, fieldnames = read_staging_addresses_csv(pub_dir) + if not rows: + return 0 + + geocoded_rows = geocode_candidate_records( + rows, + geocode_workers, + census_batch_chunk_size, + pub_dir, + geocode_record_fn=geocode_record_fn, + geocode_using_census_batch_fn=geocode_using_census_batch_fn, + ) + merged_rows = [ + _merge_geocoded_row(original, geocoded) + for original, geocoded in zip(rows, geocoded_rows) + ] + write_geocoded_staging_csv(pub_dir, merged_rows, fieldnames) + staging_path = os.path.join(pub_dir, STAGING_ADDRESSES_CSV) + geocoded_path = os.path.join(pub_dir, GEOCODED_STAGING_ADDRESSES_CSV) + shutil.copy2(geocoded_path, staging_path) + return len(merged_rows) diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 44bd298..45dfad1 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -27,8 +27,10 @@ staging_tables_with_rows, ) from .etl_geocode import ( + GEOCODED_STAGING_ADDRESSES_CSV, fetch_addresses_needing_geocode, geocode_candidate_records, + geocode_staging_addresses_csv, upsert_geocoded_addresses, ) from .parsers import oca_tag, parse_file @@ -179,17 +181,48 @@ def export_staging_to_csv( pool.starmap(upload_public_file, files_zip) -def preprocess_and_upload_staging_csvs( - staging_db, pub_dir, mode, s3_args, s3_prefix, csv_preprocess_chunk_size=1000 -): +def export_staging_csvs(manifest, staging_db, pub_dir, csv_preprocess_chunk_size=1000): + """Export DuckDB staging tables to local CSV (no S3 upload).""" + manifest.upsert_step('export_staging', 'running') export_staging_to_csv( staging_db, pub_dir, csv_preprocess_chunk_size=csv_preprocess_chunk_size, - upload=True, - mode=mode, - 
s3_args=s3_args, - s3_prefix=s3_prefix, + upload=False, + ) + manifest.upsert_step('export_staging', 'completed') + + +def geocode_staging_csvs(manifest, pub_dir, geocode_workers, census_batch_chunk_size): + """Geocode all rows in ``oca_addresses_staging.csv`` before S3 upload.""" + manifest.upsert_step('geocode_staging', 'running') + geocoded_row_count = geocode_staging_addresses_csv( + pub_dir, + geocode_workers, + census_batch_chunk_size, + ) + manifest.upsert_step( + 'geocode_staging', + 'completed', + details={'geocoded_row_count': geocoded_row_count}, + ) + return geocoded_row_count + + +def upload_staging_csvs(manifest, pub_dir, mode, s3_args, s3_prefix): + """Upload preprocessed staging CSVs to S3 ``public/``.""" + manifest.upsert_step('upload_staging', 'running') + public_files = [ + name for name in os.listdir(pub_dir) + if name.endswith('.csv') and name != GEOCODED_STAGING_ADDRESSES_CSV + ] + with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: + files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) + pool.starmap(upload_public_file, files_zip) + manifest.upsert_step( + 'upload_staging', + 'completed', + details={'uploaded_file_count': len(public_files)}, ) diff --git a/tests/test_etl_publish.py b/tests/test_etl_publish.py index 03138c7..0e5e960 100644 --- a/tests/test_etl_publish.py +++ b/tests/test_etl_publish.py @@ -9,10 +9,14 @@ published_keys_for_encryption, staging_tables_with_rows, ) +from lib.etl import oca_etl from lib.etl_stages import ( + export_staging_csvs, geocode_addresses, + geocode_staging_csvs, normalize_public_s3_encryption, publish_public_artifacts, + upload_staging_csvs, ) @@ -126,6 +130,89 @@ def track_core(*args, **kwargs): self.assertLess(core_idx, export_idx) +class StagingPipelineStageTests(unittest.TestCase): + def test_export_staging_records_manifest(self): + fake_manifest = mock.Mock() + fake_db = mock.Mock() + with 
mock.patch('lib.etl_stages.export_staging_to_csv') as export_mock: + export_staging_csvs(fake_manifest, fake_db, '/tmp/pub', csv_preprocess_chunk_size=500) + export_mock.assert_called_once_with( + fake_db, + '/tmp/pub', + csv_preprocess_chunk_size=500, + upload=False, + ) + fake_manifest.upsert_step.assert_any_call('export_staging', 'running') + fake_manifest.upsert_step.assert_any_call('export_staging', 'completed') + + def test_upload_staging_skips_geocoded_intermediate_csv(self): + fake_manifest = mock.Mock() + with tempfile.TemporaryDirectory() as pub_dir: + for name in ( + 'oca_index_staging.csv', + 'oca_addresses_staging.csv', + 'oca_addresses_staging_geocoded.csv', + ): + with open(os.path.join(pub_dir, name), 'w', encoding='utf-8') as handle: + handle.write('h\n') + uploaded_names = [] + with mock.patch('lib.etl_stages.upload_public_file', side_effect=lambda name, *a, **k: uploaded_names.append(name)), \ + mock.patch('lib.etl_stages.multiprocessing.Pool') as pool_mock: + pool_mock.return_value.__enter__.return_value.starmap.side_effect = ( + lambda fn, iterable: [fn(*args) for args in iterable] + ) + upload_staging_csvs( + fake_manifest, pub_dir, '2', + {'aws_bucket_name': 'b', 'aws_id': 'i', 'aws_key': 'k'}, + 'refactor/', + ) + uploaded_names = sorted(uploaded_names) + self.assertEqual( + uploaded_names, + ['oca_addresses_staging.csv', 'oca_index_staging.csv'], + ) + fake_manifest.upsert_step.assert_any_call('upload_staging', 'running') + fake_manifest.upsert_step.assert_any_call( + 'upload_staging', + 'completed', + details={'uploaded_file_count': 2}, + ) + + +class OcaEtlPipelineTests(unittest.TestCase): + def test_weekly_etl_uses_csv_geocode_not_post_promotion_rds_geocode(self): + selection = mock.Mock( + selected_zip_files=['file.zip'], + skipped_reprocess_files=[], + new_file_set={'file.zip'}, + reprocess_file_set=set(), + ) + with mock.patch('lib.etl.EtlRunManifest') as manifest_cls, \ + mock.patch('lib.etl.Database'), \ + 
mock.patch('lib.etl.DuckDB'), \ + mock.patch('lib.etl.Sftp'), \ + mock.patch('lib.etl.S3'), \ + mock.patch('lib.etl.make_dir', side_effect=lambda x: x), \ + mock.patch('lib.etl.select_input_files', return_value=selection), \ + mock.patch('lib.etl.download_selected_files'), \ + mock.patch('lib.etl.parse_xml_to_staging'), \ + mock.patch('lib.etl.export_staging_csvs') as export_mock, \ + mock.patch('lib.etl.geocode_staging_csvs') as geocode_staging_mock, \ + mock.patch('lib.etl.upload_staging_csvs') as upload_mock, \ + mock.patch('lib.etl.import_and_promote_staging'), \ + mock.patch('lib.etl_stages.geocode_addresses') as geocode_rds_mock, \ + mock.patch('lib.etl.publish_public_artifacts', return_value=[]), \ + mock.patch('lib.etl.normalize_public_s3_encryption'), \ + mock.patch('lib.etl.upload_private_source_files'), \ + mock.patch('pathlib.Path.unlink'): + oca_etl({}, {}, {}, '2', {}, runtime_args={'geocode_workers': 2, 'census_batch_chunk_size': 1000}) + + export_mock.assert_called_once() + geocode_staging_mock.assert_called_once() + upload_mock.assert_called_once() + geocode_rds_mock.assert_not_called() + + class GeocodeAddressesTests(unittest.TestCase): def test_geocode_does_not_export_to_s3(self): fake_db = mock.Mock() diff --git a/tests/test_incremental_geocode.py b/tests/test_incremental_geocode.py index 0efd11f..5e079d8 100644 --- a/tests/test_incremental_geocode.py +++ b/tests/test_incremental_geocode.py @@ -1,4 +1,6 @@ +import csv import os +import tempfile import tracemalloc import unittest from pathlib import Path @@ -7,13 +9,18 @@ from lib.etl_geocode import ( ADDRESS_ROW_KEY_COLUMNS, GEOCODE_ADDRESS_COLUMNS, + GEOCODED_STAGING_ADDRESSES_CSV, + STAGING_ADDRESSES_CSV, address_row_key, fetch_addresses_needing_geocode, geocode_candidate_records, + geocode_staging_addresses_csv, + read_staging_addresses_csv, row_needs_geocode, upsert_geocoded_addresses, + write_geocoded_staging_csv, ) -from lib.etl_stages import geocode_addresses +from lib.etl_stages import 
geocode_addresses, geocode_staging_csvs class RowNeedsGeocodeTests(unittest.TestCase): @@ -240,6 +247,137 @@ def test_empty_rows_skips_db_writes(self): fake_db.execute_sql_file.assert_not_called() +class StagingCsvGeocodeTests(unittest.TestCase): + def _write_staging_csv(self, pub_dir, rows): + path = os.path.join(pub_dir, STAGING_ADDRESSES_CSV) + fieldnames = list(GEOCODE_ADDRESS_COLUMNS) + with open(path, 'w', encoding='utf-8', newline='') as handle: + writer = csv.DictWriter(handle, fieldnames=fieldnames, extrasaction='ignore') + writer.writeheader() + for row in rows: + writer.writerow({col: row.get(col, '') for col in fieldnames}) + + def test_read_and_write_round_trip(self): + with tempfile.TemporaryDirectory() as pub_dir: + rows_in = [ + { + 'indexnumberid': 'case-1', + 'street1': '1 Main', + 'street2': '', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10001', + 'lat': '', + 'lon': '', + }, + ] + self._write_staging_csv(pub_dir, rows_in) + read_rows, fieldnames = read_staging_addresses_csv(pub_dir) + self.assertEqual(len(read_rows), 1) + self.assertIn('indexnumberid', fieldnames) + + read_rows[0]['lat'] = '40.1' + read_rows[0]['lon'] = '-73.9' + write_geocoded_staging_csv(pub_dir, read_rows, fieldnames, STAGING_ADDRESSES_CSV) + reread, _ = read_staging_addresses_csv(pub_dir) + self.assertEqual(reread[0]['lat'], '40.1') + + def test_geocode_staging_overwrites_staging_csv(self): + with tempfile.TemporaryDirectory() as pub_dir: + self._write_staging_csv(pub_dir, [ + { + 'indexnumberid': 'case-1', + 'street1': '1 Main', + 'street2': '', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10001', + 'lat': '', + 'lon': '', + }, + ]) + + def fake_geocode_record(row, addr_cols): + row = dict(row) + row['lat'] = '40.5' + row['lon'] = '-73.5' + row['house_number'] = '1' + return row + + count = geocode_staging_addresses_csv( + pub_dir, + geocode_workers=1, + census_batch_chunk_size=2500, + geocode_record_fn=fake_geocode_record, + 
geocode_using_census_batch_fn=mock.Mock(), + ) + self.assertEqual(count, 1) + self.assertTrue(os.path.exists(os.path.join(pub_dir, GEOCODED_STAGING_ADDRESSES_CSV))) + staging_rows, _ = read_staging_addresses_csv(pub_dir) + self.assertEqual(staging_rows[0]['lat'], '40.5') + + def test_geocode_staging_geocodes_all_rows_not_only_missing_lat(self): + with tempfile.TemporaryDirectory() as pub_dir: + self._write_staging_csv(pub_dir, [ + { + 'indexnumberid': 'has-lat', + 'street1': '10 Main', + 'street2': '', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10001', + 'lat': '40.0', + 'lon': '-74.0', + }, + { + 'indexnumberid': 'no-lat', + 'street1': '20 Main', + 'street2': '', + 'city': 'NYC', + 'state': 'NY', + 'postalcode': '10002', + 'lat': '', + 'lon': '', + }, + ]) + seen_ids = [] + + def fake_geocode_record(row, addr_cols): + seen_ids.append(row['indexnumberid']) + row = dict(row) + row['lat'] = f"40.{row['indexnumberid']}" + row['lon'] = '-73.0' + return row + + geocode_staging_addresses_csv( + pub_dir, + geocode_workers=1, + census_batch_chunk_size=2500, + geocode_record_fn=fake_geocode_record, + geocode_using_census_batch_fn=mock.Mock(), + ) + self.assertEqual(seen_ids, ['has-lat', 'no-lat']) + + def test_geocode_staging_stage_records_manifest(self): + fake_manifest = mock.Mock() + with tempfile.TemporaryDirectory() as pub_dir: + with mock.patch( + 'lib.etl_stages.geocode_staging_addresses_csv', + return_value=3, + ) as geocode_mock: + count = geocode_staging_csvs( + fake_manifest, pub_dir, geocode_workers=2, census_batch_chunk_size=1000, + ) + self.assertEqual(count, 3) + geocode_mock.assert_called_once_with(pub_dir, 2, 1000) + fake_manifest.upsert_step.assert_any_call('geocode_staging', 'running') + fake_manifest.upsert_step.assert_any_call( + 'geocode_staging', + 'completed', + details={'geocoded_row_count': 3}, + ) + + class GeocodeStageIntegrationTests(unittest.TestCase): def test_geocode_stage_skips_reset_and_s3_import(self): fake_db = mock.Mock() From 
e45d3c17e1390d09315517072ef033e54b94527d Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 12:37:51 -0400 Subject: [PATCH 22/30] RDS backfill CLI --- oca_geocode_backfill.py | 99 +++++++++++++++++++++++++++++ tests/test_geocode_backfill.py | 111 +++++++++++++++++++++++++++++++++ 2 files changed, 210 insertions(+) create mode 100644 oca_geocode_backfill.py create mode 100644 tests/test_geocode_backfill.py diff --git a/oca_geocode_backfill.py b/oca_geocode_backfill.py new file mode 100644 index 0000000..c01ed0b --- /dev/null +++ b/oca_geocode_backfill.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python + +import argparse +import multiprocessing +import os + +import dotenv + +from lib.database import Database +from lib.etl_helpers import make_dir +from lib.etl_run_manifest import EtlRunManifest +from lib.etl_stages import geocode_addresses + +dotenv.load_dotenv() + + +def parse_optional_int(raw_value): + if raw_value in (None, ''): + return None + return int(raw_value) + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Geocode oca_addresses rows in RDS where lat IS NULL', + ) + parser.add_argument( + '--db-schema', + default=os.environ.get('DB_SCHEMA', ''), + help='Database schema search_path target', + ) + parser.add_argument( + '--geocode-workers', + type=int, + default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), + help='Worker process count for geocode pool', + ) + parser.add_argument( + '--census-batch-chunk-size', + type=int, + default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), + help='Chunk size for census batch geocoder input', + ) + return parser.parse_args() + + +def run_geocode_backfill(db_args, runtime_args=None): + """Fetch ungeocoded addresses from RDS, geocode, and upsert (+ geom).""" + runtime_args = runtime_args or {} + geocode_workers = runtime_args.get('geocode_workers') or multiprocessing.cpu_count() + census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 + db_schema 
= runtime_args.get('db_schema') or db_args.get('schema') or 'public' + + db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=db_schema, + s3_prefix='', + mode='geocode_backfill', + reprocess_glob='', + force_reprocess=False, + ) + manifest.setup_tables() + manifest.create_run() + + pub_dir = make_dir('data-public') + try: + db.ensure_connection() + candidate_count = geocode_addresses( + manifest, + db, + pub_dir, + geocode_workers, + census_batch_chunk_size, + ) + manifest.mark_run_completed(0, 0, 0) + print(f'Backfill complete; {candidate_count} candidate addresses processed') + return candidate_count + except Exception as exc: + manifest.mark_run_failed(exc) + raise + + +def main(): + args = parse_args() + db_args = { + 'db_url': os.environ.get('DATABASE_URL', ''), + 'schema': args.db_schema, + } + runtime_args = { + 'db_schema': args.db_schema, + 'geocode_workers': args.geocode_workers, + 'census_batch_chunk_size': args.census_batch_chunk_size, + } + run_geocode_backfill(db_args, runtime_args) + + +if __name__ == '__main__': + main() diff --git a/tests/test_geocode_backfill.py b/tests/test_geocode_backfill.py new file mode 100644 index 0000000..509b2d3 --- /dev/null +++ b/tests/test_geocode_backfill.py @@ -0,0 +1,111 @@ +import importlib +import os +import unittest +from unittest.mock import MagicMock, patch + +import oca_geocode_backfill + + +class GeocodeBackfillCliTests(unittest.TestCase): + @patch('oca_geocode_backfill.run_geocode_backfill') + def test_main_passes_db_and_runtime_args(self, run_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'DB_SCHEMA': 'oca_refactor', + 'GEOCODE_WORKERS': '3', + 'CENSUS_BATCH_CHUNK_SIZE': '2000', + }, clear=True), patch('sys.argv', ['oca_geocode_backfill.py']): + oca_geocode_backfill.main() + + run_mock.assert_called_once_with( + {'db_url': 'postgres://example', 'schema': 'oca_refactor'}, + { + 'db_schema': 'oca_refactor', + 'geocode_workers': 3, + 
'census_batch_chunk_size': 2000, + }, + ) + + @patch('oca_geocode_backfill.geocode_addresses', return_value=5) + @patch('oca_geocode_backfill.make_dir', return_value='/tmp/data-public') + @patch('oca_geocode_backfill.EtlRunManifest') + @patch('oca_geocode_backfill.Database') + def test_run_geocode_backfill_uses_rds_geocode_path( + self, + db_cls, + manifest_cls, + make_dir_mock, + geocode_mock, + ): + fake_db = MagicMock() + db_cls.return_value = fake_db + fake_manifest = MagicMock() + manifest_cls.return_value = fake_manifest + + count = oca_geocode_backfill.run_geocode_backfill( + {'db_url': 'postgres://example', 'schema': 'oca_refactor'}, + { + 'db_schema': 'oca_refactor', + 'geocode_workers': 2, + 'census_batch_chunk_size': 1000, + }, + ) + + self.assertEqual(count, 5) + manifest_cls.assert_called_once_with( + db=fake_db, + schema_name='oca_refactor', + s3_prefix='', + mode='geocode_backfill', + reprocess_glob='', + force_reprocess=False, + ) + fake_manifest.setup_tables.assert_called_once() + fake_manifest.create_run.assert_called_once() + fake_db.ensure_connection.assert_called_once() + geocode_mock.assert_called_once_with( + fake_manifest, + fake_db, + '/tmp/data-public', + 2, + 1000, + ) + fake_manifest.mark_run_completed.assert_called_once_with(0, 0, 0) + + @patch('oca_geocode_backfill.geocode_addresses', side_effect=RuntimeError('boom')) + @patch('oca_geocode_backfill.make_dir', return_value='/tmp/data-public') + @patch('oca_geocode_backfill.EtlRunManifest') + @patch('oca_geocode_backfill.Database') + def test_run_geocode_backfill_marks_manifest_failed_on_error( + self, + db_cls, + manifest_cls, + make_dir_mock, + geocode_mock, + ): + fake_db = MagicMock() + db_cls.return_value = fake_db + fake_manifest = MagicMock() + manifest_cls.return_value = fake_manifest + + with self.assertRaises(RuntimeError): + oca_geocode_backfill.run_geocode_backfill( + {'db_url': 'postgres://example', 'schema': 'public'}, + {'geocode_workers': 1, 'census_batch_chunk_size': 
2500}, + ) + + fake_manifest.mark_run_failed.assert_called_once() + fake_manifest.mark_run_completed.assert_not_called() + + def test_not_wired_into_weekly_etl_entrypoints(self): + etl = importlib.import_module('lib.etl') + oca_update = importlib.import_module('oca_update') + + self.assertNotIn('oca_geocode_backfill', etl.__dict__) + self.assertNotIn('geocode_backfill', etl.__dict__) + self.assertNotIn('oca_geocode_backfill', oca_update.__dict__) + self.assertNotIn('run_geocode_backfill', oca_update.__dict__) + + +if __name__ == '__main__': + unittest.main() From 1abb194fb7354ba65df0276696ca3477c49794f5 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 12:40:27 -0400 Subject: [PATCH 23/30] update Documentation for geocoding changes --- README.md | 10 +++- docs/operations/weekly-etl-scheduling.md | 25 ++++++++-- lib/README.md | 61 ++++++++++++++---------- 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/README.md b/README.md index 2fef197..dcc525c 100644 --- a/README.md +++ b/README.md @@ -57,12 +57,20 @@ cp .env.example .env # Or 'copy .env.example .env' on Windows Required variables: `DATABASE_URL`, `AWS_*`, `SFTP_*`, and `MODE=2` for full publish. Optional runtime controls are documented in [`.env.example`](.env.example). -**Typical weekly run** (process new SFTP files only): +**Typical weekly run** (process new SFTP files only; geocodes addresses in the staging CSV before S3 upload, then promotes and publishes): ```bash docker compose run --rm app python oca_update.py ``` +**RDS geocode backfill** (on-demand; rows in `oca_addresses` where `lat IS NULL` only; does not publish public CSVs): + +```bash +docker compose run --rm app python oca_geocode_backfill.py +``` + +Use the same `DATABASE_URL` and `DB_SCHEMA` as weekly ETL. Optional flags: `--geocode-workers`, `--census-batch-chunk-size` (or env `GEOCODE_WORKERS`, `CENSUS_BATCH_CHUNK_SIZE`). 
After backfill, run view rebuild + publish separately if S3 public files must reflect new coordinates. + **Refactor / replay run** (isolated schema and S3 prefix, force replay from S3 private backups): ```bash diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md index 1b6e778..278fc45 100644 --- a/docs/operations/weekly-etl-scheduling.md +++ b/docs/operations/weekly-etl-scheduling.md @@ -1,11 +1,13 @@ # Weekly OCA ETL scheduling and deployment -The OCA pipeline ingests new SFTP XML zip files weekly, promotes staging data in PostgreSQL, geocodes addresses incrementally, and publishes CSVs to S3 via `aws_s3`. All three supported schedulers run the same container entrypoint: +The OCA pipeline ingests new SFTP XML zip files weekly, geocodes addresses in the local staging CSV before S3 upload, promotes staging data in PostgreSQL, and publishes CSVs to S3 via `aws_s3`. All three supported schedulers run the same container entrypoint: ```bash python oca_update.py ``` +Historical RDS rows that still lack coordinates are handled separately by `oca_geocode_backfill.py` (not scheduled with weekly ETL). See [RDS geocode backfill](#rds-geocode-backfill-on-demand) below. + Use Docker (or the published image `justfixnyc/oca:latest`) with credentials supplied via environment variables or a secret store. See [Runtime controls](#runtime-controls) and the root [README](../../README.md). ## Runtime controls @@ -29,12 +31,27 @@ Refactor and E2E runs must set `S3_PREFIX=refactor/` (or another isolated prefix Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if geocoding approaches the limit. -## Publish behavior (Step 6) +## Publish behavior -- **Core tables:** every table in `OCA_TABLES` is exported after a successful promotion batch. Selective skip per table is unsafe when `oca_index_staging` has rows: promotion deletes child rows for the batch even when a child staging CSV was empty. 
-- **Addresses:** when incremental geocode has zero candidates and `oca_addresses_staging` had no rows this run, address CSV/view exports and `create_addresses_views.sql` are skipped. +- **Geocode timing:** weekly runs geocode all rows in `oca_addresses_staging.csv` locally (`geocode_staging` manifest step) before uploading staging CSVs to S3. Promotion imports coordinates (and sets `geom` on `oca_addresses`); there is no post-promotion RDS geocode in `oca_etl()`. +- **Core tables:** every table in `OCA_TABLES` is exported after promotion. Selective skip per table is unsafe when `oca_index_staging` has rows: promotion deletes child rows for the batch even when a child staging CSV was empty. +- **Address views:** `create_addresses_views.sql` runs on every successful weekly publish (views only; `geom` already on the base table). - **S3 encryption:** SSE-S3 normalization runs only on objects exported in the current run (not a full public-prefix scan). +## RDS geocode backfill (on-demand) + +Use when `oca_addresses` still has rows with `lat IS NULL` (e.g. pre-CSV-geocode history). **Not** wired into cron, Kubernetes CronJob, or ECS weekly tasks. + +```bash +docker compose run --rm app python oca_geocode_backfill.py +``` + +Same secrets as weekly ETL (`DATABASE_URL`, `DB_SCHEMA`, AWS if needed for manifest only). Tune with `GEOCODE_WORKERS` / `CENSUS_BATCH_CHUNK_SIZE` or CLI flags. + +- Selects only ungeocoded rows (`select_addresses_needing_geocode.sql`). +- Records manifest `mode='geocode_backfill'` with step `geocode_refresh` only. +- **Does not** run `create_addresses_views.sql` or publish public CSVs. Re-run publish (or a full `oca_update.py` publish path) if S3 must reflect backfilled coordinates. + ## 1. Local Docker + cron (weekly) Best for a single host with Docker and an `.env` file. 
diff --git a/lib/README.md b/lib/README.md index 3286157..bec7448 100644 --- a/lib/README.md +++ b/lib/README.md @@ -2,9 +2,12 @@ This directory contains the Extract–Transform–Load pipeline that ingests NY State housing court XML from OCA, parses it into relational tables, loads PostgreSQL on RDS, geocodes addresses, and publishes CSVs to S3. This process works with the protected address-level data ("level 2") but maintains public exports of the deidentified (zip code only, "level 1") version with the full address data kept only in secure S3 and RDS for organization under the legal agreement with OCA. -Entry point: [`oca_update.py`](../oca_update.py) loads `.env` and calls `oca_etl()` in [`etl.py`](etl.py). +Entry points: -## Pipeline flow +- [`oca_update.py`](../oca_update.py) — weekly ETL → `oca_etl()` in [`etl.py`](etl.py) +- [`oca_geocode_backfill.py`](../oca_geocode_backfill.py) — on-demand RDS backfill (`lat IS NULL` only); not part of weekly ETL + +## Pipeline flow (weekly) ```mermaid flowchart TD @@ -12,20 +15,19 @@ flowchart TD s3backup[S3 private backups] --> select select --> download[Download selected zips] download --> parse[Stream parse XML → DuckDB staging] - parse --> export[Export staging CSVs] - export --> preprocess[Normalize CSVs for RDS import] - preprocess --> s3upload[Upload staging CSVs to S3] + parse --> export[Export staging CSVs + preprocess] + export --> geo[Geocode oca_addresses_staging.csv locally] + geo --> s3upload[Upload staging CSVs to S3] s3upload --> import[RDS import staging tables] import --> normalize[SQL normalize + appearance outcomes] normalize --> promote[Atomic promote staging → main] - promote --> geocode[Incremental geocode + upsert] - geocode --> views[create_addresses_views.sql] + promote --> views[create_addresses_views.sql] views --> publish[Publish all public CSVs + date files] publish --> enc[SSE normalize except private address CSV] enc --> priv[Upload private XML zips] ``` -Each run is orchestrated sequentially 
in `oca_etl()`. See [`etl_stages.py`](etl_stages.py) for stage implementations. +Each weekly run is orchestrated sequentially in `oca_etl()`. There is **no** post-promotion `geocode_addresses()` on the weekly path. See [`etl_stages.py`](etl_stages.py) for stage implementations. ## Stages @@ -33,14 +35,21 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ |-------|--------|--------------| | Select files | `etl_file_selection.py`, `etl_stages.select_input_files` | Picks new SFTP zips and/or S3 private replays (`REPROCESS_GLOB`); skips manifest-completed files unless `FORCE_REPROCESS=true`. | | Download | `etl_stages.download_selected_files` | New files from SFTP; replay files from S3 private backup. | -| Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. | -| Export + preprocess | `duckdb_database.py`, `staging_csv_export.py`, `etl_csv.py` | DuckDB `COPY` with Postgres-compatible transforms; minimal second-pass CSV rewrite. | -| Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, then single-transaction promotion. | -| Geocode | `etl_geocode.py`, `etl_stages.geocode_addresses` | Delta-select rows missing lat/lon; Geosupport + Census batch; upsert by natural address key. | -| Publish public | `etl_stages.publish_public_artifacts` | Rebuild address views; export all `OCA_TABLES` and address views; upload date badge files. | +| Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. Address rows have no lat/lon until CSV geocode. | +| Export staging | `etl_stages.export_staging_csvs` | DuckDB `COPY` with Postgres-compatible transforms; manifest step `export_staging`. No S3 upload yet. 
| +| Geocode staging | `etl_geocode.geocode_staging_addresses_csv`, `etl_stages.geocode_staging_csvs` | Geocode **every** row in `oca_addresses_staging.csv`; write `oca_addresses_staging_geocoded.csv`, copy over staging CSV; manifest step `geocode_staging`. | +| Upload staging | `etl_stages.upload_staging_csvs` | Upload `*_staging.csv` to S3 `public/`; **excludes** `oca_addresses_staging_geocoded.csv`; manifest step `upload_staging`. | +| Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, promote; batch `geom` UPDATE from lat/lon. | +| Publish public | `etl_stages.publish_public_artifacts` | `create_addresses_views.sql` (views only); export all `OCA_TABLES` and address views; upload date badge files. | | Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. | | Upload private | `etl_stages.upload_private_source_files` | Back up raw XML zips to S3 `private/`. | +### RDS backfill (not weekly) + +| Stage | Module | What it does | +|-------|--------|--------------| +| Geocode refresh | `etl_stages.geocode_addresses`, `oca_geocode_backfill.py` | Fetch `oca_addresses` where `lat IS NULL`; Geosupport + Census; upsert by natural address key + `geom`. Manifest step `geocode_refresh`, run `mode='geocode_backfill'`. Does not publish. | + ## Key modules ### Connectivity @@ -65,7 +74,7 @@ Each run is orchestrated sequentially in `oca_etl()`. See [`etl_stages.py`](etl_ - [`etl_csv.py`](etl_csv.py) — streaming CSV normalization for tables not handled at export time. - [`etl_promotion.py`](etl_promotion.py) — atomic `promote_staging_to_main()`; count/checksum hooks for validation. - [`etl_publish.py`](etl_publish.py) — S3 export helpers and encryption key filtering. -- [`etl_geocode.py`](etl_geocode.py) — incremental geocode candidate fetch, chunked geocoding, natural-key upsert. 
+- [`etl_geocode.py`](etl_geocode.py) — staging CSV geocode (`read_staging_addresses_csv`, `geocode_staging_addresses_csv`); RDS fetch/upsert for backfill. ### Geocoding @@ -77,29 +86,29 @@ Scripts run against the active session schema (`DB_SCHEMA` / `search_path`). | Script | Role | |--------|------| -| `create_tables.sql` | Non-destructive bootstrap of core tables and indexes | -| `create_tables_staging.sql` | Per-run RDS staging tables | +| `create_tables.sql` | Non-destructive bootstrap; `oca_addresses.geom` + GIST index | +| `create_tables_staging.sql` | Per-run RDS staging tables (no `geom` on address staging) | | `create_tables_staging_duckdb.sql` | Local DuckDB staging DDL | | `normalize_staging_after_import.sql` | Nullable int coercion after S3 import | | `update_appearance_outcomes.sql` | Assign `appearanceid`, expand outcomes JSON | -| `promote_staging_to_main.sql` | Single-transaction staging → main promotion | +| `promote_staging_to_main.sql` | Staging → main promotion; batch `geom` from lat/lon | | `ensure_promotion_indexes.sql` | Indexes for promotion and address natural keys | -| `select_addresses_needing_geocode.sql` | Delta rows for geocoding | -| `create_geocode_staging_table.sql`, `upsert_geocoded_addresses.sql` | Geocode staging merge | -| `create_addresses_views.sql` | PostGIS views after geocode (before S3 export) | +| `select_addresses_needing_geocode.sql` | Backfill delta (`lat IS NULL`) | +| `create_geocode_staging_table.sql`, `upsert_geocoded_addresses.sql` | Backfill staging merge; sets `geom` | +| `create_addresses_views.sql` | Public address views (no geom DDL) | | `create_etl_manifest_tables.sql` | Run manifest DDL | Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. 
## Idempotency and run control -- **Manifest** — each run records status in `etl_runs`, per-file progress in `etl_files`, and stage checkpoints in `etl_steps` (including `geocode_refresh`, `publish_public`, `normalize_s3_encryption`, `upload_private`). -- **Connection resilience** — TCP keepalives and `ensure_connection()` before geocode and before publish. +- **Manifest** — weekly runs record `export_staging`, `geocode_staging`, `upload_staging`, `promote_staging`, `publish_public`, `normalize_s3_encryption`, `upload_private`. Backfill runs record only `geocode_refresh`. +- **Connection resilience** — TCP keepalives and `ensure_connection()` before promote and before publish. - **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips completed files unless `FORCE_REPROCESS=true`. - **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. -- **Promotion** — scoped delete + insert / upsert in one transaction; safe to retry after import failure. -- **Geocode** — only rows with `lat IS NULL` and a house number; upsert matches on address line columns, not `indexnumberid` alone. -- **Publish** — every successful run exports the full public snapshot (all core tables and address views). +- **Weekly geocode** — all staging CSV address rows (re-geocodes rows that already have lat/lon in the file). +- **Backfill geocode** — only `lat IS NULL` with a house number; upsert matches on address line columns, not `indexnumberid` alone. +- **Publish** — every successful weekly run exports the full public snapshot (all core tables and address views). 
## Output tables @@ -107,6 +116,6 @@ Core tables (also published as public CSVs): `oca_index`, `oca_causes`, `oca_add ## Further reading -- Root [README](../README.md) — setup, env vars, Docker invocation +- Root [README](../README.md) — setup, env vars, Docker invocation, backfill CLI - [`docs/operations/weekly-etl-scheduling.md`](../docs/operations/weekly-etl-scheduling.md) — cron, Kubernetes, EventBridge/ECS - [`docs/`](../docs/) — data dictionary links and raw XML notes From e80073e49941251b9c5a8add40e7aafe354f030c Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Fri, 29 May 2026 22:34:52 -0400 Subject: [PATCH 24/30] fix bug uploading tempfiles to s3 --- lib/README.md | 2 +- lib/etl_publish.py | 13 +++++++++++++ lib/etl_stages.py | 9 +++------ tests/test_etl_publish.py | 3 ++- 4 files changed, 19 insertions(+), 8 deletions(-) diff --git a/lib/README.md b/lib/README.md index bec7448..a64dd84 100644 --- a/lib/README.md +++ b/lib/README.md @@ -38,7 +38,7 @@ Each weekly run is orchestrated sequentially in `oca_etl()`. There is **no** pos | Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. Address rows have no lat/lon until CSV geocode. | | Export staging | `etl_stages.export_staging_csvs` | DuckDB `COPY` with Postgres-compatible transforms; manifest step `export_staging`. No S3 upload yet. | | Geocode staging | `etl_geocode.geocode_staging_addresses_csv`, `etl_stages.geocode_staging_csvs` | Geocode **every** row in `oca_addresses_staging.csv`; write `oca_addresses_staging_geocoded.csv`, copy over staging CSV; manifest step `geocode_staging`. | -| Upload staging | `etl_stages.upload_staging_csvs` | Upload `*_staging.csv` to S3 `public/`; **excludes** `oca_addresses_staging_geocoded.csv`; manifest step `upload_staging`. 
| +| Upload staging | `etl_stages.upload_staging_csvs`, `etl_publish.list_staging_csvs_in_dir` | Upload only whitelisted `{table}_staging.csv` files (from `OCA_TABLES`); ignores geocoder temps and other junk; manifest step `upload_staging`. | | Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, promote; batch `geom` UPDATE from lat/lon. | | Publish public | `etl_stages.publish_public_artifacts` | `create_addresses_views.sql` (views only); export all `OCA_TABLES` and address views; upload date badge files. | | Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. | diff --git a/lib/etl_publish.py b/lib/etl_publish.py index 33b251f..bd5a77a 100644 --- a/lib/etl_publish.py +++ b/lib/etl_publish.py @@ -8,6 +8,19 @@ PRIVATE_ADDRESS_CSV = 'oca_addresses_private.csv' +def staging_csv_filenames(): + """Expected local/S3 filenames for RDS staging import (one per OCA table).""" + return [f"{table}_staging.csv" for table in OCA_TABLES] + + +def list_staging_csvs_in_dir(pub_dir): + """Staging CSVs present in ``pub_dir``; whitelist only (ignores geocoder temps, etc.).""" + return sorted( + name for name in staging_csv_filenames() + if os.path.isfile(os.path.join(pub_dir, name)) + ) + + def staging_tables_with_rows(pub_dir): """Main table names whose staging CSV had at least one data row this run.""" tables = [] diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 45dfad1..06abdfe 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -23,11 +23,11 @@ from .etl_publish import ( ADDRESS_VIEW_EXPORTS, export_table_to_s3, + list_staging_csvs_in_dir, normalize_published_s3_encryption, staging_tables_with_rows, ) from .etl_geocode import ( - GEOCODED_STAGING_ADDRESSES_CSV, fetch_addresses_needing_geocode, geocode_candidate_records, geocode_staging_addresses_csv, @@ -175,7 +175,7 @@ def 
export_staging_to_csv( if not upload: return - public_files = [i for i in os.listdir(pub_dir) if i.endswith('.csv')] + public_files = list_staging_csvs_in_dir(pub_dir) with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) @@ -212,10 +212,7 @@ def geocode_staging_csvs(manifest, pub_dir, geocode_workers, census_batch_chunk_ def upload_staging_csvs(manifest, pub_dir, mode, s3_args, s3_prefix): """Upload preprocessed staging CSVs to S3 ``public/``.""" manifest.upsert_step('upload_staging', 'running') - public_files = [ - name for name in os.listdir(pub_dir) - if name.endswith('.csv') and name != GEOCODED_STAGING_ADDRESSES_CSV - ] + public_files = list_staging_csvs_in_dir(pub_dir) with multiprocessing.Pool(processes=min((2, multiprocessing.cpu_count()))) as pool: files_zip = zip(public_files, repeat(pub_dir), repeat(mode), repeat(s3_args), repeat(s3_prefix)) pool.starmap(upload_public_file, files_zip) diff --git a/tests/test_etl_publish.py b/tests/test_etl_publish.py index 0e5e960..544eb86 100644 --- a/tests/test_etl_publish.py +++ b/tests/test_etl_publish.py @@ -145,13 +145,14 @@ def test_export_staging_records_manifest(self): fake_manifest.upsert_step.assert_any_call('export_staging', 'running') fake_manifest.upsert_step.assert_any_call('export_staging', 'completed') - def test_upload_staging_skips_geocoded_intermediate_csv(self): + def test_upload_staging_whitelist_skips_geocoded_and_temp_csvs(self): fake_manifest = mock.Mock() with tempfile.TemporaryDirectory() as pub_dir: for name in ( 'oca_index_staging.csv', 'oca_addresses_staging.csv', 'oca_addresses_staging_geocoded.csv', + 'temp_11980619120169245792.csv', ): with open(os.path.join(pub_dir, name), 'w', encoding='utf-8') as handle: handle.write('h\n') From e6a2d6f560c8749b3e5dc104040d92e2e45e65c1 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen 
Date: Wed, 3 Jun 2026 10:52:02 -0400 Subject: [PATCH 25/30] update ignore files --- .dockerignore | 51 ++++++++++++++++++++++++++++++++++++++++++++++++--- .gitignore | 1 + 2 files changed, 49 insertions(+), 3 deletions(-) diff --git a/.dockerignore b/.dockerignore index 4cff36c..6f3b123 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,28 +1,73 @@ # configs and git .git/ +.gitattributes .gitignore +.github/ .idea/ +.cursor/ # python and docker .venv/ .env +.env.* +.env.example Dockerfile docker-compose.yml dockerhub-publish.sh __pycache__/ +*.py[cod] +*$py.class +.pytest_cache/ +.mypy_cache/ +.ruff_cache/ +.coverage +htmlcov/ +*.egg-info/ +dist/ +build/ +.ipynb_checkpoints +# credentials and keys (never ship in image layers) +*.pem +*.key +*.p12 +*.pfx +**/id_rsa +**/id_rsa.pub +**/.ssh/ +*kubeconfig* +secrets/ +credentials/ + +# ops / deploy (runtime uses env vars, not these files) +k8s/ notebooks/ docs/ -k8s/ +tests/ +hooks/ +README.md +LICENSE +run.sh +geocoder_test.py +example.output.txt +requirements.txt +.python-version -# data folders +# data folders and local databases staging.db +staging.duckdb* data/ data-raw/ data-clean/ +data-private/ +data-public/ lib/data-private/ lib/data-public/ +# logs and temp +*.log +*.tmp + # macos .DS_Store -.Trash-0/ \ No newline at end of file +.Trash-0/ diff --git a/.gitignore b/.gitignore index 1366847..c296593 100644 --- a/.gitignore +++ b/.gitignore @@ -23,3 +23,4 @@ data-public/ staging.duckdb* k8s/*-kubeconfig.yaml +k8s/oca-etl-secret.yaml From d9e53e60df7e5a5e224ba6a1c3b4b6be19511729 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Wed, 3 Jun 2026 12:22:57 -0400 Subject: [PATCH 26/30] simplify parser progress logging for non-interactive --- lib/parsers.py | 19 +++++++++++++------ oca_update.py | 7 +++++-- pyproject.toml | 1 - requirements.txt | 1 - uv.lock | 11 ----------- 5 files changed, 18 insertions(+), 21 deletions(-) diff --git a/lib/parsers.py b/lib/parsers.py index 52b0f5c..8e5a934 100644 --- 
a/lib/parsers.py +++ b/lib/parsers.py @@ -1,10 +1,15 @@ -import frogress -from lxml import etree -import threading +import logging import queue +import threading + +from lxml import etree from .parse_write_buffer import attach_write_buffer, flush_write_buffer, staging_execute +logger = logging.getLogger(__name__) + +PARSE_PROGRESS_INTERVAL = 1000 + NAMESPACE = '{http://www.example.org/LandlordTenantExtractSchema}' def oca_tag(tag): @@ -594,11 +599,13 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): total_cases = 0 - for _, case in frogress.bar(context): + for _, case in context: case_copy = etree.fromstring(etree.tostring(case)) case_queue.put(case_copy) total_cases += 1 - + if total_cases % PARSE_PROGRESS_INTERVAL == 0: + logger.info("Parsed %s cases", total_cases) + # Clear the case element to free memory case.clear() while case.getprevious() is not None: @@ -618,5 +625,5 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): flush_write_buffer(thread_db) thread_db.close() - print(f"\nProcessed {total_cases} cases with {num_threads} threads") + logger.info("Processed %s cases with %s threads", total_cases, num_threads) diff --git a/oca_update.py b/oca_update.py index 16966eb..e17679c 100644 --- a/oca_update.py +++ b/oca_update.py @@ -1,10 +1,12 @@ #!/usr/bin/env python -import dotenv -import os import argparse +import logging +import os from pathlib import Path +import dotenv + from lib.etl import oca_etl dotenv.load_dotenv() @@ -34,6 +36,7 @@ def parse_args(): return parser.parse_args() def main(): + logging.basicConfig(level=logging.INFO, format="%(message)s") args = parse_args() db_args = { diff --git a/pyproject.toml b/pyproject.toml index 95c649a..0053a36 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,6 @@ dependencies = [ "boto3>=1.38.41", "censusgeocode>=0.5.2", "duckdb>=1.3.1", - "frogress>=0.10.1", "lxml>=5.4.0", "notebook>=7.4.4", "pandas>=2.3.0", diff --git a/requirements.txt b/requirements.txt index 
0f00eaa..d532571 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ lxml -frogress boto3 python-dotenv urllib3==1.26.15 diff --git a/uv.lock b/uv.lock index 48f1be5..e0a7782 100644 --- a/uv.lock +++ b/uv.lock @@ -24,7 +24,6 @@ dependencies = [ { name = "boto3" }, { name = "censusgeocode" }, { name = "duckdb" }, - { name = "frogress" }, { name = "lxml" }, { name = "notebook" }, { name = "pandas" }, @@ -44,7 +43,6 @@ requires-dist = [ { name = "boto3", specifier = ">=1.38.41" }, { name = "censusgeocode", specifier = ">=0.5.2" }, { name = "duckdb", specifier = ">=1.3.1" }, - { name = "frogress", specifier = ">=0.10.1" }, { name = "lxml", specifier = ">=5.4.0" }, { name = "notebook", specifier = ">=7.4.4" }, { name = "pandas", specifier = ">=2.3.0" }, @@ -488,15 +486,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cf/58/8acf1b3e91c58313ce5cb67df61001fc9dcd21be4fadb76c1a2d540e09ed/fqdn-1.5.1-py3-none-any.whl", hash = "sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014", size = 9121, upload-time = "2021-03-11T07:16:28.351Z" }, ] -[[package]] -name = "frogress" -version = "0.10.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6d/50/35f9d1758ac852fe75cd5d687709710ea076fad9b5fdb9f2f6d53f5510d9/frogress-0.10.1.tar.gz", hash = "sha256:0150cfb988eeda65e019283a06b69603c26457081c545b1b143d907996bf96d3", size = 19300, upload-time = "2024-04-04T08:16:01.834Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/ba/e4/4e7f4f4148854d63550c96acfb5daec82aec5f801f07323822ab84ae5910/frogress-0.10.1-py3-none-any.whl", hash = "sha256:a47be1dbca0b89dcbd0628063a159948ff96343e4f5086a4f25e2780469f96d7", size = 15216, upload-time = "2024-04-04T08:15:59.977Z" }, -] - [[package]] name = "greenlet" version = "3.2.3" From a3e6ef47d723a0233374a1a2188b1d71a4e5ef62 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 4 Jun 2026 12:28:24 -0400 Subject: [PATCH 27/30] 
handle cases marked for deletion (#22) In the OCA XML files some cases are marked for permanent deletion and this wasn't handled correctly if files are ever reprocessed out of order since previously deleted cases could be restored. This PR adds extra protection against that issue by explicitly deleting cases marked for deletion (which we already record in oca_metadata.deletedate) before promoting staging data to main, and adds a one-off backfill script to purge all cases. --- docs/operations/weekly-etl-scheduling.md | 24 ++++++ lib/etl_promotion.py | 1 + lib/etl_stages.py | 33 +++++++- lib/sql/promote_staging_to_main.sql | 101 +++++++++++++++++------ lib/sql/purge_tombstoned_cases.sql | 9 ++ oca_deletion_backfill.py | 72 ++++++++++++++++ tests/test_deletion_backfill.py | 35 ++++++++ tests/test_promotion.py | 31 ++++++- 8 files changed, 278 insertions(+), 28 deletions(-) create mode 100644 lib/sql/purge_tombstoned_cases.sql create mode 100644 oca_deletion_backfill.py create mode 100644 tests/test_deletion_backfill.py diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md index 278fc45..4b36bc2 100644 --- a/docs/operations/weekly-etl-scheduling.md +++ b/docs/operations/weekly-etl-scheduling.md @@ -8,6 +8,8 @@ python oca_update.py Historical RDS rows that still lack coordinates are handled separately by `oca_geocode_backfill.py` (not scheduled with weekly ETL). See [RDS geocode backfill](#rds-geocode-backfill-on-demand) below. +Cases marked deleted in OCA XML are tombstoned in `oca_metadata.deletedate` and purged from `oca_index` (and child tables) during weekly promotion. Historical orphans (tombstone without purge) are cleaned by `oca_deletion_backfill.py` — see [RDS deletion backfill](#rds-deletion-backfill-on-demand). + Use Docker (or the published image `justfixnyc/oca:latest`) with credentials supplied via environment variables or a secret store. 
See [Runtime controls](#runtime-controls) and the root [README](../../README.md). ## Runtime controls @@ -52,6 +54,28 @@ Same secrets as weekly ETL (`DATABASE_URL`, `DB_SCHEMA`, AWS if needed for manif - Records manifest `mode='geocode_backfill'` with step `geocode_refresh` only. - **Does not** run `create_addresses_views.sql` or publish public CSVs. Re-run publish (or a full `oca_update.py` publish path) if S3 must reflect backfilled coordinates. +## RDS deletion backfill (on-demand) + +Use after deploying tombstone-aware promotion, or when validation shows case rows still present for deleted metadata. **Not** wired into weekly ETL. + +```bash +docker compose run --rm app python oca_deletion_backfill.py +``` + +Same secrets as weekly ETL (`DATABASE_URL`, `DB_SCHEMA`). Records manifest `mode='deletion_backfill'` with step `deletion_backfill`. + +- Deletes from `oca_index` only (child tables CASCADE); **`oca_metadata` rows are kept** (`deletedate` preserved). +- **Does not** publish public CSVs. Re-run weekly publish if S3 must drop deleted cases from snapshots. + +**Validation** (expect `0` after a successful backfill): + +```sql +SELECT COUNT(*)::bigint +FROM oca_index i +INNER JOIN oca_metadata m ON m.indexnumberid = i.indexnumberid +WHERE m.deletedate IS NOT NULL; +``` + ## 1. Local Docker + cron (weekly) Best for a single host with Docker and an `.env` file. diff --git a/lib/etl_promotion.py b/lib/etl_promotion.py index 9ffd19f..257fc4a 100644 --- a/lib/etl_promotion.py +++ b/lib/etl_promotion.py @@ -5,6 +5,7 @@ PROMOTION_SQL_FILE = 'promote_staging_to_main.sql' PROMOTION_INDEX_SQL_FILE = 'ensure_promotion_indexes.sql' +PURGE_TOMBSTONED_CASES_SQL_FILE = 'purge_tombstoned_cases.sql' # Tables promoted via promote_staging_to_main.sql (oca_metadata merged in-SQL). 
PROMOTED_TABLES = [t for t in OCA_TABLES if t != 'oca_metadata'] diff --git a/lib/etl_stages.py b/lib/etl_stages.py index 06abdfe..f092eb5 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -19,7 +19,10 @@ s3_key, upload_public_file, ) -from .etl_promotion import promote_staging_to_main +from .etl_promotion import ( + PURGE_TOMBSTONED_CASES_SQL_FILE, + promote_staging_to_main, +) from .etl_publish import ( ADDRESS_VIEW_EXPORTS, export_table_to_s3, @@ -301,6 +304,34 @@ def publish_core_tables(db, s3_args, s3_prefix): return published_keys +def count_tombstone_orphans(db): + """Cases with oca_metadata.deletedate still present in oca_index.""" + row = db.sql_fetch_one(""" + SELECT COUNT(*)::bigint + FROM oca_index i + INNER JOIN oca_metadata m ON m.indexnumberid = i.indexnumberid + WHERE m.deletedate IS NOT NULL + """) + return int(row[0]) if row else 0 + + +def purge_tombstoned_cases(manifest, db): + """Delete oca_index rows (and children via CASCADE) for metadata tombstones.""" + manifest.upsert_step('deletion_backfill', 'running') + orphan_count_before = count_tombstone_orphans(db) + db.execute_sql_file(PURGE_TOMBSTONED_CASES_SQL_FILE) + orphan_count_after = count_tombstone_orphans(db) + manifest.upsert_step( + 'deletion_backfill', + 'completed', + details={ + 'orphan_count_before': orphan_count_before, + 'orphan_count_after': orphan_count_after, + }, + ) + return orphan_count_before, orphan_count_after + + def geocode_addresses(manifest, db, pub_dir, geocode_workers, census_batch_chunk_size): """Incremental geocode for addresses missing lat/lon; upsert into RDS.""" manifest.upsert_step('geocode_refresh', 'running') diff --git a/lib/sql/promote_staging_to_main.sql b/lib/sql/promote_staging_to_main.sql index 11ed867..12aee16 100644 --- a/lib/sql/promote_staging_to_main.sql +++ b/lib/sql/promote_staging_to_main.sql @@ -1,40 +1,56 @@ -- Atomic staging -> main promotion for one import batch. --- Case scope: all indexnumberid values present in oca_index_staging. 
--- oca_index uses UPSERT; child tables use scoped DELETE + INSERT; metadata merged last. +-- Tombstoned cases (oca_metadata / oca_metadata_staging deletedate) are purged from +-- oca_index (children CASCADE) and excluded from staging upserts. +-- oca_index uses UPSERT for active staging cases; metadata merged before staging drops. SET session_replication_role = replica; --- Child tables keyed by indexnumberid (full per-case row replace for the batch). +CREATE TEMP TABLE tombstoned_ids ON COMMIT DROP AS +SELECT indexnumberid FROM oca_metadata WHERE deletedate IS NOT NULL +UNION +SELECT indexnumberid FROM oca_metadata_staging WHERE deletedate IS NOT NULL; + +DELETE FROM oca_index +WHERE indexnumberid IN (SELECT indexnumberid FROM tombstoned_ids); + +CREATE TEMP TABLE promotion_active_staging_ids ON COMMIT DROP AS +SELECT s.indexnumberid +FROM oca_index_staging s +WHERE NOT EXISTS ( + SELECT 1 FROM tombstoned_ids t WHERE t.indexnumberid = s.indexnumberid +); + +-- Child tables keyed by indexnumberid (full per-case row replace for active staging cases). 
DELETE FROM oca_appearance_outcomes -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_appearances -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_warrants -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_judgments -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_decisions -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_motions -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_events -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_parties -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); DELETE FROM oca_causes -WHERE indexnumberid IN (SELECT indexnumberid FROM oca_index_staging); +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); -- Addresses: natural-line key aligned with incremental geocode. 
DELETE FROM oca_addresses m -WHERE m.indexnumberid IN (SELECT indexnumberid FROM oca_index_staging) +WHERE m.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) AND NOT EXISTS ( SELECT 1 FROM oca_addresses_staging s @@ -48,7 +64,8 @@ AND NOT EXISTS ( DELETE FROM oca_addresses m USING oca_addresses_staging s -WHERE m.indexnumberid = s.indexnumberid +WHERE m.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) + AND m.indexnumberid = s.indexnumberid AND m.street1 IS NOT DISTINCT FROM s.street1 AND m.street2 IS NOT DISTINCT FROM s.street2 AND m.city IS NOT DISTINCT FROM s.city @@ -65,6 +82,7 @@ SELECT specialtydesignationtypes, status, disposeddate, disposedreason, firstpaper, primaryclaimtotal, dateofjurydemand FROM oca_index_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) ON CONFLICT (indexnumberid) DO UPDATE SET court = EXCLUDED.court, fileddate = EXCLUDED.fileddate, @@ -78,7 +96,10 @@ ON CONFLICT (indexnumberid) DO UPDATE SET primaryclaimtotal = EXCLUDED.primaryclaimtotal, dateofjurydemand = EXCLUDED.dateofjurydemand; -INSERT INTO oca_causes SELECT * FROM oca_causes_staging; +INSERT INTO oca_causes +SELECT * FROM oca_causes_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + INSERT INTO oca_addresses ( indexnumberid, street1, street2, city, state, postalcode, status, house_number, street_name, borough_code, place_name, sname, hnum, boro, @@ -88,22 +109,46 @@ SELECT indexnumberid, street1, street2, city, state, postalcode, status, house_number, street_name, borough_code, place_name, sname, hnum, boro, lat, bin, bbl, cd, ct, council, grc, grc2, msg, msg2, lon, zip_code -FROM oca_addresses_staging; +FROM oca_addresses_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); UPDATE oca_addresses AS o SET geom = ST_SetSRID(ST_Point(o.lon, o.lat), 4326) -WHERE o.indexnumberid IN (SELECT indexnumberid FROM 
oca_index_staging) +WHERE o.indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids) AND o.lat IS NOT NULL AND o.lon IS NOT NULL; -INSERT INTO oca_parties SELECT * FROM oca_parties_staging; -INSERT INTO oca_events SELECT * FROM oca_events_staging; -INSERT INTO oca_appearances SELECT * FROM oca_appearances_staging; -INSERT INTO oca_appearance_outcomes SELECT * FROM oca_appearance_outcomes_staging; -INSERT INTO oca_motions SELECT * FROM oca_motions_staging; -INSERT INTO oca_decisions SELECT * FROM oca_decisions_staging; -INSERT INTO oca_judgments SELECT * FROM oca_judgments_staging; -INSERT INTO oca_warrants SELECT * FROM oca_warrants_staging; +INSERT INTO oca_parties +SELECT * FROM oca_parties_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_events +SELECT * FROM oca_events_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_appearances +SELECT * FROM oca_appearances_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_appearance_outcomes +SELECT * FROM oca_appearance_outcomes_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_motions +SELECT * FROM oca_motions_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_decisions +SELECT * FROM oca_decisions_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_judgments +SELECT * FROM oca_judgments_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); + +INSERT INTO oca_warrants +SELECT * FROM oca_warrants_staging +WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids); -- Metadata merge (no nested transaction; must run before staging drops). 
CREATE TABLE oca_metadata_temp AS @@ -118,6 +163,12 @@ FULL OUTER JOIN oca_metadata_staging oms ON om.indexnumberid = oms.indexnumberid DROP TABLE oca_metadata; ALTER TABLE oca_metadata_temp RENAME TO oca_metadata; +-- Apply tombstones merged this batch (delete-only incr may add new deletedate rows). +DELETE FROM oca_index +WHERE indexnumberid IN ( + SELECT indexnumberid FROM oca_metadata WHERE deletedate IS NOT NULL +); + DROP TABLE IF EXISTS oca_index_staging CASCADE; DROP TABLE IF EXISTS oca_causes_staging CASCADE; DROP TABLE IF EXISTS oca_addresses_staging CASCADE; diff --git a/lib/sql/purge_tombstoned_cases.sql b/lib/sql/purge_tombstoned_cases.sql new file mode 100644 index 0000000..1b74515 --- /dev/null +++ b/lib/sql/purge_tombstoned_cases.sql @@ -0,0 +1,9 @@ +-- Remove production case data for tombstoned indexnumberids (oca_metadata.deletedate). +-- Child rows cascade from oca_index; oca_metadata rows are preserved. + +DELETE FROM oca_index +WHERE indexnumberid IN ( + SELECT m.indexnumberid + FROM oca_metadata m + WHERE m.deletedate IS NOT NULL +); diff --git a/oca_deletion_backfill.py b/oca_deletion_backfill.py new file mode 100644 index 0000000..0bd8af4 --- /dev/null +++ b/oca_deletion_backfill.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python + +import argparse +import os + +import dotenv + +from lib.database import Database +from lib.etl_run_manifest import EtlRunManifest +from lib.etl_stages import purge_tombstoned_cases + +dotenv.load_dotenv() + + +def parse_args(): + parser = argparse.ArgumentParser( + description=( + 'Remove oca_index rows (and child tables via CASCADE) for cases ' + 'with oca_metadata.deletedate set' + ), + ) + parser.add_argument( + '--db-schema', + default=os.environ.get('DB_SCHEMA', ''), + help='Database schema search_path target', + ) + return parser.parse_args() + + +def run_deletion_backfill(db_args, runtime_args=None): + """Purge production case data for metadata tombstones.""" + runtime_args = runtime_args or {} + db_schema = 
runtime_args.get('db_schema') or db_args.get('schema') or 'public' + + db = Database(**db_args) + manifest = EtlRunManifest( + db=db, + schema_name=db_schema, + s3_prefix='', + mode='deletion_backfill', + reprocess_glob='', + force_reprocess=False, + ) + manifest.setup_tables() + manifest.create_run() + + try: + db.ensure_connection() + orphan_before, orphan_after = purge_tombstoned_cases(manifest, db) + manifest.mark_run_completed(0, 0, 0) + print( + f'Deletion backfill complete; ' + f'orphans before={orphan_before}, after={orphan_after}' + ) + return orphan_before, orphan_after + except Exception as exc: + manifest.mark_run_failed(exc) + raise + + +def main(): + args = parse_args() + db_args = { + 'db_url': os.environ.get('DATABASE_URL', ''), + 'schema': args.db_schema, + } + runtime_args = {'db_schema': args.db_schema} + run_deletion_backfill(db_args, runtime_args) + + +if __name__ == '__main__': + main() diff --git a/tests/test_deletion_backfill.py b/tests/test_deletion_backfill.py new file mode 100644 index 0000000..039801e --- /dev/null +++ b/tests/test_deletion_backfill.py @@ -0,0 +1,35 @@ +import unittest +from unittest import mock + +from lib.etl_stages import count_tombstone_orphans, purge_tombstoned_cases + + +class TombstoneOrphanCountTests(unittest.TestCase): + def test_count_tombstone_orphans(self): + db = mock.Mock() + db.sql_fetch_one.return_value = (3,) + self.assertEqual(count_tombstone_orphans(db), 3) + self.assertIn('oca_metadata', db.sql_fetch_one.call_args[0][0]) + self.assertIn('deletedate IS NOT NULL', db.sql_fetch_one.call_args[0][0]) + + +class PurgeTombstonedCasesTests(unittest.TestCase): + def test_purge_runs_sql_and_records_manifest(self): + db = mock.Mock() + db.sql_fetch_one.side_effect = [(5,), (0,)] + manifest = mock.Mock() + + before, after = purge_tombstoned_cases(manifest, db) + + self.assertEqual((before, after), (5, 0)) + db.execute_sql_file.assert_called_once_with('purge_tombstoned_cases.sql') + 
manifest.upsert_step.assert_any_call('deletion_backfill', 'running') + manifest.upsert_step.assert_any_call( + 'deletion_backfill', + 'completed', + details={'orphan_count_before': 5, 'orphan_count_after': 0}, + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_promotion.py b/tests/test_promotion.py index 8b3e547..4b96c3d 100644 --- a/tests/test_promotion.py +++ b/tests/test_promotion.py @@ -7,6 +7,7 @@ from lib.etl_promotion import ( ADDRESS_NATURAL_KEY_COLUMNS, PROMOTION_SQL_FILE, + PURGE_TOMBSTONED_CASES_SQL_FILE, promote_staging_to_main, promotion_counts_checksum, promotion_table_counts, @@ -114,9 +115,27 @@ def test_single_transaction_session_role_reset(self): self.assertIn('SET session_replication_role = replica', self.sql) self.assertIn('SET session_replication_role = default', self.sql) - def test_oca_index_upsert_not_delete(self): + def test_oca_index_upsert_with_tombstone_purge(self): self.assertIn('ON CONFLICT (indexnumberid) DO UPDATE', self.sql) - self.assertNotRegex(self.sql, r'DELETE FROM oca_index\b') + self.assertRegex(self.sql, r'DELETE FROM oca_index\b') + self.assertGreaterEqual(self.sql.count('DELETE FROM oca_index'), 2) + + def test_tombstone_temp_tables_and_filtered_staging(self): + self.assertIn('CREATE TEMP TABLE tombstoned_ids', self.sql) + self.assertIn('CREATE TEMP TABLE promotion_active_staging_ids', self.sql) + self.assertIn('oca_metadata_staging WHERE deletedate IS NOT NULL', self.sql) + self.assertIn( + 'FROM oca_index_staging\nWHERE indexnumberid IN ' + '(SELECT indexnumberid FROM promotion_active_staging_ids)', + self.sql, + ) + + def test_child_inserts_exclude_tombstones(self): + self.assertIn( + 'FROM oca_causes_staging\n' + 'WHERE indexnumberid IN (SELECT indexnumberid FROM promotion_active_staging_ids)', + self.sql, + ) def test_addresses_use_natural_key_delete(self): for col in ADDRESS_NATURAL_KEY_COLUMNS: @@ -134,6 +153,14 @@ def test_all_staging_tables_dropped(self): self.assertIn(f'DROP TABLE IF 
EXISTS {table}_staging', self.sql) +class PurgeTombstonedCasesSqlContractTests(unittest.TestCase): + def test_purge_deletes_index_not_metadata(self): + sql = (SQL_DIR / PURGE_TOMBSTONED_CASES_SQL_FILE).read_text(encoding='utf-8') + self.assertRegex(sql, r'DELETE FROM oca_index\b') + self.assertIn('oca_metadata', sql) + self.assertNotRegex(sql, r'DELETE FROM oca_metadata\b') + + class DatabaseTransactionTests(unittest.TestCase): @staticmethod def _mock_connection(): From e3cc8a1c6aae84e1e56a09d6c42d1dd7851225df Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 4 Jun 2026 16:17:08 -0400 Subject: [PATCH 28/30] Handle failed parse (#23) Previously failures during xml case parsing were printed but not handled in any other way so it would be easy to miss the problem. This adds some additional safety measures to make sure it's clear when there is a parsing failure and when it needs to be corrected. By default the rest of the file(s) continue the rest of the pipeline, since I think it's better to update with a few cases missing and then reprocess later to correct it rather than to fail right away when there might not be time to reprocess before the data is needed (context is justfix sends out an email monday morning that uses the data) and if multiple files are being processed at once then later only the single file that had the parsing failure can be rerun. Details on the failures are recorded in the "manifest" tables that were already added as part of the refactor.
--- .env.example | 3 + docker-compose.yml | 7 +- docs/operations/weekly-etl-scheduling.md | 3 + lib/README.md | 4 +- lib/etl.py | 13 +- lib/etl_run_manifest.py | 21 +- lib/etl_stages.py | 48 +++- lib/parse_manifest.py | 109 +++++++++ lib/parse_write_buffer.py | 42 +++- lib/parsers.py | 106 +++++++-- oca_update.py | 2 + tests/test_parse_failure_manifest.py | 287 +++++++++++++++++++++++ tests/test_parser_batching.py | 18 ++ tests/test_parser_regression_safety.py | 47 +++- tests/test_promotion.py | 78 ++++++ tests/test_run_manifest.py | 34 ++- tests/test_runtime_controls.py | 3 + 17 files changed, 785 insertions(+), 40 deletions(-) create mode 100644 lib/parse_manifest.py create mode 100644 tests/test_parse_failure_manifest.py diff --git a/.env.example b/.env.example index 71c758d..0dd2055 100644 --- a/.env.example +++ b/.env.example @@ -16,6 +16,9 @@ REPROCESS_GLOB= # When true, replay REPROCESS_GLOB matches even if already completed in etl_files manifest FORCE_REPROCESS=false +# When true, fail parse_xml and abort before export/promote if any case-level parse errors occur +PARSE_FAIL_FAST=false + # Geocoding and CSV tuning (optional; safe defaults preserve current behavior) GEOCODE_WORKERS= CENSUS_BATCH_CHUNK_SIZE=2500 diff --git a/docker-compose.yml b/docker-compose.yml index 950a6d9..2518a63 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -6,6 +6,8 @@ services: user: 'root' volumes: - .:/app + # Keep Linux .venv out of the host bind mount (macOS .venv breaks imports in-container). 
+ - app_venv:/app/.venv environment: DATABASE_URL: ${DATABASE_URL} AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID} @@ -22,4 +24,7 @@ services: # > jupyter notebook --allow-root --ip 0.0.0.0 --no-browser # ports: # - 8888:8888 - tty: true \ No newline at end of file + tty: true + +volumes: + app_venv: \ No newline at end of file diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md index 4b36bc2..a0182c8 100644 --- a/docs/operations/weekly-etl-scheduling.md +++ b/docs/operations/weekly-etl-scheduling.md @@ -25,6 +25,7 @@ Use Docker (or the published image `justfixnyc/oca:latest`) with credentials sup | `S3_PREFIX` | Key prefix for `private/` and `public/` | empty → bucket root | | `REPROCESS_GLOB` | Replay zip files from S3 `private/` | empty | | `FORCE_REPROCESS` | Replay manifest-completed files | `false` | +| `PARSE_FAIL_FAST` | Fail `parse_xml` and abort before export/promote when any zip has case-level parse failures | `false` | | `GEOCODE_WORKERS` | Geosupport pool size | CPU count | | `CENSUS_BATCH_CHUNK_SIZE` | Census batch chunk | `2500` | | `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess chunk | `1000` | @@ -33,6 +34,8 @@ Refactor and E2E runs must set `S3_PREFIX=refactor/` (or another isolated prefix Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if geocoding approaches the limit. +**Parse failures (default lenient):** With `PARSE_FAIL_FAST=false`, weekly runs still promote and publish; zips with any `cases_failed` in manifest `etl_files.details` do **not** reach `status = 'completed'` (requires **`cases_failed = 0`**) and are omitted from `completed_reprocess_files` on later `REPROCESS_GLOB` runs (no `FORCE_REPROCESS` needed to retry them). Set `PARSE_FAIL_FAST=true` to stop the run before export/promote. + ## Publish behavior - **Geocode timing:** weekly runs geocode all rows in `oca_addresses_staging.csv` locally (`geocode_staging` manifest step) before uploading staging CSVs to S3. 
Promotion imports coordinates (and sets `geom` on `oca_addresses`); there is no post-promotion RDS geocode in `oca_etl()`. diff --git a/lib/README.md b/lib/README.md index a64dd84..b3cbeed 100644 --- a/lib/README.md +++ b/lib/README.md @@ -35,7 +35,7 @@ Each weekly run is orchestrated sequentially in `oca_etl()`. There is **no** pos |-------|--------|--------------| | Select files | `etl_file_selection.py`, `etl_stages.select_input_files` | Picks new SFTP zips and/or S3 private replays (`REPROCESS_GLOB`); skips manifest-completed files unless `FORCE_REPROCESS=true`. | | Download | `etl_stages.download_selected_files` | New files from SFTP; replay files from S3 private backup. | -| Parse | `parsers.py`, `duckdb_database.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py`. Address rows have no lat/lon until CSV geocode. | +| Parse | `parsers.py`, `duckdb_database.py`, `parse_manifest.py` | Streaming XML parse into local DuckDB (`staging.duckdb`); batched writes via `parse_write_buffer.py` with per-case windows (`begin_case` / `discard_case` on error—no partial case rows). Per-zip `cases_seen` / `cases_parsed_ok` / `cases_failed` (+ capped `error_samples`) on `etl_files.details`. Default lenient: promote/publish still run; `etl_files.status = 'completed'` only when **`cases_failed = 0`** after promote. `PARSE_FAIL_FAST` aborts before export/promote. Address rows have no lat/lon until CSV geocode. | | Export staging | `etl_stages.export_staging_csvs` | DuckDB `COPY` with Postgres-compatible transforms; manifest step `export_staging`. No S3 upload yet. | | Geocode staging | `etl_geocode.geocode_staging_addresses_csv`, `etl_stages.geocode_staging_csvs` | Geocode **every** row in `oca_addresses_staging.csv`; write `oca_addresses_staging_geocoded.csv`, copy over staging CSV; manifest step `geocode_staging`. 
| | Upload staging | `etl_stages.upload_staging_csvs`, `etl_publish.list_staging_csvs_in_dir` | Upload only whitelisted `{table}_staging.csv` files (from `OCA_TABLES`); ignores geocoder temps and other junk; manifest step `upload_staging`. | @@ -104,7 +104,7 @@ Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. - **Manifest** — weekly runs record `export_staging`, `geocode_staging`, `upload_staging`, `promote_staging`, `publish_public`, `normalize_s3_encryption`, `upload_private`. Backfill runs record only `geocode_refresh`. - **Connection resilience** — TCP keepalives and `ensure_connection()` before promote and before publish. -- **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips completed files unless `FORCE_REPROCESS=true`. +- **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips files in `completed_reprocess_files` (promoted with `cases_failed = 0`) unless `FORCE_REPROCESS=true`. Zips with prior case-level parse failures stay eligible for reprocess without force. - **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. - **Weekly geocode** — all staging CSV address rows (re-geocodes rows that already have lat/lon in the file). - **Backfill geocode** — only `lat IS NULL` with a house number; upsert matches on address line columns, not `indexnumberid` alone. 
diff --git a/lib/etl.py b/lib/etl.py index fa5634a..b0a48c8 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -29,6 +29,7 @@ promotion_table_counts, ) from .etl_run_manifest import EtlRunManifest, completed_reprocess_files, manifest_step +from .parse_manifest import ParseFailFastError, file_names_needing_reprocess from .etl_stages import ( FileSelection, download_selected_files, @@ -83,6 +84,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None geocode_workers = runtime_args.get('geocode_workers') or multiprocessing.cpu_count() census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 csv_row_check_chunk_size = runtime_args.get('csv_row_check_chunk_size') or 1000 + parse_fail_fast = bool(runtime_args.get('parse_fail_fast')) db = Database(**db_args) manifest = EtlRunManifest( @@ -114,7 +116,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None return True download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection) - parse_xml_to_staging(manifest, staging_db, priv_dir) + parse_xml_to_staging(manifest, staging_db, priv_dir, parse_fail_fast=parse_fail_fast) export_staging_csvs( manifest, staging_db, pub_dir, csv_preprocess_chunk_size=csv_row_check_chunk_size, @@ -140,14 +142,17 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None normalize_public_s3_encryption(manifest, s3, published_keys) upload_private_source_files(manifest, s3, priv_dir, s3_prefix) + files_needing_reprocess = file_names_needing_reprocess(manifest.file_details_by_name) + processed_count = len(selection.selected_zip_files) - len(files_needing_reprocess) manifest.mark_run_completed( len(selection.selected_zip_files), - len(selection.selected_zip_files), - len(selection.skipped_reprocess_files) + processed_count, + len(selection.skipped_reprocess_files), + files_needing_reprocess=files_needing_reprocess or None, ) return True except Exception as exc: - if selection and 
selection.selected_zip_files: + if selection and selection.selected_zip_files and not isinstance(exc, ParseFailFastError): for selected_name in selection.selected_zip_files: source = 'sftp' if selected_name in selection.new_file_set else 's3_private' manifest.upsert_file(selected_name, source=source, status='failed', stage='run', error=exc) diff --git a/lib/etl_run_manifest.py b/lib/etl_run_manifest.py index 2cce7e8..31e2052 100644 --- a/lib/etl_run_manifest.py +++ b/lib/etl_run_manifest.py @@ -23,6 +23,7 @@ def __init__(self, db, schema_name, s3_prefix, mode, reprocess_glob, force_repro self.reprocess_glob = reprocess_glob or '' self.force_reprocess = force_reprocess self.run_id = str(uuid.uuid4()) + self.file_details_by_name = {} def setup_tables(self): self.db.execute_sql_file('create_etl_manifest_tables.sql') @@ -45,7 +46,21 @@ def create_run(self): ) """) - def mark_run_completed(self, selected_count, processed_count, skipped_count): + def mark_run_completed( + self, + selected_count, + processed_count, + skipped_count, + files_needing_reprocess=None, + ): + metadata_patch = {} + if files_needing_reprocess: + metadata_patch['files_needing_reprocess'] = list(files_needing_reprocess) + metadata_sql = ( + f", metadata = metadata || {self._json_literal(metadata_patch)}" + if metadata_patch + else "" + ) self.db.sql(f""" UPDATE etl_runs SET status = 'completed', @@ -53,6 +68,7 @@ def mark_run_completed(self, selected_count, processed_count, skipped_count): selected_file_count = {selected_count}, processed_file_count = {processed_count}, skipped_file_count = {skipped_count} + {metadata_sql} WHERE run_id = {self._literal(self.run_id)} """) @@ -69,6 +85,8 @@ def mark_run_failed(self, exc): """) def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + if details is not None: + self.file_details_by_name[file_name] = dict(details) stage_value = "NULL" if stage is None else self._literal(stage) details_value = self._json_literal(details or 
{}) error_message = "NULL" if error is None else self._literal(str(error)) @@ -139,6 +157,7 @@ def completed_reprocess_files(db, reprocess_files): JOIN etl_runs er ON er.run_id = ef.run_id WHERE ef.status = 'completed' AND er.status = 'completed' + AND COALESCE((ef.details->>'cases_failed')::int, 0) = 0 AND ef.file_name IN ({quoted_files}) """) return {row[0] for row in rows} diff --git a/lib/etl_stages.py b/lib/etl_stages.py index f092eb5..1da5a36 100644 --- a/lib/etl_stages.py +++ b/lib/etl_stages.py @@ -22,6 +22,8 @@ from .etl_promotion import ( PURGE_TOMBSTONED_CASES_SQL_FILE, promote_staging_to_main, + promotion_counts_checksum, + promotion_table_counts, ) from .etl_publish import ( ADDRESS_VIEW_EXPORTS, @@ -36,6 +38,11 @@ geocode_staging_addresses_csv, upsert_geocoded_addresses, ) +from .parse_manifest import ( + finalize_parse_xml_step, + upsert_parsed_etl_file, + upsert_promoted_etl_file, +) from .parsers import oca_tag, parse_file @@ -125,7 +132,7 @@ def download_selected_files(manifest, sftp, s3, priv_dir, s3_prefix, selection): manifest.upsert_step('download_files', 'completed') -def parse_xml_to_staging(manifest, staging_db, priv_dir, parse_num_threads=8): +def parse_xml_to_staging(manifest, staging_db, priv_dir, parse_num_threads=8, parse_fail_fast=False): def sort_by_date(file): r = re.search(r'(\d+.+)\.zip', file).group(0).replace('.', ' ') return r @@ -138,6 +145,8 @@ def sort_by_date(file): manifest.upsert_step('parse_xml', 'running') staging_db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') print('Processing files:') + total_cases_failed = 0 + files_with_failures = 0 for zip_file in local_zip_files: file_name = os.path.basename(zip_file) manifest.upsert_file(file_name, source='local', status='processing', stage='parse') @@ -148,17 +157,24 @@ def sort_by_date(file): extract_date = elem.text break with zipfile.ZipFile(zip_file, 'r').open(DATA_FILENAME) as xml_file: - parse_file( + parse_result = parse_file( xml_file, staging_db, 
extract_date, num_threads=parse_num_threads, + file_name=file_name, ) - manifest.upsert_file( - file_name, source='local', status='parsed', stage='parse', - details={'extract_date': extract_date} - ) - manifest.upsert_step('parse_xml', 'completed') + failed = upsert_parsed_etl_file(manifest, file_name, parse_result, extract_date) + total_cases_failed += failed + if failed > 0: + files_with_failures += 1 + + finalize_parse_xml_step( + manifest, + total_cases_failed, + files_with_failures, + parse_fail_fast=parse_fail_fast, + ) def export_staging_to_csv( @@ -277,12 +293,26 @@ def import_and_promote_staging(manifest, db, pub_dir, s3_args, s3_prefix, select db.execute_sql_file('normalize_staging_after_import.sql') db.execute_sql_file('update_appearance_outcomes.sql') + counts_before = promotion_table_counts(db) + checksum_before = promotion_counts_checksum(counts_before) print('\t...Promoting staging tables to main (single transaction)') promote_staging_to_main(db) + counts_after = promotion_table_counts(db) + checksum_after = promotion_counts_checksum(counts_after) for selected_name in selection.selected_zip_files: source = 'sftp' if selected_name in selection.new_file_set else 's3_private' - manifest.upsert_file(selected_name, source=source, status='completed', stage='promote') - manifest.upsert_step('promote_staging', 'completed') + parse_details = manifest.file_details_by_name.get(selected_name, {}) + upsert_promoted_etl_file(manifest, selected_name, source, parse_details) + manifest.upsert_step( + 'promote_staging', + 'completed', + details={ + 'counts_before': counts_before, + 'counts_after': counts_after, + 'checksum_before': checksum_before, + 'checksum_after': checksum_after, + }, + ) return imported_staging_tables diff --git a/lib/parse_manifest.py b/lib/parse_manifest.py new file mode 100644 index 0000000..85c725a --- /dev/null +++ b/lib/parse_manifest.py @@ -0,0 +1,109 @@ +"""Manifest details for per-zip parse results (no heavy ETL imports).""" + + +class 
ParseFailFastError(RuntimeError): + """Raised when PARSE_FAIL_FAST is set and any zip has case-level parse failures.""" + + +def cases_failed_from_details(details): + if not details: + return 0 + return int(details.get('cases_failed') or 0) + + +def build_parsed_file_details(extract_date, parse_result): + return { + 'extract_date': extract_date, + 'cases_seen': parse_result.cases_seen, + 'cases_parsed_ok': parse_result.cases_parsed_ok, + 'cases_failed': parse_result.cases_failed, + 'error_samples': parse_result.error_samples, + } + + +def upsert_parsed_etl_file(manifest, file_name, parse_result, extract_date): + """Record per-zip parse counters on etl_files (status parsed).""" + details = build_parsed_file_details(extract_date, parse_result) + upsert_kwargs = { + 'file_name': file_name, + 'source': 'local', + 'status': 'parsed', + 'stage': 'parse', + 'details': details, + } + if parse_result.cases_failed > 0: + upsert_kwargs['error'] = ( + f"{parse_result.cases_failed} of {parse_result.cases_seen} cases failed to parse" + ) + manifest.upsert_file(**upsert_kwargs) + return parse_result.cases_failed + + +def build_parse_xml_step_details(total_cases_failed, files_with_failures): + return { + 'total_cases_failed': total_cases_failed, + 'files_with_failures': files_with_failures, + } + + +def upsert_promoted_etl_file(manifest, file_name, source, parse_details): + """Mark file completed after promote only when parse had zero failures.""" + cases_failed = cases_failed_from_details(parse_details) + if cases_failed > 0: + details = dict(parse_details) + details['parse_complete'] = False + manifest.upsert_file( + file_name, + source=source, + status='parsed', + stage='parse', + details=details, + ) + return False + manifest.upsert_file( + file_name, + source=source, + status='completed', + stage='promote', + details=parse_details, + ) + return True + + +def file_names_needing_reprocess(file_details_by_name): + return sorted( + name + for name, details in 
file_details_by_name.items() + if cases_failed_from_details(details) > 0 + ) + + +def finalize_parse_xml_step(manifest, total_cases_failed, files_with_failures, parse_fail_fast=False): + """Complete or fail the parse_xml manifest step; optionally abort before export/promote.""" + step_details = build_parse_xml_step_details(total_cases_failed, files_with_failures) + if parse_fail_fast and total_cases_failed > 0: + for file_name, details in manifest.file_details_by_name.items(): + if cases_failed_from_details(details) > 0: + manifest.upsert_file( + file_name, + source='local', + status='failed', + stage='parse', + details=details, + error=ParseFailFastError( + f"{details.get('cases_failed', 0)} case(s) failed in {file_name}" + ), + ) + manifest.upsert_step( + 'parse_xml', + 'failed', + details=step_details, + error=ParseFailFastError( + f"PARSE_FAIL_FAST: {total_cases_failed} case failure(s) across " + f"{files_with_failures} file(s)" + ), + ) + raise ParseFailFastError( + f"PARSE_FAIL_FAST: aborting before export/promote ({total_cases_failed} case failure(s))" + ) + manifest.upsert_step('parse_xml', 'completed', details=step_details) diff --git a/lib/parse_write_buffer.py b/lib/parse_write_buffer.py index 86a6cf3..5ca064b 100644 --- a/lib/parse_write_buffer.py +++ b/lib/parse_write_buffer.py @@ -61,6 +61,8 @@ def __init__(self, db: DuckDB, config: ParseWriteConfig): self._inserts: dict[str, list[tuple | None]] = {} self._cases_in_window = 0 self._flush_count = 0 + self._in_case_window = False + self._case_mark: tuple[int, dict[str, int]] | None = None def _pending_insert_count(self) -> int: return sum(len(rows) for rows in self._inserts.values()) @@ -70,9 +72,47 @@ def queue_delete(self, sql: str, params: tuple | None) -> None: def queue_insert(self, sql: str, params: tuple | None) -> None: self._inserts.setdefault(sql, []).append(params) - if self._pending_insert_count() >= self.config.batch_size: + if ( + not self._in_case_window + and self._pending_insert_count() 
>= self.config.batch_size + ): self.flush(reason='batch_size') + def begin_case(self) -> None: + """Start a whole-case write window (metadata + children).""" + self._case_mark = ( + len(self._deletes), + {sql: len(rows) for sql, rows in self._inserts.items()}, + ) + self._in_case_window = True + + def discard_case(self) -> None: + """Drop queued writes for the current case without executing them.""" + if not self._in_case_window: + return + mark = self._case_mark + if mark is not None: + del_len, insert_lens = mark + del self._deletes[del_len:] + for sql in list(self._inserts): + prev = insert_lens.get(sql, 0) + rows = self._inserts[sql] + if prev >= len(rows): + del self._inserts[sql] + else: + del rows[prev:] + if not rows: + del self._inserts[sql] + self._case_mark = None + self._in_case_window = False + + def commit_case(self) -> None: + """End a successful case window and apply cross-case cadence flush.""" + if self._in_case_window: + self._case_mark = None + self._in_case_window = False + self.on_case_complete() + def on_case_complete(self) -> None: self._cases_in_window += 1 if self._cases_in_window >= self.config.flush_every_n_cases: diff --git a/lib/parsers.py b/lib/parsers.py index 8e5a934..bcefa35 100644 --- a/lib/parsers.py +++ b/lib/parsers.py @@ -1,6 +1,7 @@ import logging import queue import threading +from dataclasses import dataclass, field from lxml import etree @@ -9,6 +10,41 @@ logger = logging.getLogger(__name__) PARSE_PROGRESS_INTERVAL = 1000 +MAX_PARSE_ERROR_SAMPLES = 10 +MAX_PARSE_ERROR_SAMPLE_LEN = 500 + + +@dataclass +class ParseFileResult: + """Per-zip parse health counters (thread-safe).""" + + cases_seen: int = 0 + cases_parsed_ok: int = 0 + cases_failed: int = 0 + error_samples: list[str] = field(default_factory=list) + _lock: threading.Lock = field(default_factory=threading.Lock, repr=False, compare=False) + + def record_seen(self) -> None: + with self._lock: + self.cases_seen += 1 + + def record_ok(self) -> None: + with self._lock: + 
self.cases_parsed_ok += 1 + + def record_failed(self, error: str) -> None: + sample = _truncate_parse_error(str(error)) + with self._lock: + self.cases_failed += 1 + if len(self.error_samples) < MAX_PARSE_ERROR_SAMPLES: + self.error_samples.append(sample) + + +def _truncate_parse_error(message: str) -> str: + if len(message) <= MAX_PARSE_ERROR_SAMPLE_LEN: + return message + return message[: MAX_PARSE_ERROR_SAMPLE_LEN - 3] + '...' + NAMESPACE = '{http://www.example.org/LandlordTenantExtractSchema}' @@ -24,6 +60,11 @@ def oca_tag(tag): DELETE_TAG = oca_tag('Delete') +def _index_number_id_from_case(case) -> str | None: + elem = case.find(INDEX_NUMBER_ID_TAG) + return None if elem is None else elem.text + + def is_case_to_delete(case): """ Determine if a case should be deleted from the database @@ -513,14 +554,16 @@ def parse_case(case, db, extract_date): :param db: a DuckDB object :param extract_date: date of extract """ - + buffer = getattr(db, 'write_buffer', None) + if buffer is not None: + buffer.begin_case() + update_metadata(case, db, extract_date) # If this case is flagged for removal, skip the parsing steps if is_case_to_delete(case): - buffer = getattr(db, 'write_buffer', None) if buffer is not None: - buffer.on_case_complete() + buffer.commit_case() return parse_index(case, db) @@ -534,12 +577,11 @@ def parse_case(case, db, extract_date): parse_judgments(case, db) parse_warrants(case, db) - buffer = getattr(db, 'write_buffer', None) if buffer is not None: - buffer.on_case_complete() + buffer.commit_case() -def _worker_thread(case_queue, db_queue, extract_date, thread_id): +def _worker_thread(case_queue, db_queue, extract_date, thread_id, stats: ParseFileResult): """Worker thread that processes cases from the queue""" while True: try: @@ -551,9 +593,28 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id): thread_db = db_queue.get() try: parse_case(case, thread_db, extract_date) + stats.record_ok() except Exception as e: - print(f"Thread 
{thread_id}: Error parsing case: {e}") - flush_write_buffer(thread_db, reason='parse_error') + index_id = _index_number_id_from_case(case) + if index_id: + logger.warning( + "Parse case failed thread=%s indexnumberid=%s: %s", + thread_id, + index_id, + e, + exc_info=True, + ) + else: + logger.warning( + "Parse case failed thread=%s: %s", + thread_id, + e, + exc_info=True, + ) + buffer = getattr(thread_db, 'write_buffer', None) + if buffer is not None: + buffer.discard_case() + stats.record_failed(str(e)) finally: # Clear the case copy from memory case.clear() @@ -565,7 +626,7 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id): case_queue.task_done() -def parse_file(xml_file, staging_db, extract_date, num_threads=8): +def parse_file(xml_file, staging_db, extract_date, num_threads=8, file_name=None): """ Parse XML file with multiple threads @@ -573,9 +634,12 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): :param staging_db: DuckDB database object :param extract_date: date of extract :param num_threads: number of worker threads (increasing this doesn't speed up much, bottleneck is the database writes) + :param file_name: basename for summary logging (optional) + :return: ParseFileResult with per-zip counters """ from .duckdb_database import DuckDB + stats = ParseFileResult() case_queue = queue.Queue(maxsize=num_threads * 10) db_queue = queue.Queue() @@ -588,8 +652,8 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): threads = [] for i in range(num_threads): t = threading.Thread( - target=_worker_thread, - args=(case_queue, db_queue, extract_date, i) + target=_worker_thread, + args=(case_queue, db_queue, extract_date, i, stats), ) t.start() threads.append(t) @@ -597,14 +661,12 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): # Parse XML and feed cases to queue context = etree.iterparse(xml_file, tag=oca_tag('Index')) - - total_cases = 0 for _, case in context: case_copy = 
etree.fromstring(etree.tostring(case)) case_queue.put(case_copy) - total_cases += 1 - if total_cases % PARSE_PROGRESS_INTERVAL == 0: - logger.info("Parsed %s cases", total_cases) + stats.record_seen() + if stats.cases_seen % PARSE_PROGRESS_INTERVAL == 0: + logger.info("Parsed %s cases", stats.cases_seen) # Clear the case element to free memory case.clear() @@ -624,6 +686,14 @@ def parse_file(xml_file, staging_db, extract_date, num_threads=8): thread_db = db_queue.get() flush_write_buffer(thread_db) thread_db.close() - - logger.info("Processed %s cases with %s threads", total_cases, num_threads) + + label = file_name or getattr(xml_file, 'name', None) or 'unknown' + logger.info( + "Parse zip summary file=%s seen=%d ok=%d failed=%d", + label, + stats.cases_seen, + stats.cases_parsed_ok, + stats.cases_failed, + ) + return stats diff --git a/oca_update.py b/oca_update.py index e17679c..32d1745 100644 --- a/oca_update.py +++ b/oca_update.py @@ -27,6 +27,7 @@ def parse_args(): parser.add_argument('--s3-prefix', default=os.environ.get('S3_PREFIX', ''), help='Optional S3 prefix namespace for private/public files') parser.add_argument('--reprocess-glob', default=os.environ.get('REPROCESS_GLOB', ''), help='Filename glob for S3 private zip reprocessing') parser.add_argument('--force-reprocess', action='store_true', default=parse_bool(os.environ.get('FORCE_REPROCESS')), help='Reprocess matched files even if already in S3 private backup') + parser.add_argument('--parse-fail-fast', action='store_true', default=parse_bool(os.environ.get('PARSE_FAIL_FAST')), help='Abort run before export/promote when any case-level parse failures occur') parser.add_argument('--geocode-workers', type=int, default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), help='Worker process count for geocode pool') parser.add_argument('--census-batch-chunk-size', type=int, default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), help='Chunk size for census batch geocoder input') 
parser.add_argument('--csv-row-check-chunk-size', type=int, default=int(os.environ.get('CSV_ROW_CHECK_CHUNK_SIZE', '1000')), help='Chunk size used for constant-memory CSV non-empty checks') @@ -75,6 +76,7 @@ def main(): 'geocode_workers': args.geocode_workers, 'census_batch_chunk_size': args.census_batch_chunk_size, 'csv_row_check_chunk_size': args.csv_row_check_chunk_size, + 'parse_fail_fast': args.parse_fail_fast, } oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args) diff --git a/tests/test_parse_failure_manifest.py b/tests/test_parse_failure_manifest.py new file mode 100644 index 0000000..2277b7c --- /dev/null +++ b/tests/test_parse_failure_manifest.py @@ -0,0 +1,287 @@ +"""Parse failure counters and manifest details (Task 1 observability).""" + +from __future__ import annotations + +import io +import os +import tempfile +import unittest +import zipfile +from unittest.mock import patch + +from lib.duckdb_database import DuckDB +from lib.etl_constants import DATA_FILENAME +from lib.parse_manifest import ( + ParseFailFastError, + build_parse_xml_step_details, + cases_failed_from_details, + finalize_parse_xml_step, + upsert_parsed_etl_file, + upsert_promoted_etl_file, +) +from lib.parse_write_buffer import attach_write_buffer, flush_write_buffer +from lib.parsers import ( + MAX_PARSE_ERROR_SAMPLES, + MAX_PARSE_ERROR_SAMPLE_LEN, + ParseFileResult, + parse_case, + parse_file, +) + +from parser_xml_fixtures import build_extract_xml, write_test_zip + + +class FakeManifest: + def __init__(self): + self.file_upserts = [] + self.step_upserts = [] + self.file_details_by_name = {} + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + if details is not None: + self.file_details_by_name[file_name] = dict(details) + self.file_upserts.append({ + 'file_name': file_name, + 'source': source, + 'status': status, + 'stage': stage, + 'details': details or {}, + 'error': error, + }) + + def upsert_step(self, step_name, 
status, details=None, error=None): + self.step_upserts.append({ + 'step_name': step_name, + 'status': status, + 'details': details or {}, + 'error': error, + }) + + +def _init_staging_db(path: str) -> DuckDB: + db = DuckDB(path) + db.execute_sql_file('lib/sql/create_tables_staging_duckdb.sql') + attach_write_buffer(db) + return db + + +def _parse_zip_bytes(xml_bytes: bytes, db: DuckDB, extract_date: str = '2024-03-08') -> ParseFileResult: + buf = io.BytesIO() + with zipfile.ZipFile(buf, 'w') as zf: + zf.writestr(DATA_FILENAME, xml_bytes) + buf.seek(0) + with zipfile.ZipFile(buf, 'r') as zf: + with zf.open(DATA_FILENAME) as xml_file: + return parse_file(xml_file, db, extract_date, num_threads=1) + + +class ParseFileResultTests(unittest.TestCase): + def test_cases_seen_matches_index_count(self): + xml_bytes = build_extract_xml(12, child_profile='weekly') + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + self.assertEqual(result.cases_seen, 12) + self.assertEqual(result.cases_parsed_ok, 12) + self.assertEqual(result.cases_failed, 0) + self.assertEqual(result.error_samples, []) + + def test_injected_failures_increment_counters_and_samples(self): + xml_bytes = build_extract_xml(5, child_profile='weekly') + fail_ids = {2, 4} + seen = {'n': 0} + + def parse_case_maybe_fail(case, db, extract_date): + seen['n'] += 1 + if seen['n'] in fail_ids: + raise RuntimeError(f'injected failure case {seen["n"]}') + parse_case(case, db, extract_date) + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + self.assertEqual(result.cases_seen, 5) + self.assertEqual(result.cases_parsed_ok, 3) + 
self.assertEqual(result.cases_failed, 2) + self.assertEqual(len(result.error_samples), 2) + self.assertTrue(all('injected failure' in s for s in result.error_samples)) + + def test_error_samples_capped_at_ten(self): + xml_bytes = build_extract_xml(15, child_profile='weekly') + seen = {'n': 0} + + def parse_case_always_fail(case, db, extract_date): + seen['n'] += 1 + raise RuntimeError('always fail') + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_always_fail): + result = _parse_zip_bytes(xml_bytes, db) + finally: + db.close() + + self.assertEqual(result.cases_failed, 15) + self.assertEqual(len(result.error_samples), MAX_PARSE_ERROR_SAMPLES) + + def test_error_sample_truncation(self): + long_msg = 'x' * (MAX_PARSE_ERROR_SAMPLE_LEN + 50) + stats = ParseFileResult() + stats.record_failed(long_msg) + self.assertEqual(len(stats.error_samples[0]), MAX_PARSE_ERROR_SAMPLE_LEN) + self.assertTrue(stats.error_samples[0].endswith('...')) + + def test_parse_logs_indexnumberid_on_failure(self): + xml_bytes = build_extract_xml(1, child_profile='weekly') + case_id = 'LT-BENCH-000000' + + def parse_case_fail(case, db, extract_date): + raise RuntimeError('log test failure') + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + try: + with patch('lib.parsers.parse_case', parse_case_fail): + with self.assertLogs('lib.parsers', level='WARNING') as logs: + _parse_zip_bytes(xml_bytes, db) + finally: + db.close() + + combined = '\n'.join(logs.output) + self.assertIn('indexnumberid=', combined) + self.assertIn(case_id, combined) + + +class ParseManifestUpsertTests(unittest.TestCase): + def test_upsert_parsed_etl_file_and_step_aggregate(self): + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'staging.duckdb')) + manifest = FakeManifest() + fail_on = {3} + + def 
parse_case_maybe_fail(case, db, extract_date): + index_elem = case.find( + '{http://www.example.org/LandlordTenantExtractSchema}IndexNumberId' + ) + case_num = int(index_elem.text.rsplit('-', 1)[-1]) if index_elem is not None else 0 + if case_num in fail_on: + raise RuntimeError('manifest test failure') + parse_case(case, db, extract_date) + + xml_bytes = build_extract_xml(6, child_profile='weekly') + try: + with patch('lib.parsers.parse_case', parse_case_maybe_fail): + result = _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + finally: + db.close() + + upsert_parsed_etl_file( + manifest, + 'LandlordTenant.Incr.2024-03-08.zip', + result, + '2024-03-08', + ) + manifest.upsert_step( + 'parse_xml', + 'completed', + details=build_parse_xml_step_details(result.cases_failed, 1), + ) + + self.assertEqual(len(manifest.file_upserts), 1) + upsert = manifest.file_upserts[0] + details = upsert['details'] + self.assertEqual(upsert['status'], 'parsed') + self.assertEqual(details['cases_seen'], 6) + self.assertEqual(details['cases_parsed_ok'], 5) + self.assertEqual(details['cases_failed'], 1) + self.assertEqual(len(details['error_samples']), 1) + self.assertEqual(details['extract_date'], '2024-03-08') + self.assertIn('1 of 6 cases failed', upsert['error']) + + step_details = manifest.step_upserts[0]['details'] + self.assertEqual(step_details['total_cases_failed'], 1) + self.assertEqual(step_details['files_with_failures'], 1) + + +class PromoteCompletedGateTests(unittest.TestCase): + def test_upsert_promoted_marks_completed_only_when_no_failures(self): + manifest = FakeManifest() + clean_details = { + 'extract_date': '2024-03-08', + 'cases_seen': 10, + 'cases_parsed_ok': 10, + 'cases_failed': 0, + 'error_samples': [], + } + dirty_details = dict(clean_details) + dirty_details['cases_failed'] = 3 + dirty_details['cases_parsed_ok'] = 7 + dirty_details['error_samples'] = ['err'] + + self.assertTrue(upsert_promoted_etl_file(manifest, 'clean.zip', 'sftp', clean_details)) + 
self.assertFalse(upsert_promoted_etl_file(manifest, 'dirty.zip', 's3_private', dirty_details)) + + clean_upsert = manifest.file_upserts[-2] + dirty_upsert = manifest.file_upserts[-1] + self.assertEqual(clean_upsert['status'], 'completed') + self.assertEqual(clean_upsert['stage'], 'promote') + self.assertNotIn('parse_complete', clean_upsert['details']) + + self.assertEqual(dirty_upsert['status'], 'parsed') + self.assertEqual(dirty_upsert['stage'], 'parse') + self.assertFalse(dirty_upsert['details']['parse_complete']) + self.assertEqual(dirty_upsert['details']['cases_failed'], 3) + + def test_cases_failed_from_details_coerces_missing(self): + self.assertEqual(cases_failed_from_details({}), 0) + self.assertEqual(cases_failed_from_details({'cases_failed': '2'}), 2) + + +class ParseFailFastTests(unittest.TestCase): + def test_finalize_parse_fail_fast_marks_step_and_files_failed(self): + manifest = FakeManifest() + manifest.file_details_by_name = { + 'bad.zip': { + 'cases_seen': 5, + 'cases_parsed_ok': 3, + 'cases_failed': 2, + 'error_samples': ['err'], + }, + 'good.zip': { + 'cases_seen': 1, + 'cases_parsed_ok': 1, + 'cases_failed': 0, + 'error_samples': [], + }, + } + + with self.assertRaises(ParseFailFastError): + finalize_parse_xml_step(manifest, 2, 1, parse_fail_fast=True) + + self.assertEqual(manifest.step_upserts[-1]['status'], 'failed') + self.assertEqual(manifest.step_upserts[-1]['step_name'], 'parse_xml') + failed_names = {u['file_name'] for u in manifest.file_upserts if u['status'] == 'failed'} + self.assertEqual(failed_names, {'bad.zip'}) + + def test_lenient_finalize_completes_step_with_failures(self): + manifest = FakeManifest() + finalize_parse_xml_step(manifest, 3, 1, parse_fail_fast=False) + self.assertEqual(manifest.step_upserts[-1]['status'], 'completed') + self.assertEqual(manifest.step_upserts[-1]['details']['total_cases_failed'], 3) + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_parser_batching.py 
b/tests/test_parser_batching.py index cd7e7f1..daf1a71 100644 --- a/tests/test_parser_batching.py +++ b/tests/test_parser_batching.py @@ -121,6 +121,24 @@ def test_parse_file_end_to_end_zip(self): class ParseWriteBufferTests(unittest.TestCase): + def test_discard_case_drops_in_window_writes(self): + from lib.parse_write_buffer import StagingWriteBuffer + + with tempfile.TemporaryDirectory() as tmp: + db = DuckDB(os.path.join(tmp, 'buf.duckdb')) + db.execute('CREATE TABLE t (id INTEGER, v VARCHAR)') + buffer = StagingWriteBuffer( + db, + ParseWriteConfig(enabled=True, batch_size=100, flush_every_n_cases=10), + ) + buffer.begin_case() + buffer.queue_insert('INSERT INTO t VALUES (?, ?)', (1, 'orphan')) + buffer.discard_case() + buffer.flush() + count = db.execute('SELECT COUNT(*) FROM t').fetchone()[0] + db.close() + self.assertEqual(count, 0) + def test_flush_order_deletes_before_inserts(self): from lib.parse_write_buffer import StagingWriteBuffer diff --git a/tests/test_parser_regression_safety.py b/tests/test_parser_regression_safety.py index f818290..63d47fe 100644 --- a/tests/test_parser_regression_safety.py +++ b/tests/test_parser_regression_safety.py @@ -9,10 +9,11 @@ import zipfile from unittest.mock import patch -from lib.duckdb_database import DuckDB, fetch_staging_row_counts +from lib.duckdb_database import STAGING_TABLE_FAMILIES, DuckDB, fetch_staging_row_counts from lib.etl_constants import DATA_FILENAME from lib.etl_stages import export_staging_to_csv from lib.parse_write_buffer import ParseWriteConfig, StagingWriteBuffer, attach_write_buffer, flush_write_buffer +from lib import parsers from lib.parsers import parse_case, parse_file from csv_checksums import md5_dir_csvs @@ -27,6 +28,20 @@ def _init_staging_db(path: str) -> DuckDB: return db +def _staging_counts_for_index(db: DuckDB, index_id: str) -> dict[str, int]: + counts: dict[str, int] = {} + for table_name in STAGING_TABLE_FAMILIES: + try: + row = db.execute( + f'SELECT COUNT(*) FROM {table_name} 
WHERE indexnumberid = ?', + (index_id,), + ).fetchone() + counts[table_name] = int(row[0]) if row else 0 + except Exception: + counts[table_name] = 0 + return counts + + def _parse_zip_bytes( xml_bytes: bytes, db: DuckDB, @@ -125,7 +140,7 @@ def test_cold_rerun_parity(self): class ParserFailureRerunTests(unittest.TestCase): - def test_mid_file_failure_flush_then_rerun_matches_clean_parse(self): + def test_mid_file_failure_discard_then_rerun_matches_clean_parse(self): xml_bytes = build_extract_xml(20, child_profile='weekly') fail_on_case = 8 seen = {'n': 0} @@ -160,6 +175,34 @@ def parse_case_maybe_fail(case, db, extract_date): self.assertEqual(clean_counts, recovery_counts) + def test_failed_mid_case_leaves_no_staging_footprint(self): + """Failure after metadata + partial children leaves no rows for that case.""" + xml_bytes = build_extract_xml(12, child_profile='weekly') + fail_id = 'LT-BENCH-000005' + real_parse_index = parsers.parse_index + + def parse_index_maybe_fail(case, db): + real_parse_index(case, db) + index_el = case.find(parsers.INDEX_NUMBER_ID_TAG) + if index_el is not None and index_el.text == fail_id: + raise RuntimeError('injected after metadata and index') + + with tempfile.TemporaryDirectory() as tmp: + db = _init_staging_db(os.path.join(tmp, 'dirty.duckdb')) + try: + baseline = _staging_counts_for_index(db, fail_id) + with patch('lib.parsers.parse_index', parse_index_maybe_fail): + _parse_zip_bytes(xml_bytes, db) + flush_write_buffer(db) + after_failure = _staging_counts_for_index(db, fail_id) + counts = fetch_staging_row_counts(db) + finally: + db.close() + + self.assertEqual(baseline, {table: 0 for table in STAGING_TABLE_FAMILIES}) + self.assertEqual(after_failure, baseline) + self.assertEqual(counts['oca_index_staging'], 11) + class BatchBoundaryCorrectnessTests(unittest.TestCase): def test_aggressive_batching_matches_legacy_counts(self): diff --git a/tests/test_promotion.py b/tests/test_promotion.py index 4b96c3d..fafa632 100644 --- 
a/tests/test_promotion.py +++ b/tests/test_promotion.py @@ -199,3 +199,81 @@ def test_promotion_table_counts_queries_each_table(self): counts = promotion_table_counts(db, tables=['oca_index', 'oca_causes']) self.assertEqual(counts, {'oca_index': 42, 'oca_causes': 42}) self.assertEqual(db.sql_fetch_one.call_count, 2) + + +class FakeManifest: + def __init__(self): + self.step_upserts = [] + self.file_details_by_name = {} + + def upsert_file(self, file_name, source, status, stage=None, details=None, error=None): + if details is not None: + self.file_details_by_name[file_name] = dict(details) + + def upsert_step(self, step_name, status, details=None, error=None): + self.step_upserts.append({ + 'step_name': step_name, + 'status': status, + 'details': details or {}, + 'error': error, + }) + + +class ImportAndPromoteStagingObservabilityTests(unittest.TestCase): + @mock.patch('lib.etl_stages.upsert_promoted_etl_file') + @mock.patch('lib.etl_stages.promote_staging_to_main') + @mock.patch('lib.etl_stages.ensure_core_tables_exist') + @mock.patch('lib.etl_stages.staging_tables_with_rows', return_value=[]) + @mock.patch('lib.etl_stages.promotion_table_counts') + @mock.patch('geosupport.Geosupport') + def test_promote_step_records_before_after_checksums( + self, + _geosupport_mock, + counts_mock, + _staging_rows_mock, + _ensure_tables_mock, + _promote_mock, + _upsert_file_mock, + ): + from lib.etl_stages import FileSelection, import_and_promote_staging + + counts_before = {'oca_index': 100, 'oca_metadata': 100} + counts_after = {'oca_index': 150, 'oca_metadata': 150} + counts_mock.side_effect = [counts_before, counts_after] + + manifest = FakeManifest() + db = mock.Mock() + selection = FileSelection( + selected_zip_files=['test.zip'], + skipped_reprocess_files=[], + new_file_set={'test.zip'}, + reprocess_file_set=set(), + sftp_download_files=['test.zip'], + s3_download_files=[], + ) + + with mock.patch('lib.etl_stages.csv_has_rows', return_value=False): + 
import_and_promote_staging( + manifest, + db, + '/tmp/pub', + {'aws_bucket_name': 'b', 'aws_id': 'i', 'aws_key': 'k'}, + '', + selection, + 'public', + ) + + completed = [s for s in manifest.step_upserts if s['status'] == 'completed'][-1] + self.assertEqual(completed['step_name'], 'promote_staging') + details = completed['details'] + self.assertEqual(details['counts_before'], counts_before) + self.assertEqual(details['counts_after'], counts_after) + self.assertEqual( + details['checksum_before'], + promotion_counts_checksum(counts_before), + ) + self.assertEqual( + details['checksum_after'], + promotion_counts_checksum(counts_after), + ) + self.assertEqual(counts_mock.call_count, 2) diff --git a/tests/test_run_manifest.py b/tests/test_run_manifest.py index fca3ce6..46e4555 100644 --- a/tests/test_run_manifest.py +++ b/tests/test_run_manifest.py @@ -1,6 +1,7 @@ import unittest -from lib.etl import EtlRunManifest, completed_reprocess_files, select_data_files_to_process +from lib.etl_run_manifest import EtlRunManifest, completed_reprocess_files +from lib.etl_file_selection import select_data_files_to_process class FakeDb: @@ -32,6 +33,21 @@ def test_completed_reprocess_files_filters_manifest_hits(self): fake_db.fetch_all_result = [("file_a.zip",), ("file_b.zip",)] completed = completed_reprocess_files(fake_db, ["file_a.zip", "file_c.zip"]) self.assertEqual(completed, {"file_a.zip", "file_b.zip"}) + sql = fake_db.sql_calls[-1][1] + self.assertIn("cases_failed", sql) + self.assertIn("= 0", sql) + + def test_completed_reprocess_files_excludes_files_with_case_failures(self): + """SQL must filter out completed rows where details.cases_failed > 0.""" + fake_db = FakeDb() + fake_db.fetch_all_result = [("file_clean.zip",)] + completed = completed_reprocess_files( + fake_db, + ["file_clean.zip", "file_dirty.zip"], + ) + self.assertEqual(completed, {"file_clean.zip"}) + sql = fake_db.sql_calls[-1][1] + self.assertIn("COALESCE((ef.details->>'cases_failed')::int, 0) = 0", sql) 
def test_reprocess_without_force_skips_completed_files(self): new_files = ["LandlordTenant.Incr.2024-03-01.zip"] @@ -47,6 +63,20 @@ def test_reprocess_without_force_skips_completed_files(self): ) self.assertEqual(selected, ["LandlordTenant.Incr.2024-03-01.zip"]) + def test_mark_run_completed_records_files_needing_reprocess(self): + fake_db = FakeDb() + manifest = EtlRunManifest(fake_db, 'public', '', '2', '', False) + manifest.mark_run_completed( + 2, + 1, + 0, + files_needing_reprocess=['dirty.zip'], + ) + sql = fake_db.sql_calls[-1][1] + self.assertIn('files_needing_reprocess', sql) + self.assertIn('dirty.zip', sql) + self.assertIn('processed_file_count = 1', sql) + -if __name__ == "__main__": +if __name__ == '__main__': unittest.main() diff --git a/tests/test_runtime_controls.py b/tests/test_runtime_controls.py index 5566ccd..e6b759a 100644 --- a/tests/test_runtime_controls.py +++ b/tests/test_runtime_controls.py @@ -30,6 +30,7 @@ def test_main_passes_defaults_when_unset(self, oca_etl_mock): self.assertEqual(runtime_args['s3_prefix'], '') self.assertEqual(runtime_args['reprocess_glob'], '') self.assertFalse(runtime_args['force_reprocess']) + self.assertFalse(runtime_args['parse_fail_fast']) @patch('oca_update.oca_etl') def test_main_non_default_schema_smoke_path(self, oca_etl_mock): @@ -47,6 +48,7 @@ def test_main_non_default_schema_smoke_path(self, oca_etl_mock): 'S3_PREFIX': 'refactor/dev', 'REPROCESS_GLOB': 'LandlordTenant.Incr.2024-*.zip', 'FORCE_REPROCESS': 'true', + 'PARSE_FAIL_FAST': 'true', 'GEOCODE_WORKERS': '3', 'CENSUS_BATCH_CHUNK_SIZE': '2000', 'CSV_ROW_CHECK_CHUNK_SIZE': '500', @@ -60,6 +62,7 @@ def test_main_non_default_schema_smoke_path(self, oca_etl_mock): self.assertEqual(runtime_args['reprocess_glob'], 'LandlordTenant.Incr.2024-*.zip') self.assertTrue(runtime_args['force_reprocess']) self.assertEqual(runtime_args['geocode_workers'], 3) + self.assertTrue(runtime_args['parse_fail_fast']) @patch('lib.database.psycopg2.connect') def 
test_database_sets_search_path_for_schema(self, connect_mock): From 51c5dd8428ca09d5a876a7f3dac6531d3c2613d7 Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 4 Jun 2026 21:06:22 -0400 Subject: [PATCH 29/30] add env flag to skip csv s3 publish when reprocessing (#24) --- .env.example | 3 ++ README.md | 7 +++- docs/operations/weekly-etl-scheduling.md | 4 +++ lib/README.md | 8 ++--- lib/etl.py | 17 ++++++--- oca_update.py | 2 ++ tests/test_etl_publish.py | 44 ++++++++++++++++++++++++ tests/test_runtime_controls.py | 20 +++++++++++ 8 files changed, 95 insertions(+), 10 deletions(-) diff --git a/.env.example b/.env.example index 0dd2055..df467ee 100644 --- a/.env.example +++ b/.env.example @@ -19,6 +19,9 @@ FORCE_REPROCESS=false # When true, fail parse_xml and abort before export/promote if any case-level parse errors occur PARSE_FAIL_FAST=false +# When true, skip post-promote RDS public CSV export and S3 encryption normalization (reprocess throughput only) +SKIP_PUBLIC_PUBLISH=false + # Geocoding and CSV tuning (optional; safe defaults preserve current behavior) GEOCODE_WORKERS= CENSUS_BATCH_CHUNK_SIZE=2500 diff --git a/README.md b/README.md index dcc525c..56cb787 100644 --- a/README.md +++ b/README.md @@ -77,12 +77,15 @@ Use the same `DATABASE_URL` and `DB_SCHEMA` as weekly ETL. Optional flags: `--ge docker compose run --rm app env \ DB_SCHEMA=refactor \ S3_PREFIX=refactor/ \ - REPROCESS_GLOB='LandlordTenant.Incr.2024-*.zip' \ + REPROCESS_GLOB='LandlordTenant.Incr.2025-*.zip' \ FORCE_REPROCESS=true \ + SKIP_PUBLIC_PUBLISH=true \ GEOCODE_WORKERS=2 \ python oca_update.py ``` +Bulk reprocess with `SKIP_PUBLIC_PUBLISH=true` updates RDS and private backups only; public S3 CSVs stay stale until you run once with `SKIP_PUBLIC_PUBLISH=false` (or unset). + Compose reads `.env` from the repo root for `DATABASE_URL`, AWS, and SFTP. Override any variable inline with `env VAR=value ...` as above. 
Run the test suite in Docker: @@ -111,6 +114,8 @@ Optional env vars (and matching `oca_update.py` CLI flags) tune isolation, repla | `S3_PREFIX` | Prefix for `private/` and `public/` S3 keys | none | | `REPROCESS_GLOB` | Filename glob for S3 private zip replay | none | | `FORCE_REPROCESS` | Replay manifest-completed glob matches | `false` | +| `SKIP_PUBLIC_PUBLISH` | Skip post-promote RDS→S3 public CSV export and SSE normalize | `false` | +| `PARSE_FAIL_FAST` | Abort before export/promote on any case-level parse failure | `false` | | `GEOCODE_WORKERS` | Geosupport multiprocessing pool size | CPU count | | `CENSUS_BATCH_CHUNK_SIZE` | Census batch geocoder chunk | `2500` | | `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess / row-check chunk | `1000` | diff --git a/docs/operations/weekly-etl-scheduling.md b/docs/operations/weekly-etl-scheduling.md index a0182c8..5c32853 100644 --- a/docs/operations/weekly-etl-scheduling.md +++ b/docs/operations/weekly-etl-scheduling.md @@ -26,6 +26,7 @@ Use Docker (or the published image `justfixnyc/oca:latest`) with credentials sup | `REPROCESS_GLOB` | Replay zip files from S3 `private/` | empty | | `FORCE_REPROCESS` | Replay manifest-completed files | `false` | | `PARSE_FAIL_FAST` | Fail `parse_xml` and abort before export/promote when any zip has case-level parse failures | `false` | +| `SKIP_PUBLIC_PUBLISH` | Skip post-promote public CSV export and SSE normalization (reprocess throughput) | `false` | | `GEOCODE_WORKERS` | Geosupport pool size | CPU count | | `CENSUS_BATCH_CHUNK_SIZE` | Census batch chunk | `2500` | | `CSV_ROW_CHECK_CHUNK_SIZE` | Staging CSV preprocess chunk | `1000` | @@ -34,6 +35,8 @@ Refactor and E2E runs must set `S3_PREFIX=refactor/` (or another isolated prefix Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if geocoding approaches the limit. +**Do not** set `SKIP_PUBLIC_PUBLISH=true` on production weekly cron, Kubernetes CronJob, or ECS tasks. 
Use it only for operator-driven bulk reprocess; run a normal publish afterward so public S3 matches RDS. + **Parse failures (default lenient):** With `PARSE_FAIL_FAST=false`, weekly runs still promote and publish; zips with any `cases_failed` in manifest `etl_files.details` do **not** reach `status = 'completed'` (requires **`cases_failed = 0`**) and are omitted from `completed_reprocess_files` on later `REPROCESS_GLOB` runs (no `FORCE_REPROCESS` needed to retry them). Set `PARSE_FAIL_FAST=true` to stop the run before export/promote. ## Publish behavior @@ -42,6 +45,7 @@ Memory target: **≤ 2 GiB** per job. Tune `GEOCODE_WORKERS` down (e.g. `2`) if - **Core tables:** every table in `OCA_TABLES` is exported after promotion. Selective skip per table is unsafe when `oca_index_staging` has rows: promotion deletes child rows for the batch even when a child staging CSV was empty. - **Address views:** `create_addresses_views.sql` runs on every successful weekly publish (views only; `geom` already on the base table). - **S3 encryption:** SSE-S3 normalization runs only on objects exported in the current run (not a full public-prefix scan). +- **Skip publish:** `SKIP_PUBLIC_PUBLISH=true` skips view rebuild, RDS exports, date badges, and SSE normalize; manifest records `publish_public` and `normalize_s3_encryption` as completed with `details.skipped=true`. ## RDS geocode backfill (on-demand) diff --git a/lib/README.md b/lib/README.md index b3cbeed..1ef217b 100644 --- a/lib/README.md +++ b/lib/README.md @@ -40,8 +40,8 @@ Each weekly run is orchestrated sequentially in `oca_etl()`. There is **no** pos | Geocode staging | `etl_geocode.geocode_staging_addresses_csv`, `etl_stages.geocode_staging_csvs` | Geocode **every** row in `oca_addresses_staging.csv`; write `oca_addresses_staging_geocoded.csv`, copy over staging CSV; manifest step `geocode_staging`. 
| | Upload staging | `etl_stages.upload_staging_csvs`, `etl_publish.list_staging_csvs_in_dir` | Upload only whitelisted `{table}_staging.csv` files (from `OCA_TABLES`); ignores geocoder temps and other junk; manifest step `upload_staging`. | | Import + promote | `etl_stages.import_and_promote_staging`, `etl_promotion.py` | Bootstrap core tables, import staging CSVs via `aws_s3`, normalize, promote; batch `geom` UPDATE from lat/lon. | -| Publish public | `etl_stages.publish_public_artifacts` | `create_addresses_views.sql` (views only); export all `OCA_TABLES` and address views; upload date badge files. | -| Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. | +| Publish public | `etl_stages.publish_public_artifacts` | `create_addresses_views.sql` (views only); export all `OCA_TABLES` and address views; upload date badge files. Skipped when `SKIP_PUBLIC_PUBLISH=true` (manifest steps recorded with `skipped` in details). | +| Normalize encryption | `etl_stages.normalize_public_s3_encryption` | SSE-S3 on published keys except `oca_addresses_private.csv`. Skipped with `SKIP_PUBLIC_PUBLISH=true`. | | Upload private | `etl_stages.upload_private_source_files` | Back up raw XML zips to S3 `private/`. | ### RDS backfill (not weekly) @@ -103,12 +103,12 @@ Legacy/manual only: `reset_addresses_table.sql`, `update_metadata.sql`. ## Idempotency and run control - **Manifest** — weekly runs record `export_staging`, `geocode_staging`, `upload_staging`, `promote_staging`, `publish_public`, `normalize_s3_encryption`, `upload_private`. Backfill runs record only `geocode_refresh`. -- **Connection resilience** — TCP keepalives and `ensure_connection()` before promote and before publish. +- **Connection resilience** — TCP keepalives and `ensure_connection()` before promote and before publish (publish connection refresh skipped when `SKIP_PUBLIC_PUBLISH=true`). 
- **Reprocess** — `REPROCESS_GLOB` selects S3 private backups; manifest skips files in `completed_reprocess_files` (promoted with `cases_failed = 0`) unless `FORCE_REPROCESS=true`. Zips with prior case-level parse failures stay eligible for reprocess without force. - **Schema isolation** — `DB_SCHEMA` + `S3_PREFIX` for refactor/E2E without touching production paths. - **Weekly geocode** — all staging CSV address rows (re-geocodes rows that already have lat/lon in the file). - **Backfill geocode** — only `lat IS NULL` with a house number; upsert matches on address line columns, not `indexnumberid` alone. -- **Publish** — every successful weekly run exports the full public snapshot (all core tables and address views). +- **Publish** — every successful weekly run exports the full public snapshot (all core tables and address views). `SKIP_PUBLIC_PUBLISH=true` is for bulk reprocess only; run a normal publish afterward so public S3 matches RDS. ## Output tables diff --git a/lib/etl.py b/lib/etl.py index b0a48c8..0766f83 100644 --- a/lib/etl.py +++ b/lib/etl.py @@ -85,6 +85,7 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None census_batch_chunk_size = runtime_args.get('census_batch_chunk_size') or 2500 csv_row_check_chunk_size = runtime_args.get('csv_row_check_chunk_size') or 1000 parse_fail_fast = bool(runtime_args.get('parse_fail_fast')) + skip_public_publish = bool(runtime_args.get('skip_public_publish')) db = Database(**db_args) manifest = EtlRunManifest( @@ -135,11 +136,17 @@ def oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args=None selection, runtime_args.get('db_schema') or db_args.get('schema') or 'public', ) - db.ensure_connection() - published_keys = publish_public_artifacts( - manifest, db, s3_args, s3_prefix, mode, selection, pub_dir, - ) - normalize_public_s3_encryption(manifest, s3, published_keys) + if skip_public_publish: + print('Skipping public publish (SKIP_PUBLIC_PUBLISH)') + skip_details = 
{'skipped': True, 'reason': 'SKIP_PUBLIC_PUBLISH'} + manifest.upsert_step('publish_public', 'completed', details=skip_details) + manifest.upsert_step('normalize_s3_encryption', 'completed', details=skip_details) + else: + db.ensure_connection() + published_keys = publish_public_artifacts( + manifest, db, s3_args, s3_prefix, mode, selection, pub_dir, + ) + normalize_public_s3_encryption(manifest, s3, published_keys) upload_private_source_files(manifest, s3, priv_dir, s3_prefix) files_needing_reprocess = file_names_needing_reprocess(manifest.file_details_by_name) diff --git a/oca_update.py b/oca_update.py index 32d1745..254107a 100644 --- a/oca_update.py +++ b/oca_update.py @@ -27,6 +27,7 @@ def parse_args(): parser.add_argument('--s3-prefix', default=os.environ.get('S3_PREFIX', ''), help='Optional S3 prefix namespace for private/public files') parser.add_argument('--reprocess-glob', default=os.environ.get('REPROCESS_GLOB', ''), help='Filename glob for S3 private zip reprocessing') parser.add_argument('--force-reprocess', action='store_true', default=parse_bool(os.environ.get('FORCE_REPROCESS')), help='Reprocess matched files even if already in S3 private backup') + parser.add_argument('--skip-public-publish', action='store_true', default=parse_bool(os.environ.get('SKIP_PUBLIC_PUBLISH')), help='Skip post-promote RDS public CSV export and S3 encryption normalization') parser.add_argument('--parse-fail-fast', action='store_true', default=parse_bool(os.environ.get('PARSE_FAIL_FAST')), help='Abort run before export/promote when any case-level parse failures occur') parser.add_argument('--geocode-workers', type=int, default=parse_optional_int(os.environ.get('GEOCODE_WORKERS')), help='Worker process count for geocode pool') parser.add_argument('--census-batch-chunk-size', type=int, default=int(os.environ.get('CENSUS_BATCH_CHUNK_SIZE', '2500')), help='Chunk size for census batch geocoder input') @@ -77,6 +78,7 @@ def main(): 'census_batch_chunk_size': 
args.census_batch_chunk_size, 'csv_row_check_chunk_size': args.csv_row_check_chunk_size, 'parse_fail_fast': args.parse_fail_fast, + 'skip_public_publish': args.skip_public_publish, } oca_etl(db_args, sftp_args, s3_args, mode, remote_db_args, runtime_args) diff --git a/tests/test_etl_publish.py b/tests/test_etl_publish.py index 544eb86..c88c9c5 100644 --- a/tests/test_etl_publish.py +++ b/tests/test_etl_publish.py @@ -213,6 +213,50 @@ def test_weekly_etl_uses_csv_geocode_not_post_promotion_rds_geocode(self): upload_mock.assert_called_once() geocode_rds_mock.assert_not_called() + def test_skip_public_publish_skips_post_promote_s3(self): + selection = mock.Mock( + selected_zip_files=['file.zip'], + skipped_reprocess_files=[], + new_file_set={'file.zip'}, + reprocess_file_set=set(), + ) + fake_manifest = mock.Mock() + fake_manifest.file_details_by_name = {} + with mock.patch('lib.etl.EtlRunManifest', return_value=fake_manifest), \ + mock.patch('lib.etl.Database') as db_cls, \ + mock.patch('lib.etl.DuckDB'), \ + mock.patch('lib.etl.Sftp'), \ + mock.patch('lib.etl.S3'), \ + mock.patch('lib.etl.make_dir', side_effect=lambda x: x), \ + mock.patch('lib.etl.select_input_files', return_value=selection), \ + mock.patch('lib.etl.download_selected_files'), \ + mock.patch('lib.etl.parse_xml_to_staging'), \ + mock.patch('lib.etl.export_staging_csvs'), \ + mock.patch('lib.etl.geocode_staging_csvs'), \ + mock.patch('lib.etl.upload_staging_csvs'), \ + mock.patch('lib.etl.import_and_promote_staging'), \ + mock.patch('lib.etl.publish_public_artifacts') as publish_mock, \ + mock.patch('lib.etl.normalize_public_s3_encryption') as normalize_mock, \ + mock.patch('lib.etl.upload_private_source_files') as private_upload_mock, \ + mock.patch('pathlib.Path.unlink'): + oca_etl( + {}, {}, {}, '2', {}, + runtime_args={'skip_public_publish': True}, + ) + + fake_db = db_cls.return_value + publish_mock.assert_not_called() + normalize_mock.assert_not_called() + 
private_upload_mock.assert_called_once() + self.assertEqual(fake_db.ensure_connection.call_count, 1) + skip_details = {'skipped': True, 'reason': 'SKIP_PUBLIC_PUBLISH'} + fake_manifest.upsert_step.assert_any_call( + 'publish_public', 'completed', details=skip_details, + ) + fake_manifest.upsert_step.assert_any_call( + 'normalize_s3_encryption', 'completed', details=skip_details, + ) + class GeocodeAddressesTests(unittest.TestCase): def test_geocode_does_not_export_to_s3(self): diff --git a/tests/test_runtime_controls.py b/tests/test_runtime_controls.py index e6b759a..55e04f0 100644 --- a/tests/test_runtime_controls.py +++ b/tests/test_runtime_controls.py @@ -31,6 +31,7 @@ def test_main_passes_defaults_when_unset(self, oca_etl_mock): self.assertEqual(runtime_args['reprocess_glob'], '') self.assertFalse(runtime_args['force_reprocess']) self.assertFalse(runtime_args['parse_fail_fast']) + self.assertFalse(runtime_args['skip_public_publish']) @patch('oca_update.oca_etl') def test_main_non_default_schema_smoke_path(self, oca_etl_mock): @@ -64,6 +65,25 @@ def test_main_non_default_schema_smoke_path(self, oca_etl_mock): self.assertEqual(runtime_args['geocode_workers'], 3) self.assertTrue(runtime_args['parse_fail_fast']) + @patch('oca_update.oca_etl') + def test_main_passes_skip_public_publish_from_env(self, oca_etl_mock): + with patch.dict(os.environ, { + 'DATABASE_URL': 'postgres://example', + 'AWS_ACCESS_KEY_ID': 'id', + 'AWS_SECRET_ACCESS_KEY': 'key', + 'AWS_S3_BUCKET_NAME': 'bucket', + 'SFTP_HOST': 'host', + 'SFTP_USER': 'user', + 'SFTP_PSWD': 'pswd', + 'SFTP_DIR': '/incoming', + 'MODE': '2', + 'SKIP_PUBLIC_PUBLISH': 'true', + }, clear=True), patch('sys.argv', ['oca_update.py']): + oca_update.main() + + runtime_args = oca_etl_mock.call_args[0][5] + self.assertTrue(runtime_args['skip_public_publish']) + @patch('lib.database.psycopg2.connect') def test_database_sets_search_path_for_schema(self, connect_mock): conn = MagicMock() From 
1ab7c41ebd93ad9cd31cc5e4895a4d370652f0fa Mon Sep 17 00:00:00 2001 From: Maxwell Austensen Date: Thu, 4 Jun 2026 21:10:47 -0400 Subject: [PATCH 30/30] fix minor bug in threading for parsers that raised exception but not a problem --- lib/parsers.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lib/parsers.py b/lib/parsers.py index bcefa35..f90ac29 100644 --- a/lib/parsers.py +++ b/lib/parsers.py @@ -586,6 +586,10 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id, stats: ParseFi while True: try: case = case_queue.get(timeout=1) + except queue.Empty: + continue + + try: if case is None: # Sentinel value to stop thread break @@ -619,9 +623,6 @@ def _worker_thread(case_queue, db_queue, extract_date, thread_id, stats: ParseFi # Clear the case copy from memory case.clear() db_queue.put(thread_db) # Return db connection to pool - - except queue.Empty: - continue finally: case_queue.task_done()