diff --git a/.vale/styles/config/vocabularies/Data/accept.txt b/.vale/styles/config/vocabularies/Data/accept.txt index 9368964299c8..57e0d8e99b19 100644 --- a/.vale/styles/config/vocabularies/Data/accept.txt +++ b/.vale/styles/config/vocabularies/Data/accept.txt @@ -57,5 +57,6 @@ YOLO [Ss]harded [Pp]arameterization(s)? repr +Zarr [Uu]psample(d|s)? [Rr]ebatch(ing|ed|es)? diff --git a/doc/source/data/api/loading_data.rst b/doc/source/data/api/loading_data.rst index ef25cdbedd5e..d613fd8666d6 100644 --- a/doc/source/data/api/loading_data.rst +++ b/doc/source/data/api/loading_data.rst @@ -352,6 +352,15 @@ WebDataset read_webdataset +Zarr +^^^^ + +.. autosummary:: + :nosignatures: + :toctree: doc/ + + read_zarr + Partitioning API ^^^^^^^^^^^^^^^^ diff --git a/doc/source/data/loading-data.rst b/doc/source/data/loading-data.rst index 0f72b4c473ee..772c9ab9cb40 100644 --- a/doc/source/data/loading-data.rst +++ b/doc/source/data/loading-data.rst @@ -150,6 +150,16 @@ To view the full list of supported file formats, see the petal.width float sepal.length float + .. tab-item:: Zarr + + To read a Zarr v2 store, call :func:`~ray.data.read_zarr`. + + .. code-block:: python + + import ray + + ds = ray.data.read_zarr("s3://anonymous@ray-example-data/mnist-tiny.zarr") + Reading files from local disk ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/data/user-guide.rst b/doc/source/data/user-guide.rst index b971e521103f..9e21a80ddb93 100644 --- a/doc/source/data/user-guide.rst +++ b/doc/source/data/user-guide.rst @@ -24,6 +24,7 @@ shows you how to achieve several tasks. working-with-images working-with-text working-with-tensors + working-with-zarr working-with-pytorch working-with-llms how-to-avoid-ooms diff --git a/doc/source/data/working-with-zarr.md b/doc/source/data/working-with-zarr.md new file mode 100644 index 000000000000..e32c3626f64a --- /dev/null +++ b/doc/source/data/working-with-zarr.md @@ -0,0 +1,155 @@ +(working_with_zarr)= + +# Working with Zarr + +Ray Data reads [Zarr v2](https://zarr.readthedocs.io/) stores — chunked, compressed, +N-dimensional arrays on local disk or cloud object storage — with +{func}`ray.data.read_zarr` (`zarr-python` 2.x / Zarr v2 stores). + +This guide covers: + +- [The two output schemas](#output-schemas) — long-form (default) and aligned wide-form +- [Selecting arrays and metadata discovery](#selecting-arrays-and-metadata-discovery) +- [Controlling chunk size](#controlling-chunk-size) +- [Reading row-aligned arrays](#reading-row-aligned-arrays) +- [Custom codecs](#custom-codecs) +- [Zarr's .zattrs](#zarr-zattrs) + +For the full parameter reference, see {func}`ray.data.read_zarr`. + +## Output schemas + +`read_zarr` produces one of two schemas, selected by `align_axis_0`. + +### Long-form (default) + +By default each output row is **one chunk of one array**, with columns: + +- `array` — the array's path in the store (for example `"data/camera0_rgb"`, or `""` for a root-level array). +- `chunk_index` — the N-D index of the chunk in its array's chunk grid. +- `chunk_slices` — per-axis `(start, stop)` of the chunk in the array's coordinate space. +- `chunk` — the chunk's data at its natural shape (trailing-edge chunks may be shorter; no padding). + +Arrays read in the same call need not share any dimension — different ranks, shapes, +dtypes, and native chunk sizes coexist as separate rows. + +```python +import ray + +ds = ray.data.read_zarr("s3://anonymous@ray-example-data/mnist-tiny.zarr") +``` + +```{note} +The `chunk` column is a tensor, and tensors of different rank or dtype can't be +combined into one batch. Consume long-form **per array** (filter on the `array` +column first), or — when arrays are row-aligned (share `shape[0]`) — use +`align_axis_0=True` so each array becomes its own column, which is batch-safe. +``` + +### Aligned wide-form (`align_axis_0=True`) + +With `align_axis_0=True` each row is **one axis-0 chunk shared across the selected +arrays**, with columns: + +- `t_start`, `t_stop` — the global axis-0 range of the row. +- one column per selected array, holding that array's `[t_start:t_stop, ...]` slice. + +All selected arrays must share `shape[0]` and resolve to the same axis-0 chunk size +(after any `chunk_shapes` override). Use `array_paths` to choose which arrays participate — +`align_axis_0` itself doesn't filter. + +```python +ds = ray.data.read_zarr( + "s3://anonymous@ray-example-data/mnist-tiny.zarr", + align_axis_0=True, + chunk_shapes=[50], +) +``` + +## Selecting arrays and metadata discovery + +By default `read_zarr` reads every array it discovers. Pass `array_paths` to read a +subset: + +```python +ds = ray.data.read_zarr(store_uri, array_paths=["images", "labels"]) +``` + +Discovery follows these rules: + +- If the store has consolidated `.zmetadata`, it's the canonical array list (filtered by + `array_paths` if given). This is the fast path. +- Otherwise, if `array_paths` is given, each requested array's metadata is read directly + — no `.zmetadata` required. +- Otherwise, if `allow_full_metadata_scan=True`, the store is recursively scanned for + arrays. This can be slow or costly on large remote stores, so it's off by default; + prefer consolidating metadata with `zarr.consolidate_metadata` ahead of time. + +## Controlling chunk size + +Zarr stores are often chunked finely (for example one image per chunk). +You can use `chunk_shapes` to chunk the leading axes **at read +time** to coarsen (or refine) the granularity at which reading happens. +Note that this doesn't affect downstream batch sizes and is internal to the reading operation. +Finely chunked reading can hurt performance. + +- A **sequence** applies as a shared prefix across all selected arrays, overriding the + leading axes and keeping trailing axes native. `chunk_shapes=[16]` turns native chunks + `(1, 224, 224, 3)` into `(16, 224, 224, 3)` and `(50,)` into `(16,)`. +- A **dict** overrides per array; arrays absent from it keep native chunks. + +```python +# Coarsen every array's axis 0 to 16-element chunks. +ds = ray.data.read_zarr(store_uri, chunk_shapes=[16]) + +# Different overrides per array. +ds = ray.data.read_zarr(store_uri, chunk_shapes={"images": [16], "labels": [64]}) +``` + +## Reading row-aligned arrays + +When arrays share an axis-0 (for example a timestep axis), `align_axis_0=True` +co-iterates them as the [wide-form schema](#output-schemas) — one row per axis-0 +chunk, one column per array. + +For sliding-window pipelines, `overlap` extends each row's per-array data forward by `N` +timesteps from the next row's range (clipped at the end of the store). With +`overlap=K-1`, any window of length `K` that starts in a row's owned `[t_start, t_stop)` +fits entirely within that row's slice. + +```python +ds = ray.data.read_zarr( + store_uri, + align_axis_0=True, + chunk_shapes=[50], + overlap=9, # length-10 windows fit within a row +) +``` + +## Custom codecs + +Stores compressed with non-stdlib codecs (for example `imagecodecs` `JPEG-XL`) need the +codec package imported and registered **in every Ray worker**, not just the driver. +Register it with a `worker_process_setup_hook` — pass an importable callable or its +dotted path (a string of code isn't accepted; a string is interpreted as an import +path): + +```python +import ray + +ray.init(runtime_env={ + "worker_process_setup_hook": "imagecodecs.numcodecs.register_codecs", +}) +``` + +This is a particularity of the underlying Zarr library. + + +(zarr-zattrs)= +## Zarr's .zattrs + +`read_zarr` doesn't surface each array's `.zattrs` (Zarr user attributes) in the row +schema — they're invariant per array, so repeating them on every row would just bloat +the output. Read them separately (for example with the `zarr` package) if your job +needs them. + diff --git a/pyrefly.toml b/pyrefly.toml index 6275ebe1122c..873516f90193 100644 --- a/pyrefly.toml +++ b/pyrefly.toml @@ -65,4 +65,15 @@ ignore-missing-imports = [ "rapidsmpf.*", "rmm.*", "confluent_kafka.*", + "zarr.*", +] + +# numcodecs' ``abc.Codec`` ABC declares its abstract ``encode``/``decode`` with +# empty bodies, so pyrefly infers their return type as ``None`` and flags any +# real codec subclass (like the one in test_zarrv2) as an inconsistent override +# -- a numcodecs typing defect we can't satisfy without lying about the return +# type. So treat numcodecs as Any. (``ignore-missing-imports`` only covers +# modules that can't be found, not installed ones, so it belongs here.) +replace-imports-with-any = [ + "numcodecs.*", ] diff --git a/python/deplocks/ci/data-base-ci_depset_py3.10.lock b/python/deplocks/ci/data-base-ci_depset_py3.10.lock index 0ed0e43a981d..4895fe1eac4d 100644 --- a/python/deplocks/ci/data-base-ci_depset_py3.10.lock +++ b/python/deplocks/ci/data-base-ci_depset_py3.10.lock @@ -313,6 +313,11 @@ arro3-core==0.8.0 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +asciitree==0.3.3 ; python_full_version < '3.11' \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1339,6 +1344,12 @@ fastavro==1.12.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # -r python/requirements/ml/py313/data-test-requirements.txt +fasteners==0.20 ; python_full_version < '3.11' and sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2745,6 +2756,27 @@ networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via torch +numcodecs==0.13.1 ; python_full_version < '3.11' \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab + # via + # -r python/requirements/ml/py313/data-test-requirements.txt + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2880,6 +2912,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # numexpr # pandas # pylance @@ -2899,6 +2932,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via @@ -5771,6 +5805,10 @@ yarl==1.23.0 \ # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # aiohttp # delta-sharing +zarr==2.18.3 ; python_full_version < '3.11' \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd + # via -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-base-ci_depset_py3.11.lock b/python/deplocks/ci/data-base-ci_depset_py3.11.lock index 6b7589506fbf..f57cb225eeee 100644 --- a/python/deplocks/ci/data-base-ci_depset_py3.11.lock +++ b/python/deplocks/ci/data-base-ci_depset_py3.11.lock @@ -313,6 +313,11 @@ arro3-core==0.8.0 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +asciitree==0.3.3 \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1214,6 +1219,7 @@ deprecated==1.3.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake + # numcodecs dill==0.4.1 \ --hash=sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d \ --hash=sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa @@ -1343,6 +1349,12 @@ fastavro==1.12.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # -r python/requirements/ml/py313/data-test-requirements.txt +fasteners==0.20 ; sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2754,6 +2766,24 @@ networkx==3.6.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # torch +numcodecs==0.15.1 \ + --hash=sha256:1d471a1829ce52d3f365053a2bd1379e32e369517557c4027ddf5ac0d99c591e \ + --hash=sha256:1dfdea4a67108205edfce99c1cb6cd621343bc7abb7e16a041c966776920e7de \ + --hash=sha256:698f1d59511488b8fe215fadc1e679a4c70d894de2cca6d8bf2ab770eed34dfd \ + --hash=sha256:a34f0fe5e5f3b837bbedbeb98794a6d4a12eeeef8d4697b523905837900b5e1c \ + --hash=sha256:a4f7bdb26f1b34423cb56d48e75821223be38040907c9b5954eeb7463e7eb03c \ + --hash=sha256:b0a9d9cd29a0088220682dda4a9898321f7813ff7802be2bbb545f6e3d2f10ff \ + --hash=sha256:bef8c8e64fab76677324a07672b10c31861775d03fc63ed5012ca384144e4bb9 \ + --hash=sha256:c3a09e22140f2c691f7df26303ff8fa2dadcf26d7d0828398c0bc09b69e5efa3 \ + --hash=sha256:cdfaef9f5f2ed8f65858db801f1953f1007c9613ee490a1c56233cd78b505ed5 \ + --hash=sha256:daed6066ffcf40082da847d318b5ab6123d69ceb433ba603cb87c323a541a8bc \ + --hash=sha256:e2547fa3a7ffc9399cfd2936aecb620a3db285f2630c86c8a678e477741a4b3c \ + --hash=sha256:e3d82b70500cf61e8d115faa0d0a76be6ecdc24a16477ee3279d711699ad85f3 \ + --hash=sha256:eeed77e4d6636641a2cc605fbc6078c7a8f2cc40f3dfa2b3f61e52e6091b04ff + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # -r python/requirements/ml/py313/data-test-requirements.txt + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2889,6 +2919,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # numexpr # pandas # pylance @@ -2908,6 +2939,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via @@ -5799,6 +5831,12 @@ yarl==1.23.0 \ # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # aiohttp # delta-sharing +zarr==2.18.7 \ + --hash=sha256:ac3dc4033e9ae4e9d7b5e27c97ea3eaf1003cc0a07f010bd83d5134bf8c4b223 \ + --hash=sha256:b2b8f66f14dac4af66b180d2338819981b981f70e196c9a66e6bfaa9e59572f5 + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-base-ci_depset_py3.12.lock b/python/deplocks/ci/data-base-ci_depset_py3.12.lock index 5352aede185a..5478bc4165dd 100644 --- a/python/deplocks/ci/data-base-ci_depset_py3.12.lock +++ b/python/deplocks/ci/data-base-ci_depset_py3.12.lock @@ -306,6 +306,11 @@ arro3-core==0.8.0 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +asciitree==0.3.3 \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1200,6 +1205,7 @@ deprecated==1.3.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake + # numcodecs dill==0.4.1 \ --hash=sha256:1e1ce33e978ae97fcfcff5638477032b801c46c7c65cf717f95fbc2248f79a9d \ --hash=sha256:423092df4182177d4d8ba8290c8a5b640c66ab35ec7da59ccfa00f6fa3eea5fa @@ -1333,6 +1339,12 @@ fastavro==1.12.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # -r python/requirements/ml/py313/data-test-requirements.txt +fasteners==0.20 ; sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2743,6 +2755,24 @@ networkx==3.6.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # torch +numcodecs==0.15.1 \ + --hash=sha256:1d471a1829ce52d3f365053a2bd1379e32e369517557c4027ddf5ac0d99c591e \ + --hash=sha256:1dfdea4a67108205edfce99c1cb6cd621343bc7abb7e16a041c966776920e7de \ + --hash=sha256:698f1d59511488b8fe215fadc1e679a4c70d894de2cca6d8bf2ab770eed34dfd \ + --hash=sha256:a34f0fe5e5f3b837bbedbeb98794a6d4a12eeeef8d4697b523905837900b5e1c \ + --hash=sha256:a4f7bdb26f1b34423cb56d48e75821223be38040907c9b5954eeb7463e7eb03c \ + --hash=sha256:b0a9d9cd29a0088220682dda4a9898321f7813ff7802be2bbb545f6e3d2f10ff \ + --hash=sha256:bef8c8e64fab76677324a07672b10c31861775d03fc63ed5012ca384144e4bb9 \ + --hash=sha256:c3a09e22140f2c691f7df26303ff8fa2dadcf26d7d0828398c0bc09b69e5efa3 \ + --hash=sha256:cdfaef9f5f2ed8f65858db801f1953f1007c9613ee490a1c56233cd78b505ed5 \ + --hash=sha256:daed6066ffcf40082da847d318b5ab6123d69ceb433ba603cb87c323a541a8bc \ + --hash=sha256:e2547fa3a7ffc9399cfd2936aecb620a3db285f2630c86c8a678e477741a4b3c \ + --hash=sha256:e3d82b70500cf61e8d115faa0d0a76be6ecdc24a16477ee3279d711699ad85f3 \ + --hash=sha256:eeed77e4d6636641a2cc605fbc6078c7a8f2cc40f3dfa2b3f61e52e6091b04ff + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # -r python/requirements/ml/py313/data-test-requirements.txt + # zarr numpy==2.2.6 \ --hash=sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff \ --hash=sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47 \ @@ -2817,6 +2847,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # pandas # pylance # ray @@ -2834,6 +2865,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via @@ -5660,6 +5692,12 @@ yarl==1.23.0 \ # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # aiohttp # delta-sharing +zarr==2.18.7 \ + --hash=sha256:ac3dc4033e9ae4e9d7b5e27c97ea3eaf1003cc0a07f010bd83d5134bf8c4b223 \ + --hash=sha256:b2b8f66f14dac4af66b180d2338819981b981f70e196c9a66e6bfaa9e59572f5 + # via + # -c /tmp/ray-deps/requirements_compiled_py3.13.txt + # -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-mongo-ci_depset_py3.10.lock b/python/deplocks/ci/data-mongo-ci_depset_py3.10.lock index 83daffd71c1f..88651a6a41f0 100644 --- a/python/deplocks/ci/data-mongo-ci_depset_py3.10.lock +++ b/python/deplocks/ci/data-mongo-ci_depset_py3.10.lock @@ -305,6 +305,11 @@ arro3-core==0.8.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # deltalake +asciitree==0.3.3 ; python_full_version < '3.11' \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1285,6 +1290,12 @@ fastavro==1.12.1 \ --hash=sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8 \ --hash=sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock +fasteners==0.20 ; python_full_version < '3.11' and sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2622,6 +2633,27 @@ networkx==3.2.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # torch +numcodecs==0.13.1 ; python_full_version < '3.11' \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2753,6 +2785,7 @@ numpy==2.2.6 \ # jaxlib # keras # ml-dtypes + # numcodecs # numexpr # pandas # pymars @@ -2770,6 +2803,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock @@ -5606,6 +5640,10 @@ yarl==1.23.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # aiohttp +zarr==2.18.3 ; python_full_version < '3.11' \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd + # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.10.lock b/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.10.lock index c83fae17ef94..cdc31a871578 100644 --- a/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.10.lock +++ b/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.10.lock @@ -306,6 +306,11 @@ arro3-core==0.8.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # deltalake +asciitree==0.3.3 ; python_full_version < '3.11' \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1288,6 +1293,12 @@ fastavro==1.12.1 \ --hash=sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8 \ --hash=sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock +fasteners==0.20 ; python_full_version < '3.11' and sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2643,6 +2654,27 @@ networkx==3.2.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # torch +numcodecs==0.13.1 ; python_full_version < '3.11' \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2775,6 +2807,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # numexpr # pandas # pylance @@ -2793,6 +2826,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock @@ -5553,6 +5587,10 @@ yarl==1.23.0 \ # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # aiohttp # delta-sharing +zarr==2.18.3 ; python_full_version < '3.11' \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd + # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.12.lock b/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.12.lock index e425087cdc33..c27cb7690621 100644 --- a/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.12.lock +++ b/python/deplocks/ci/data-pyarrow-latest-ci_depset_py3.12.lock @@ -302,6 +302,11 @@ arro3-core==0.8.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock # deltalake +asciitree==0.3.3 \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1165,6 +1170,7 @@ deprecated==1.3.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock # deltalake + # numcodecs distlib==0.4.0 \ --hash=sha256:9659f7d87e46584a30b5780e43ac7a2143098441670ff0a49d5f9034c54a6c16 \ --hash=sha256:feec40075be03a04501a973d81f633735b4b69f98b05450592310c0f401a4e0d @@ -1284,6 +1290,12 @@ fastavro==1.12.1 \ --hash=sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8 \ --hash=sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock +fasteners==0.20 ; sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2639,6 +2651,23 @@ networkx==3.6.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock # torch +numcodecs==0.15.1 \ + --hash=sha256:1d471a1829ce52d3f365053a2bd1379e32e369517557c4027ddf5ac0d99c591e \ + --hash=sha256:1dfdea4a67108205edfce99c1cb6cd621343bc7abb7e16a041c966776920e7de \ + --hash=sha256:698f1d59511488b8fe215fadc1e679a4c70d894de2cca6d8bf2ab770eed34dfd \ + --hash=sha256:a34f0fe5e5f3b837bbedbeb98794a6d4a12eeeef8d4697b523905837900b5e1c \ + --hash=sha256:a4f7bdb26f1b34423cb56d48e75821223be38040907c9b5954eeb7463e7eb03c \ + --hash=sha256:b0a9d9cd29a0088220682dda4a9898321f7813ff7802be2bbb545f6e3d2f10ff \ + --hash=sha256:bef8c8e64fab76677324a07672b10c31861775d03fc63ed5012ca384144e4bb9 \ + --hash=sha256:c3a09e22140f2c691f7df26303ff8fa2dadcf26d7d0828398c0bc09b69e5efa3 \ + --hash=sha256:cdfaef9f5f2ed8f65858db801f1953f1007c9613ee490a1c56233cd78b505ed5 \ + --hash=sha256:daed6066ffcf40082da847d318b5ab6123d69ceb433ba603cb87c323a541a8bc \ + --hash=sha256:e2547fa3a7ffc9399cfd2936aecb620a3db285f2630c86c8a678e477741a4b3c \ + --hash=sha256:e3d82b70500cf61e8d115faa0d0a76be6ecdc24a16477ee3279d711699ad85f3 \ + --hash=sha256:eeed77e4d6636641a2cc605fbc6078c7a8f2cc40f3dfa2b3f61e52e6091b04ff + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock + # zarr numpy==2.2.6 \ --hash=sha256:038613e9fb8c72b0a41f025a7e4c3f0b7a1b5d768ece4796b674c8f3fe13efff \ --hash=sha256:0678000bb9ac1475cd454c6b8c799206af8107e310843532b04d49649c717a47 \ @@ -2710,6 +2739,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # pandas # pylance # raydp @@ -2726,6 +2756,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock @@ -5445,6 +5476,10 @@ yarl==1.23.0 \ # -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock # aiohttp # delta-sharing +zarr==2.18.7 \ + --hash=sha256:ac3dc4033e9ae4e9d7b5e27c97ea3eaf1003cc0a07f010bd83d5134bf8c4b223 \ + --hash=sha256:b2b8f66f14dac4af66b180d2338819981b981f70e196c9a66e6bfaa9e59572f5 + # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-pyarrow-nightly-ci_depset_py3.10.lock b/python/deplocks/ci/data-pyarrow-nightly-ci_depset_py3.10.lock index fdb4ff1c7ced..bb7e9fd7adb3 100644 --- a/python/deplocks/ci/data-pyarrow-nightly-ci_depset_py3.10.lock +++ b/python/deplocks/ci/data-pyarrow-nightly-ci_depset_py3.10.lock @@ -305,6 +305,11 @@ arro3-core==0.8.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # deltalake +asciitree==0.3.3 ; python_full_version < '3.11' \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1281,6 +1286,12 @@ fastavro==1.12.1 \ --hash=sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8 \ --hash=sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock +fasteners==0.20 ; python_full_version < '3.11' and sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2633,6 +2644,27 @@ networkx==3.2.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # torch +numcodecs==0.13.1 ; python_full_version < '3.11' \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2765,6 +2797,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # numexpr # pandas # pylance @@ -2783,6 +2816,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock @@ -5538,6 +5572,10 @@ yarl==1.23.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # aiohttp +zarr==2.18.3 ; python_full_version < '3.11' \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd + # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/data-pyarrow-v17-ci_depset_py3.10.lock b/python/deplocks/ci/data-pyarrow-v17-ci_depset_py3.10.lock index 26e4c9b04ebb..ef0c2b9c3671 100644 --- a/python/deplocks/ci/data-pyarrow-v17-ci_depset_py3.10.lock +++ b/python/deplocks/ci/data-pyarrow-v17-ci_depset_py3.10.lock @@ -307,6 +307,11 @@ arro3-core==0.8.0 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # deltalake +asciitree==0.3.3 ; python_full_version < '3.11' \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1299,6 +1304,12 @@ fastavro==1.12.1 \ --hash=sha256:eaa7ab3769beadcebb60f0539054c7755f63bd9cf7666e2c15e615ab605f89a8 \ --hash=sha256:ed924233272719b5d5a6a0b4d80ef3345fc7e84fc7a382b6232192a9112d38a6 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock +fasteners==0.20 ; python_full_version < '3.11' and sys_platform != 'emscripten' \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr fastrlock==0.8.3 ; sys_platform != 'darwin' \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2676,6 +2687,27 @@ networkx==3.2.1 \ # via # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # torch +numcodecs==0.13.1 ; python_full_version < '3.11' \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab + # via + # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock + # zarr numexpr==2.14.1 ; python_full_version < '3.12' \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2810,6 +2842,7 @@ numpy==2.2.6 \ # keras # ml-dtypes # modin + # numcodecs # numexpr # pandas # pyarrow @@ -2829,6 +2862,7 @@ numpy==2.2.6 \ # torchtext # torchvision # webdataset + # zarr nvidia-nccl-cu12==2.27.5 ; platform_machine != 'aarch64' and sys_platform == 'linux' \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock @@ -5570,6 +5604,10 @@ yarl==1.23.0 \ # -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock # aiohttp # delta-sharing +zarr==2.18.3 ; python_full_version < '3.11' \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd + # via -r python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock b/python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock index adfa425d7948..0f3089f1f965 100644 --- a/python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock +++ b/python/deplocks/ci/relaxed_data-ci_depset_py3.10.lock @@ -312,6 +312,11 @@ arro3-core==0.8.0 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +asciitree==0.3.3; python_full_version < "3.11" \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1329,6 +1334,12 @@ fastavro==1.12.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # -r python/requirements/ml/py313/data-test-requirements.txt +fasteners==0.20; python_full_version < "3.11" and sys_platform != "emscripten" \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# zarr fastrlock==0.8.3; sys_platform != "darwin" \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2708,6 +2719,27 @@ networkx==3.2.1 \ --hash=sha256:9f1bb5cf3409bf324e0a722c20bdb4c20ee39bf1c30ce8ae499c8502b0b5e0c6 \ --hash=sha256:f18c69adc97877c42332c170849c96cefa91881c99a7cb3e95b7c659ebdc1ec2 # via torch +numcodecs==0.13.1; python_full_version < "3.11" \ + --hash=sha256:233bc7f26abce24d57e44ea8ebeb5cd17084690b4e7409dd470fdb75528d615f \ + --hash=sha256:237b7171609e868a20fd313748494444458ccd696062f67e198f7f8f52000c15 \ + --hash=sha256:2a86f5367af9168e30f99727ff03b27d849c31ad4522060dde0bce2923b3a8bc \ + --hash=sha256:2eda97dd2f90add98df6d295f2c6ae846043396e3d51a739ca5db6c03b5eb666 \ + --hash=sha256:3501a848adaddce98a71a262fee15cd3618312692aa419da77acd18af4a6a3f6 \ + --hash=sha256:3f593c7506b0ab248961a3b13cb148cc6e8355662ff124ac591822310bc55ecf \ + --hash=sha256:5195bea384a6428f8afcece793860b1ab0ae28143c853f0b2b20d55a8947c917 \ + --hash=sha256:796b3e6740107e4fa624cc636248a1580138b3f1c579160f260f76ff13a4261b \ + --hash=sha256:7a60d75179fd6692e301ddfb3b266d51eb598606dcae7b9fc57f986e8d65cb43 \ + --hash=sha256:80d3071465f03522e776a31045ddf2cfee7f52df468b977ed3afdd7fe5869701 \ + --hash=sha256:90d3065ae74c9342048ae0046006f99dcb1388b7288da5a19b3bddf9c30c3176 \ + --hash=sha256:96add4f783c5ce57cc7e650b6cac79dd101daf887c479a00a29bc1487ced180b \ + --hash=sha256:96e42f73c31b8c24259c5fac6adba0c3ebf95536e37749dc6c62ade2989dca28 \ + --hash=sha256:a3cf37881df0898f3a9c0d4477df88133fe85185bffe57ba31bcc2fa207709bc \ + --hash=sha256:da2230484e6102e5fa3cc1a5dd37ca1f92dfbd183d91662074d6f7574e3e8f53 \ + --hash=sha256:e5db4824ebd5389ea30e54bc8aeccb82d514d28b6b68da6c536b8fa4596f4bca \ + --hash=sha256:eda7d7823c9282e65234731fd6bd3986b1f9e035755f7fed248d7d366bb291ab +# via +# -r python/requirements/ml/py313/data-test-requirements.txt +# zarr numexpr==2.14.1; python_full_version < "3.12" \ --hash=sha256:03130afa04edf83a7b590d207444f05a00363c9b9ea5d81c0f53b1ea13fad55a \ --hash=sha256:05f9366d23a2e991fd5a8b5e61a17558f028ba86158a4552f8f239b005cdf83c \ @@ -2787,6 +2819,7 @@ numexpr==2.14.1; python_full_version < "3.12" \ # keras # ml-dtypes # modin +# numcodecs # numexpr # pandas # pylance @@ -2806,6 +2839,7 @@ numexpr==2.14.1; python_full_version < "3.12" \ # torchtext # torchvision # webdataset +# zarr nvidia-nccl-cu12==2.27.5; platform_machine != "aarch64" and sys_platform == "linux" \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via @@ -5564,6 +5598,10 @@ yarl==1.23.0 \ # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # aiohttp # delta-sharing +zarr==2.18.3; python_full_version < "3.11" \ + --hash=sha256:2580d8cb6dd84621771a10d31c4d777dca8a27706a1a89b29f42d2d37e2df5ce \ + --hash=sha256:b1f7dfd2496f436745cdd4c7bcf8d3b4bc1dceef5fdd0d589c87130d842496dd +# via -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock b/python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock index 09dd08c4e512..ced038f99658 100644 --- a/python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock +++ b/python/deplocks/ci/relaxed_data-ci_depset_py3.12.lock @@ -305,6 +305,11 @@ arro3-core==0.8.0 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +asciitree==0.3.3 \ + --hash=sha256:4aa4b9b649f85e3fcb343363d97564aa1fb62e249677f2e18a96765145cc0f6e +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# zarr asn1crypto==1.5.1 \ --hash=sha256:13ae38502be632115abf8a24cbe5f4da52e3b5231990aff31123c805306ccb9c \ --hash=sha256:db4e40728b728508912cbb3d44f19ce188f218e9eba635821bb4b68564f8fd67 @@ -1193,6 +1198,7 @@ deprecated==1.3.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # deltalake +# numcodecs # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # datasets @@ -1323,6 +1329,12 @@ fastavro==1.12.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # -r python/requirements/ml/py313/data-test-requirements.txt +fasteners==0.20; sys_platform != "emscripten" \ + --hash=sha256:55dce8792a41b56f727ba6e123fcaee77fd87e638a6863cec00007bfea84c8d8 \ + --hash=sha256:9422c40d1e350e4259f509fb2e608d6bc43c0136f79a00db1b49046029d0b3b7 +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# zarr fastrlock==0.8.3; sys_platform != "darwin" \ --hash=sha256:001fd86bcac78c79658bac496e8a17472d64d558cd2227fdc768aa77f877fe40 \ --hash=sha256:04bb5eef8f460d13b8c0084ea5a9d3aab2c0573991c880c0a34a56bb14951d30 \ @@ -2706,6 +2718,24 @@ networkx==3.6.1 \ # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # torch +numcodecs==0.15.1 \ + --hash=sha256:1d471a1829ce52d3f365053a2bd1379e32e369517557c4027ddf5ac0d99c591e \ + --hash=sha256:1dfdea4a67108205edfce99c1cb6cd621343bc7abb7e16a041c966776920e7de \ + --hash=sha256:698f1d59511488b8fe215fadc1e679a4c70d894de2cca6d8bf2ab770eed34dfd \ + --hash=sha256:a34f0fe5e5f3b837bbedbeb98794a6d4a12eeeef8d4697b523905837900b5e1c \ + --hash=sha256:a4f7bdb26f1b34423cb56d48e75821223be38040907c9b5954eeb7463e7eb03c \ + --hash=sha256:b0a9d9cd29a0088220682dda4a9898321f7813ff7802be2bbb545f6e3d2f10ff \ + --hash=sha256:bef8c8e64fab76677324a07672b10c31861775d03fc63ed5012ca384144e4bb9 \ + --hash=sha256:c3a09e22140f2c691f7df26303ff8fa2dadcf26d7d0828398c0bc09b69e5efa3 \ + --hash=sha256:cdfaef9f5f2ed8f65858db801f1953f1007c9613ee490a1c56233cd78b505ed5 \ + --hash=sha256:daed6066ffcf40082da847d318b5ab6123d69ceb433ba603cb87c323a541a8bc \ + --hash=sha256:e2547fa3a7ffc9399cfd2936aecb620a3db285f2630c86c8a678e477741a4b3c \ + --hash=sha256:e3d82b70500cf61e8d115faa0d0a76be6ecdc24a16477ee3279d711699ad85f3 \ + --hash=sha256:eeed77e4d6636641a2cc605fbc6078c7a8f2cc40f3dfa2b3f61e52e6091b04ff +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# -r python/requirements/ml/py313/data-test-requirements.txt +# zarr # via # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # bokeh @@ -2724,6 +2754,7 @@ networkx==3.6.1 \ # keras # ml-dtypes # modin +# numcodecs # pandas # pylance # ray @@ -2741,6 +2772,7 @@ networkx==3.6.1 \ # torchtext # torchvision # webdataset +# zarr nvidia-nccl-cu12==2.27.5; platform_machine != "aarch64" and sys_platform == "linux" \ --hash=sha256:ad730cf15cb5d25fe849c6e6ca9eb5b76db16a80f13f425ac68d8e2e55624457 # via @@ -5453,6 +5485,12 @@ yarl==1.23.0 \ # -c /tmp/ray-deps/requirements_compiled_py3.13.txt # aiohttp # delta-sharing +zarr==2.18.7 \ + --hash=sha256:ac3dc4033e9ae4e9d7b5e27c97ea3eaf1003cc0a07f010bd83d5134bf8c4b223 \ + --hash=sha256:b2b8f66f14dac4af66b180d2338819981b981f70e196c9a66e6bfaa9e59572f5 +# via +# -c /tmp/ray-deps/requirements_compiled_py3.13.txt +# -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 \ --hash=sha256:5796e36bd0e0cc8cf0fbc1ace6a68912611c1dbd74750a3f3026b9b9d6a327ae \ --hash=sha256:e321e263b6a97aafc0790c3cfb3c04656b7066e6738c37fffcca95d803c9fba5 diff --git a/python/ray/data/BUILD.bazel b/python/ray/data/BUILD.bazel index 66db80388e06..ce8016c9e38b 100644 --- a/python/ray/data/BUILD.bazel +++ b/python/ray/data/BUILD.bazel @@ -1836,6 +1836,20 @@ py_test( ], ) +py_test( + name = "test_zarrv2", + size = "medium", + srcs = ["tests/datasource/test_zarrv2.py"], + tags = [ + "exclusive", + "team:data", + ], + deps = [ + ":conftest", + "//:ray_lib", + ], +) + py_test( name = "test_zip", size = "small", diff --git a/python/ray/data/__init__.py b/python/ray/data/__init__.py index cc95da357aa0..cb81c8becfee 100644 --- a/python/ray/data/__init__.py +++ b/python/ray/data/__init__.py @@ -82,6 +82,7 @@ read_unity_catalog, read_videos, read_webdataset, + read_zarr, ) # Module-level cached global functions for callable classes. It needs to be defined here @@ -191,6 +192,7 @@ "read_tfrecords", "read_unity_catalog", "read_videos", + "read_zarr", "read_webdataset", "KafkaAuthConfig", "Preprocessor", diff --git a/python/ray/data/_internal/datasource/zarrv2_datasource.py b/python/ray/data/_internal/datasource/zarrv2_datasource.py new file mode 100644 index 000000000000..e83b3d4b4030 --- /dev/null +++ b/python/ray/data/_internal/datasource/zarrv2_datasource.py @@ -0,0 +1,685 @@ +from __future__ import annotations + +import logging +import math +import numbers +from collections.abc import Callable, Iterable +from dataclasses import dataclass +from typing import TYPE_CHECKING, Any, List, Optional + +import numpy as np + +from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder +from ray.data._internal.util import ( + _check_import, + _is_local_scheme, + iterate_with_retry, +) +from ray.data.block import Block, BlockMetadata +from ray.data.datasource.datasource import Datasource, ReadTask + +logger = logging.getLogger(__name__) + +if TYPE_CHECKING: + from fsspec.spec import AbstractFileSystem + from pyarrow import fs as pyarrow_fs + from zarr import Array as ZarrArray + from zarr.hierarchy import Group as ZarrGroup + + from ray.data.context import DataContext + + ZarrRoot = ZarrGroup | ZarrArray + + +@dataclass(frozen=True) +class ZarrArrayMeta: + """``shape``/``chunks``/``dtype`` for a single Zarr v2 array.""" + + shape: tuple[int, ...] + chunks: tuple[int, ...] + dtype: str + + @classmethod + def from_zarr_array(cls, arr: "ZarrArray") -> ZarrArrayMeta: + return cls( + shape=tuple(int(s) for s in arr.shape), + chunks=tuple(int(c) for c in arr.chunks), + dtype=str(arr.dtype), + ) + + @property + def rank(self) -> int: + return len(self.shape) + + @property + def itemsize(self) -> int: + return np.dtype(self.dtype).itemsize + + def effective_chunks( + self, + array_name: str, + user_chunk_shape: tuple[int, ...] | dict[str, tuple[int, ...]] | None, + ) -> tuple[int, ...]: + """Resolve the user's ``chunk_shapes`` override(s) against this array's chunks. + + A single sequence overrides the leading axes (trailing axes keep the + native chunks), so one ``chunk_shapes=[16]`` applies across arrays of + different ranks. A dict maps array path → that array's override prefix; + arrays absent from it keep native chunks. ``None`` keeps native chunks; + an override longer than the array's rank raises ``ValueError``. + """ + if user_chunk_shape is None: + return self.chunks + + if isinstance(user_chunk_shape, dict): + user_chunk_shape = user_chunk_shape.get(array_name) + if user_chunk_shape is None: + return self.chunks + if len(user_chunk_shape) > self.rank: + raise ValueError( + f"chunk_shapes override for array {array_name!r} has " + f"{len(user_chunk_shape)} axes but array of shape " + f"{self.shape!r} has rank {self.rank}. Each chunk_shapes " + f"override may not be longer than its target array's rank." + ) + return user_chunk_shape + self.chunks[len(user_chunk_shape) :] + + def grid_shape(self, chunks: tuple[int, ...]) -> tuple[int, ...]: + """Number of chunks along each axis under the given chunk shape.""" + return tuple(math.ceil(s / c) for s, c in zip(self.shape, chunks)) + + def chunk_slices( + self, chunk_index: tuple[int, ...], chunks: tuple[int, ...] + ) -> tuple[tuple[int, int], ...]: + """Per-axis ``(start, stop)`` for ``array[chunk_index]`` under ``chunks``. + + Trailing-edge chunks are clamped to ``shape[i]``, so they may be + shorter than ``chunks[i]``. No padding is applied. + """ + return tuple( + (i * c, min((i + 1) * c, s)) + for i, c, s in zip(chunk_index, chunks, self.shape) + ) + + +# --------------------------------------------------------------------------- +# Chunk reading +# --------------------------------------------------------------------------- + + +def _read_chunk( + root: ZarrRoot, + array_name: str, + chunk_slices: tuple[tuple[int, int], ...], + retry_match: Optional[List[str]] = None, +) -> np.ndarray: + """Read ``array[chunk_slices]`` as an ndarray. + + The underlying filesystem's own retry policy still applies underneath. + """ + + def _read() -> np.ndarray: + indexer = tuple(slice(s, e) for s, e in chunk_slices) + arr = root if array_name == "" else root[array_name] + return np.asarray(arr[indexer]) + + if not retry_match: + return _read() + # TODO(Artur): This would be more elegant with a general retry helper for non-iterables. + return next( + iterate_with_retry( + lambda: [_read()], description="read a Zarr chunk", match=retry_match + ) + ) + + +@dataclass(frozen=True) +class _ChunkRange: + """A contiguous slice ``[flat_start, flat_stop)`` of an array's chunk grid. + + The flat indices address the row-major flattening of the chunk grid; the + read fn unravels each to an N-D ``chunk_index`` lazily on the worker. Keeping + a range (not a materialized per-chunk list) makes read-task planning + O(parallelism) rather than O(total chunks) -- important for stores with very + many chunks. + """ + + array_name: str + meta: ZarrArrayMeta + chunks: tuple[int, ...] + grid: tuple[int, ...] + flat_start: int + flat_stop: int + + +@dataclass(frozen=True) +class _AlignedChunkDescriptor: + """One wide row: a global axis-0 range ``[t_start, t_stop)`` across the + aligned arrays. With ``overlap > 0`` the row's data extends to + ``t_stop_data = min(t_stop + overlap, shape[0])`` (lookahead so windows + starting in this row reach their tail without crossing a row boundary). + """ + + chunk_index: int + t_start: int + t_stop: int + t_stop_data: int + + +def _create_read_fn( + chunk_range: _ChunkRange, + root: ZarrRoot, + per_task_row_limit: Optional[int], + retry_match: Optional[List[str]], +) -> Callable[[], Iterable[Block]]: + """Build a callable that materializes one block for a chunk-grid range. + + This is the case where arrays are not aligned. Chunks are enumerated lazily + (on the worker) from ``chunk_range``. ``per_task_row_limit`` caps how many + chunks this task reads so a downstream ``limit`` reads only what it needs + (``None`` reads the whole range). + """ + cr = chunk_range + stop = cr.flat_stop + if per_task_row_limit is not None: + stop = min(stop, cr.flat_start + per_task_row_limit) + + def read_fn() -> Iterable[Block]: + builder = DelegatingBlockBuilder() + for flat_index in range(cr.flat_start, stop): + chunk_index = tuple(int(i) for i in np.unravel_index(flat_index, cr.grid)) + chunk_slices = cr.meta.chunk_slices(chunk_index, cr.chunks) + builder.add( + { + "array": cr.array_name, + "chunk_index": chunk_index, + "chunk_slices": chunk_slices, + "chunk": _read_chunk( + root, cr.array_name, chunk_slices, retry_match + ), + } + ) + yield builder.build() + + return read_fn + + +def _create_aligned_read_fn( + batch: list[_AlignedChunkDescriptor], + aligned_array_names: list[str], + root: ZarrRoot, + per_task_row_limit: Optional[int], + retry_match: Optional[List[str]], +) -> Callable[[], Iterable[Block]]: + """Build a callable for aligned (wide-row) reads. + + Each output row carries ``t_start``, ``t_stop``, and one column per + aligned array holding that array's ``[t_start:t_stop, ...]`` slice at + its natural shape (edge rows may be shorter). All arrays in one row + share the same axis-0 range. + + This is the case where arrays are aligned on axis 0. ``per_task_row_limit`` + caps how many rows this task reads (``None`` reads the whole batch). + """ + batch = batch[:per_task_row_limit] + + def read_fn() -> Iterable[Block]: + builder = DelegatingBlockBuilder() + for d in batch: + row: dict[str, Any] = {"t_start": d.t_start, "t_stop": d.t_stop} + for name in aligned_array_names: + row[name] = _read_chunk( + root, name, ((d.t_start, d.t_stop_data),), retry_match + ) + builder.add(row) + yield builder.build() + + return read_fn + + +def _is_positive_int(x) -> bool: + """True for a positive integer, including NumPy integers; False for bool.""" + return not isinstance(x, bool) and isinstance(x, numbers.Integral) and int(x) > 0 + + +def _validate_chunk_shapes_dict(chunk_shapes: dict) -> dict[str, tuple[int, ...]]: + """Normalize chunk_shapes keys to store paths and validate their values.""" + from zarr.util import normalize_storage_path + + normalized: dict[str, tuple[int, ...]] = {} + for k, v in chunk_shapes.items(): + if ( + not isinstance(v, (tuple, list)) + or not v + or not all(_is_positive_int(x) for x in v) + ): + raise ValueError( + f"chunk_shapes[{k!r}] must be a non-empty sequence of positive " + f"integers (list or tuple), got {v!r}" + ) + normalized[normalize_storage_path(k)] = tuple(int(x) for x in v) + return normalized + + +# --------------------------------------------------------------------------- +# Datasource +# --------------------------------------------------------------------------- + + +class ZarrV2Datasource(Datasource): + """Reads one or more Zarr v2 arrays into a Ray Data ``Dataset``. + + Emits long-form rows (one per chunk per array) or, with + ``align_axis_0=True``, wide rows (one per axis-0 chunk, one column per + array). See :func:`ray.data.read_zarr` for the row schemas and full API. + """ + + def __init__( + self, + path: str, + filesystem: pyarrow_fs.FileSystem | AbstractFileSystem | None = None, + chunk_shapes: dict[str, list] | list | None = None, + array_paths: list[str] | None = None, + allow_full_metadata_scan: bool = False, + align_axis_0: bool = False, + overlap: int = 0, + ) -> None: + super().__init__() + _check_import(self, module="zarr", package="zarr") + import zarr + + _check_import(self, module="fsspec", package="fsspec") + from fsspec.spec import AbstractFileSystem + + if int(zarr.__version__.split(".")[0]) >= 3: + raise ImportError( + f"read_zarr supports zarr-python 2.x (Zarr v2 stores), but found " + f"zarr=={zarr.__version__}. Install a compatible version with " + f"`pip install 'zarr<3'`." + ) + + self.allow_full_metadata_scan = allow_full_metadata_scan + self.paths = [str(path)] + # ``local://`` stores live on the driver's local disk, so pin reads to + # the driver node (workers on other nodes can't see those files). + self._supports_distributed_reads = not _is_local_scheme(self.paths) + + # Resolve filesystem + store path. The order of precedence: + # 1. Explicit ``filesystem=`` always wins. + # 2. ``.zip`` URL/path: auto-wrap with fsspec's ZipFileSystem. + # 3. Otherwise delegate to Ray Data's standard URL to filesystem + # helper (the same one every other ``read_*`` API uses). + # "store path" is the path to the Zarr store, relative to the filesystem root. + # It is used to construct the Zarr root object. + if filesystem is None and self.paths[0].endswith(".zip"): + import fsspec + + self._fs = fsspec.filesystem("zip", fo=self.paths[0]) + self._store_path = "" + elif filesystem is None: + from fsspec.implementations.arrow import ArrowFSWrapper + + from ray.data.datasource.path_util import ( + _resolve_paths_and_filesystem, + ) + + resolved_paths, pa_fs = _resolve_paths_and_filesystem([self.paths[0]]) + self._fs = ArrowFSWrapper(pa_fs) + self._store_path = resolved_paths[0].rstrip("/") + else: + from pyarrow.fs import FileSystem + + if isinstance(filesystem, AbstractFileSystem): + self._fs = filesystem + elif isinstance(filesystem, FileSystem): + from fsspec.implementations.arrow import ArrowFSWrapper + + self._fs = ArrowFSWrapper(filesystem) + else: + raise TypeError( + f"filesystem must be pyarrow.fs.FileSystem or " + f"fsspec.spec.AbstractFileSystem, got " + f"{type(filesystem).__name__}" + ) + from fsspec.implementations.zip import ZipFileSystem + + if isinstance(self._fs, ZipFileSystem) and self.paths[0].endswith(".zip"): + # An explicit archive filesystem: the store is the archive root, + # not a ``.zip``-named entry inside it. + self._store_path = "" + else: + from fsspec.core import split_protocol + + _, store_path = split_protocol(self.paths[0]) + self._store_path = store_path.rstrip("/") + + if chunk_shapes is not None and not isinstance( + chunk_shapes, (tuple, list, dict) + ): + raise ValueError( + f"chunk_shapes must be a non-empty sequence of positive " + f"integers (list or tuple), or a dict, got {chunk_shapes!r}" + ) + + self.chunk_shapes: tuple[int, ...] | dict[str, tuple[int, ...]] | None = None + if chunk_shapes is not None: + if isinstance(chunk_shapes, dict): + self.chunk_shapes = _validate_chunk_shapes_dict(chunk_shapes) + else: + if not chunk_shapes or not all( + _is_positive_int(x) for x in chunk_shapes + ): + raise ValueError( + "chunk_shapes must be a non-empty sequence of positive integers " + f"(list or tuple), got {chunk_shapes!r}" + ) + + self.chunk_shapes = tuple(int(x) for x in chunk_shapes) + + # Open the store with zarr (consolidated metadata when available). + # Detect consolidation by *trying* ``open_consolidated``. + store = self._fs.get_mapper(self._store_path) + try: + self.root = zarr.open_consolidated(store, mode="r") + self._consolidated = True + except KeyError: + self.root = zarr.open(store, mode="r") + self._consolidated = False + + self._metadata_by_path = self._load_metadata(array_paths) + if not self._metadata_by_path: + raise ValueError( + f"No arrays discovered in Zarr store at {self.paths[0]!r}." + ) + + # Reject per-array overrides that do not correspond to any selected + # array in this read. + if isinstance(self.chunk_shapes, dict): + unknown_chunk_shape_keys = sorted( + set(self.chunk_shapes) - set(self._metadata_by_path) + ) + if unknown_chunk_shape_keys: + raise ValueError( + f"Unknown array path(s) in chunk_shapes: {unknown_chunk_shape_keys}" + ) + + if not align_axis_0: + self._aligned_array_names = None + else: + scalar_arrays = sorted( + name for name, meta in self._metadata_by_path.items() if not meta.shape + ) + if scalar_arrays: + raise ValueError( + f"align_axis_0=True requires every selected array to have " + f"at least one axis, but these are 0-D (scalar): " + f"{scalar_arrays}. Drop them with array_paths=[...]." + ) + shape0_by_array = { + name: meta.shape[0] for name, meta in self._metadata_by_path.items() + } + if len(set(shape0_by_array.values())) > 1: + raise ValueError( + f"All selected arrays must share shape[0] when " + f"align_axis_0=True. Got: {shape0_by_array}. Pass a " + f"shape-compatible subset via array_paths=[...]." + ) + self._aligned_array_names = list(self._metadata_by_path.keys()) + + # Validate overlap. Only meaningful when arrays are co-iterated as + # wide rows, since the trailing lookahead is exposed via the + # per-array column being longer than ``t_stop - t_start``. + if not isinstance(overlap, int) or overlap < 0: + raise ValueError(f"overlap must be a non-negative integer, got {overlap!r}") + if overlap and self._aligned_array_names is None: + raise ValueError( + "overlap requires align_axis_0=True. In the default long-form " + "(chunk-per-row) mode, there's no wide row to extend forward — " + "the ``chunk_slices`` column on each chunk row already exposes " + "the global axis-0 range." + ) + self.overlap = overlap + + # Resolve per-array chunk geometry. ``effective_chunks`` raises a + # ``ValueError`` if a shared ``chunk_shapes`` prefix or any per-array + # ``chunk_shapes`` override is longer than the target array's rank — + # so this loop is also where rank validation happens. + self._array_chunks: dict[str, tuple[int, ...]] = {} + self._array_grids: dict[str, tuple[int, ...]] = {} + for name, meta in self._metadata_by_path.items(): + chunks = meta.effective_chunks(name, self.chunk_shapes) + self._array_chunks[name] = chunks + self._array_grids[name] = meta.grid_shape(chunks) + + # If aligned, all listed arrays must share the same axis-0 chunk size + # so each wide row corresponds to one axis-0 step across every array. + if self._aligned_array_names is not None: + axis_0_chunks = { + name: self._array_chunks[name][0] for name in self._aligned_array_names + } + unique = set(axis_0_chunks.values()) + if len(unique) > 1: + raise ValueError( + f"Aligned arrays must share the same axis-0 chunk size. " + f"Got: {axis_0_chunks}. Pass chunk_shapes=[N] (or a " + f"per-array chunk_shapes dict that resolves all aligned " + f"arrays to the same axis-0 prefix) to re-tile them." + ) + + @property + def supports_distributed_reads(self) -> bool: + return self._supports_distributed_reads + + def estimate_inmemory_data_size(self) -> Optional[int]: + """Total bytes = sum over selected arrays of ``prod(shape) * itemsize``.""" + return sum( + math.prod(meta.shape) * meta.itemsize + for meta in self._metadata_by_path.values() + ) + + def get_read_tasks( + self, + parallelism: int, + per_task_row_limit: Optional[int] = None, + data_context: Optional["DataContext"] = None, + ) -> List[ReadTask]: + """Enumerate every chunk and wrap it (or batches of chunks) in ReadTasks.""" + from ray.data.context import DataContext + + retry_match = (data_context or DataContext.get_current()).retried_io_errors + if self._aligned_array_names is not None: + return self._get_aligned_read_tasks( + parallelism, per_task_row_limit, retry_match + ) + return self._get_long_form_read_tasks( + parallelism, per_task_row_limit, retry_match + ) + + def _get_long_form_read_tasks( + self, + parallelism: int, + per_task_row_limit: Optional[int], + retry_match: Optional[List[str]], + ) -> List[ReadTask]: + read_tasks: List[ReadTask] = [] + for name, meta in self._metadata_by_path.items(): + chunks = self._array_chunks[name] + grid = self._array_grids[name] + n_chunks = math.prod(grid) + if n_chunks == 0: + continue + # Split the chunk grid into contiguous flat-index ranges. This is + # O(n_tasks), not O(n_chunks): we never materialize a per-chunk list + # on the driver -- the read fn unravels chunks lazily on the worker. + n_tasks = max(1, min(parallelism, n_chunks)) + batch_size = math.ceil(n_chunks / n_tasks) + for flat_start in range(0, n_chunks, batch_size): + flat_stop = min(flat_start + batch_size, n_chunks) + chunk_range = _ChunkRange( + name, meta, chunks, grid, flat_start, flat_stop + ) + read_tasks.append( + ReadTask( + _create_read_fn( + chunk_range, self.root, per_task_row_limit, retry_match + ), + BlockMetadata( + num_rows=flat_stop - flat_start, + size_bytes=self._estimate_range_mem_size(chunk_range), + input_files=(self.paths[0],), + exec_stats=None, + ), + per_task_row_limit=per_task_row_limit, + ) + ) + return read_tasks + + def _estimate_range_mem_size(self, chunk_range: _ChunkRange) -> int: + """Upper-bound in-memory bytes for a chunk-grid range. + + Assumes a full-size chunk per index; trailing-edge chunks are smaller, + so this slightly over-estimates. O(1) -- it does not enumerate the range. + """ + n = chunk_range.flat_stop - chunk_range.flat_start + return n * math.prod(chunk_range.chunks) * chunk_range.meta.itemsize + + def _get_aligned_read_tasks( + self, + parallelism: int, + per_task_row_limit: Optional[int], + retry_match: Optional[List[str]], + ) -> List[ReadTask]: + """Aligned read tasks. See :meth:`get_read_tasks` for semantics.""" + assert self._aligned_array_names is not None + # All aligned arrays share the same axis-0 chunk size (validated in + # ``__init__``) and the same shape[0]. Read the geometry off the first. + first_name = self._aligned_array_names[0] + axis_0_chunk = self._array_chunks[first_name][0] + shape0 = self._metadata_by_path[first_name].shape[0] + + descriptors = [ + _AlignedChunkDescriptor( + chunk_index=i, + t_start=i * axis_0_chunk, + t_stop=min((i + 1) * axis_0_chunk, shape0), + t_stop_data=min((i + 1) * axis_0_chunk + self.overlap, shape0), + ) + for i in range(math.ceil(shape0 / axis_0_chunk)) + ] + if not descriptors: + return [] + + n_tasks = max(1, min(parallelism, len(descriptors))) + batch_size = math.ceil(len(descriptors) / n_tasks) + + read_tasks: List[ReadTask] = [] + for start in range(0, len(descriptors), batch_size): + batch = descriptors[start : start + batch_size] + read_tasks.append( + ReadTask( + _create_aligned_read_fn( + batch, + self._aligned_array_names, + self.root, + per_task_row_limit, + retry_match, + ), + BlockMetadata( + num_rows=len(batch), + size_bytes=self._estimate_aligned_batch_mem_size(batch), + input_files=(self.paths[0],), + exec_stats=None, + ), + per_task_row_limit=per_task_row_limit, + ) + ) + return read_tasks + + def _estimate_aligned_batch_mem_size( + self, batch: list[_AlignedChunkDescriptor] + ) -> int: + """Sum bytes across all (row, aligned-array) pairs in a wide-row batch. + + Accounts for the trailing overlap data each row carries: the row's + per-array slice covers ``[t_start, t_stop_data)``, not just + ``[t_start, t_stop)``. + """ + assert self._aligned_array_names is not None + return sum( + (desc.t_stop_data - desc.t_start) + * (math.prod(meta.shape[1:]) if len(meta.shape) > 1 else 1) + * meta.itemsize + for desc in batch + for meta in ( + self._metadata_by_path[name] for name in self._aligned_array_names + ) + ) + + def _load_metadata(self, array_paths) -> dict[str, ZarrArrayMeta]: + """Read ``shape``/``chunks``/``dtype`` for the selected arrays off ``self.root``. + + zarr validated the store's metadata when it was opened, so this only + adapts the resulting ``zarr.Array`` objects. Discovery uses consolidated + metadata when present, then explicit ``array_paths``, then an optional + full scan (``allow_full_metadata_scan``). If ``array_paths`` is given, + the discovered set is filtered down to it. + """ + import zarr + from zarr.util import normalize_storage_path + + root = self.root + requested = ( + {normalize_storage_path(p) for p in array_paths} if array_paths else None + ) + + if isinstance(root, zarr.Array): + # A store that is itself an array exposes exactly one path: "" (root). + # Reject any requested path that isn't the root so a bad ``array_paths`` + # fails loudly here instead of silently returning the root array. + if requested is not None and requested != {""}: + raise ValueError( + f"This Zarr store is a single root-level array (path ''), " + f"but array_paths={array_paths!r} requested other path(s). " + f"Pass array_paths=[''] or omit it." + ) + return {"": ZarrArrayMeta.from_zarr_array(root)} + + if not self._consolidated and not self.allow_full_metadata_scan: + if requested is None: + raise ValueError( + "No array_paths were provided and this Zarr store does not " + "contain .zmetadata. Pass array_paths=[...] or set " + "allow_full_metadata_scan=True." + ) + out: dict[str, ZarrArrayMeta] = {} + for raw in array_paths: + name = normalize_storage_path(raw) + try: + arr = root[name] + except KeyError as e: + raise ValueError( + f"Array path {raw!r} not found in Zarr store." + ) from e + if not isinstance(arr, zarr.Array): + raise ValueError(f"Array path {raw!r} is a group, not an array.") + out[name] = ZarrArrayMeta.from_zarr_array(arr) + return out + + all_arrays: dict[str, ZarrArrayMeta] = {} + + def _collect(name: str, obj) -> None: + if isinstance(obj, zarr.Array): + all_arrays[name] = ZarrArrayMeta.from_zarr_array(obj) + + root.visititems(_collect) + + if requested is not None: + missing = sorted(requested - all_arrays.keys()) + if missing: + raise ValueError( + f"Array(s) not found: {', '.join(repr(m) for m in missing)}. " + f"Available: {', '.join(repr(a) for a in sorted(all_arrays))}" + ) + all_arrays = {k: v for k, v in all_arrays.items() if k in requested} + + return all_arrays diff --git a/python/ray/data/read_api.py b/python/ray/data/read_api.py index e885578da0c4..7e248d0e9ea8 100644 --- a/python/ray/data/read_api.py +++ b/python/ray/data/read_api.py @@ -66,6 +66,7 @@ from ray.data._internal.datasource.uc_datasource import UnityCatalogConnector from ray.data._internal.datasource.video_datasource import VideoDatasource from ray.data._internal.datasource.webdataset_datasource import WebDatasetDatasource +from ray.data._internal.datasource.zarrv2_datasource import ZarrV2Datasource from ray.data._internal.delegating_block_builder import DelegatingBlockBuilder from ray.data._internal.logical.interfaces import LogicalPlan from ray.data._internal.logical.operators import ( @@ -121,6 +122,7 @@ import daft import dask import datasets + import fsspec.spec import mars import modin import pandas @@ -925,6 +927,143 @@ def read_videos( ) +@PublicAPI(stability="alpha") +def read_zarr( + path: str, + *, + filesystem: "pyarrow.fs.FileSystem | fsspec.spec.AbstractFileSystem | None" = None, + chunk_shapes: dict[str, list] | list | None = None, + array_paths: list[str] | None = None, + allow_full_metadata_scan: bool = False, + align_axis_0: bool = False, + overlap: int = 0, + concurrency: Optional[int] = None, + override_num_blocks: Optional[int] = None, + num_cpus: Optional[float] = None, + num_gpus: Optional[float] = None, + memory: Optional[float] = None, + ray_remote_args: Optional[Dict[str, Any]] = None, +): + """Creates a :class:`~ray.data.Dataset` from a Zarr v2 store. + + By default each row is one chunk of one array (long-form), with columns + ``array``, ``chunk_index``, ``chunk_slices``, and ``chunk``. With + ``align_axis_0=True``, each row is one axis-0 chunk with ``t_start``, + ``t_stop``, and one column per selected array (wide-form), for arrays that + share ``shape[0]``. + + For the output schemas, chunk re-tiling, aligned and sliding-window reads, + metadata discovery, custom codecs, and cloud-storage setup, see + :ref:`Working with Zarr `. + + .. note:: + + In long-form the ``chunk`` column is a tensor, and tensors of different + rank or dtype can't be combined into one batch. Consume long-form per + array (filter on the ``array`` column first), or, when arrays are + row-aligned (share ``shape[0]``), use ``align_axis_0=True`` so each + array is its own column -- which is batch-safe. + + Examples: + Read every array at its native chunking (long-form, one row per chunk): + + >>> import ray + >>> ds = ray.data.read_zarr( # doctest: +SKIP + ... "s3://anonymous@ray-example-data/mnist-tiny.zarr", + ... ) + + Aligned read -- paired ``(images, labels)`` per row; ``align_axis_0`` + requires all selected arrays to share ``shape[0]``: + + >>> ds = ray.data.read_zarr( # doctest: +SKIP + ... "s3://anonymous@ray-example-data/mnist-tiny.zarr", + ... align_axis_0=True, + ... chunk_shapes=[50], + ... ) + + Per-array chunk overrides -- re-tile only the selected arrays: + + >>> ds = ray.data.read_zarr( # doctest: +SKIP + ... "s3://anonymous@ray-example-data/mnist-tiny.zarr", + ... chunk_shapes={"images": [50], "labels": [50]}, + ... ) + + Args: + path: Path to the Zarr v2 store. + filesystem: The filesystem + implementation to read from. PyArrow filesystems are specified in the + `pyarrow docs `_. Specify this parameter if + you need to provide specific configurations to the filesystem. By default, + the filesystem is automatically selected based on the scheme of the paths. + For example, if the path begins with ``s3://``, the `S3FileSystem` is used. + Also acceptsan :class:`fsspec.spec.AbstractFileSystem`. + pyarrow filesystems are wrapped internally with + :class:`fsspec.implementations.arrow.ArrowFSWrapper` + chunk_shapes: Optional re-tiling of the leading chunk axes at read + time (see :ref:`Working with Zarr `). Either a + sequence applied as a shared prefix across all selected arrays + (trailing axes keep native chunks), or a dict of per-array + prefixes (arrays absent from it keep native chunks). An override + may not exceed its target array's rank. Defaults to native chunks. + array_paths: Optional list of array paths within the Zarr store to + read. If unspecified, all arrays discovered in the store are + included. + allow_full_metadata_scan: If ``True``, recursively scan the store for + ``.zarray`` files when ``array_paths`` is unspecified and + ``.zmetadata`` is missing. This may be slow or expensive for large + remote stores, so it is disabled by default. + align_axis_0: If ``True``, emit the wide-form schema: one row per + axis-0 chunk with one column per selected array, plus ``t_start`` + and ``t_stop`` columns naming the global axis-0 range. All selected + arrays must share ``shape[0]`` and resolve to the same effective + axis-0 chunk size after ``chunk_shapes`` resolution. Defaults to + ``False`` (long-form, one chunk per row). + overlap: The number of additional axis-0 timesteps to extend each + row's per-array data forward by, clipped at the store end, for + sliding-window pipelines. Only valid with ``align_axis_0=True``. + Defaults to ``0``. See :ref:`Working with Zarr `. + concurrency: The maximum number of Ray tasks to run concurrently. Set this + to control number of tasks to run concurrently. This doesn't change the + total number of tasks run or the total number of output blocks. By default, + concurrency is dynamically decided based on the available resources. + override_num_blocks: Override the number of output blocks from all read tasks. + By default, the number of output blocks is dynamically decided based on + input data size and available resources. You shouldn't manually set this + value in most cases. + num_cpus: The number of CPUs to reserve for each parallel read worker. + num_gpus: The number of GPUs to reserve for each parallel read worker. For + example, specify `num_gpus=1` to request 1 GPU for each parallel read + worker. + memory: The heap memory in bytes to reserve for each parallel read worker. + ray_remote_args: kwargs passed to :meth:`~ray.remote` in the read tasks. + + Returns: + A :class:`~ray.data.Dataset` of long-form chunk rows by default + (``array``, ``chunk_index``, ``chunk_slices``, ``chunk``), or + wide-form aligned rows (``t_start``, ``t_stop``, plus one column + per aligned array) when ``align_axis_0`` is set. + """ + datasource = ZarrV2Datasource( + path=path, + filesystem=filesystem, + chunk_shapes=chunk_shapes, + array_paths=array_paths, + allow_full_metadata_scan=allow_full_metadata_scan, + align_axis_0=align_axis_0, + overlap=overlap, + ) + return read_datasource( + datasource, + ray_remote_args=ray_remote_args, + num_cpus=num_cpus, + num_gpus=num_gpus, + memory=memory, + concurrency=concurrency, + override_num_blocks=override_num_blocks, + ) + + @PublicAPI(stability="alpha") def read_mongo( uri: str, diff --git a/python/ray/data/tests/datasource/test_zarrv2.py b/python/ray/data/tests/datasource/test_zarrv2.py new file mode 100644 index 000000000000..96da15d2ed8b --- /dev/null +++ b/python/ray/data/tests/datasource/test_zarrv2.py @@ -0,0 +1,937 @@ +import logging +import os +from pathlib import Path +from typing import Any + +import fsspec +import numpy as np +import pandas as pd +import pyarrow.fs +import pytest +import zarr +from pytest_lazy_fixtures import lf as lazy_fixture + +import ray +from ray.data._internal.datasource import zarrv2_datasource +from ray.data.block import BlockAccessor +from ray.data.tests.conftest import * # noqa: F401, F403 + + +def _execute_read_tasks(tasks) -> pd.DataFrame: + frames = [ + BlockAccessor.for_block(block).to_pandas() for task in tasks for block in task() + ] + return pd.concat(frames, ignore_index=True) + + +def _reconstruct_array(df: pd.DataFrame, array_name: str) -> np.ndarray: + """Concatenate all chunks of one array from a long-form result frame.""" + sub = df[df["array"] == array_name].sort_values( + "chunk_index", key=lambda col: col.map(tuple) + ) + return np.concatenate(list(sub["chunk"]), axis=0) + + +def _write_real_zarr_store( + store_path: Path, + arrays: dict, # {name: (data, chunks)} +) -> Path: + """Write a real Zarr v2 store from numpy arrays and consolidate metadata.""" + root = zarr.open_group(str(store_path), mode="w") + for name, (data, chunks) in arrays.items(): + root.create_dataset(name, data=data, chunks=chunks, dtype=data.dtype) + zarr.consolidate_metadata(zarr.DirectoryStore(str(store_path))) + return store_path + + +@pytest.fixture +def zarrv2_group_store(tmp_path) -> Path: + """Two arrays at the store root, both 2-D and 1-D, axis-0-aligned (shape[0]==5).""" + return _write_real_zarr_store( + tmp_path / "group.zarr", + { + "images": (np.arange(20, dtype=" Path: + """Single-array store with the array sitting directly at the store root.""" + store_path = tmp_path / "root.zarr" + arr = zarr.open( + str(store_path), + mode="w", + shape=(5, 4), + chunks=(2, 4), + dtype=" Path: + """A store mixing different ranks, shape[0]s, dtypes, and native chunk sizes. + + Mirrors the UMI-style real-world layout where ``data/*`` arrays share an + axis-0 timestep count but differ in everything else, and ``meta/*`` + arrays live in a separate axis-0 universe entirely. The chunk-per-row + datasource handles all of these in one read; nothing has to align. + """ + store_path = tmp_path / "heterogeneous.zarr" + root = zarr.open_group(str(store_path), mode="w") + # 4-D image tensor with tiny axis-0 chunks (1 image per chunk). + root.create_dataset( + "data/camera0_rgb", + data=np.arange(20 * 2 * 2 * 3, dtype="|u1").reshape(20, 2, 2, 3), + chunks=(1, 2, 2, 3), + ) + # 2-D pose array, same shape[0]=20, much larger axis-0 chunks (10). + root.create_dataset( + "data/robot0_eef_pos", + data=np.arange(20 * 3, dtype=" Path: + """Two arrays at the store root, no ``.zmetadata``. + + Exercises the no-``.zmetadata`` code paths (per-array ``.zarray`` + discovery and full-store walk) — the common shape of real-world stores + behind plain HTTPS or other listing-less filesystems. + """ + store_path = tmp_path / "unconsolidated.zarr" + root = zarr.open_group(str(store_path), mode="w") + root.create_dataset( + "images", data=np.arange(20, dtype=" Path: + """Three arrays sharing ``shape[0]=8``, different ranks and native chunks. + + Models the UMI-style case where data arrays co-stride on the timestep + axis but differ in everything else. + """ + store_path = tmp_path / "aligned.zarr" + root = zarr.open_group(str(store_path), mode="w") + root.create_dataset( + "img", + data=np.arange(8 * 4 * 4 * 3, dtype="|u1").reshape(8, 4, 4, 3), + chunks=(2, 4, 4, 3), + ) + root.create_dataset( + "state", + data=np.arange(8 * 3, dtype=" Path: + """A small Zarr store packed into a ``.zip`` for URL-detection tests.""" + src = tmp_path / "src.zarr" + _write_real_zarr_store( + src, + { + "data": (np.arange(12, dtype=" the per-array ``.zarray`` lookup path. + with pytest.raises(ValueError, match="is a group, not an array"): + zarrv2_datasource.ZarrV2Datasource(str(store_path), array_paths=["grp"]) + + +def test_root_array_rejects_non_root_array_paths(zarrv2_root_store): + """A single root-level array rejects array_paths that aren't the root ''.""" + with pytest.raises(ValueError, match="single root-level array"): + zarrv2_datasource.ZarrV2Datasource( + str(zarrv2_root_store), array_paths=["missing"] + ) + + +# --------------------------------------------------------------------------- +# chunk_shapes validation +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "chunk_shapes, match", + [ + ("invalid", "positive integers"), + ({"images": 1}, "positive integers"), + ({"does_not_exist": [2]}, "Unknown array path"), + ], +) +def test_rejects_invalid_chunk_shapes(zarrv2_group_store, chunk_shapes, match): + with pytest.raises(ValueError, match=match): + zarrv2_datasource.ZarrV2Datasource( + str(zarrv2_group_store), chunk_shapes=chunk_shapes + ) + + +@pytest.mark.parametrize( + "chunk_shapes,array_paths,expected", + [ + # No chunk_shapes: every array reads at its native chunk size. + # 4-D image with tiny chunks coexists with 2-D pose with big chunks — + # nothing is forced into a shared min/max. + ( + None, + None, + { + "data/camera0_rgb": (1, 2, 2, 3), + "data/robot0_eef_pos": (10, 3), + "meta/episode_ends": (3,), + }, + ), + # ``[5]`` prefix overrides axis 0 across arrays of all ranks at once. + ( + [5], + None, + { + "data/camera0_rgb": (5, 2, 2, 3), + "data/robot0_eef_pos": (5, 3), + "meta/episode_ends": (5,), + }, + ), + # Length-2 prefix overrides axes 0+1; needs every selected array to + # have rank >= 2, so we filter out ``meta/episode_ends`` (rank 1). + ( + [5, 1], + ["data/camera0_rgb", "data/robot0_eef_pos"], + { + "data/camera0_rgb": (5, 1, 2, 3), + "data/robot0_eef_pos": (5, 1), + }, + ), + # Per-array overrides may retile only some arrays while others keep + # their native chunks. + ( + { + "data/camera0_rgb": [5], + "data/robot0_eef_pos": [7], + }, + None, + { + "data/camera0_rgb": (5, 2, 2, 3), + "data/robot0_eef_pos": (7, 3), + "meta/episode_ends": (3,), + }, + ), + ], +) +def test_chunk_shapes_resolution_across_mixed_rank( + heterogeneous_zarrv2_store, chunk_shapes, array_paths, expected +): + datasource = zarrv2_datasource.ZarrV2Datasource( + str(heterogeneous_zarrv2_store), + chunk_shapes=chunk_shapes, + array_paths=array_paths, + ) + assert datasource._array_chunks == expected + + +# --------------------------------------------------------------------------- +# align_axis_0 (wide-form mode) +# --------------------------------------------------------------------------- + + +def test_align_axis_0_emits_wide_rows(ray_start_regular_shared, aligned_zarrv2_store): + """Wide-row schema: ``t_start``, ``t_stop``, one column per selected array.""" + datasource = zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + align_axis_0=True, + chunk_shapes=[4], + ) + df = _execute_read_tasks(datasource.get_read_tasks(parallelism=4)) + assert set(df.columns) == {"t_start", "t_stop", "img", "state", "label"} + # shape[0]=8, chunk_shapes=[4] -> 2 rows. + assert len(df) == 2 + # Reconstruct each array by concatenating slices in order. + img_recon = np.concatenate(list(df["img"]), axis=0) + assert img_recon.shape == (8, 4, 4, 3) + state_recon = np.concatenate(list(df["state"]), axis=0) + assert state_recon.shape == (8, 3) + label_recon = np.concatenate(list(df["label"]), axis=0) + assert label_recon.shape == (8,) + # t_start/t_stop are correct. + starts = sorted(df["t_start"].tolist()) + stops = sorted(df["t_stop"].tolist()) + assert starts == [0, 4] + assert stops == [4, 8] + + +def test_align_axis_0_column_set(ray_start_regular_shared, aligned_zarrv2_store): + """array_paths selects which arrays are read; aligned mode emits one column + per selected array (plus t_start/t_stop).""" + datasource = zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + array_paths=["img", "state"], + align_axis_0=True, + chunk_shapes=[4], + ) + df = _execute_read_tasks(datasource.get_read_tasks(parallelism=4)) + assert set(df.columns) == {"t_start", "t_stop", "img", "state"} + + +def test_align_axis_0_rejects_misaligned_shape0(heterogeneous_zarrv2_store): + """Misalignment raises with the per-array shape[0] breakdown.""" + with pytest.raises( + ValueError, + match=r"All selected arrays must share shape\[0\]", + ): + zarrv2_datasource.ZarrV2Datasource( + str(heterogeneous_zarrv2_store), + align_axis_0=True, + chunk_shapes=[5], + ) + + +def test_align_axis_0_rejects_divergent_axis_0_chunks(aligned_zarrv2_store): + """If aligned arrays end up with different axis-0 chunks, error clearly. + + The native chunks differ (img=2, state=4, label=8) — without a + ``chunk_shapes`` re-tile they all stay at native, and the validator + catches the mismatch. + """ + with pytest.raises( + ValueError, match="Aligned arrays must share the same axis-0 chunk size" + ): + zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + align_axis_0=True, + ) + + +# --------------------------------------------------------------------------- +# overlap (aligned-mode lookahead) +# --------------------------------------------------------------------------- + + +def test_overlap_extends_chunk_data(ray_start_regular_shared, aligned_zarrv2_store): + """``overlap=N`` makes each row's per-array slice cover ``N`` extra timesteps. + + Aligned store has shape[0]=8, ``chunk_shapes=[4]`` -> rows own [0,4) and [4,8). + With ``overlap=2``, row 0's data covers [0,6) and row 1's data covers [4,8) + (clipped at the store end since 4+4+2 > 8). + """ + datasource = zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + align_axis_0=True, + chunk_shapes=[4], + overlap=2, + ) + df = _execute_read_tasks(datasource.get_read_tasks(parallelism=4)) + # Ownership unchanged: 2 rows of width 4 each. + assert sorted(zip(df["t_start"], df["t_stop"])) == [(0, 4), (4, 8)] + # Data extents: row 0 has 6 timesteps, row 1 has 4 (clipped at shape[0]=8). + rows = sorted(df.to_dict("records"), key=lambda r: r["t_start"]) + assert rows[0]["img"].shape[0] == 6 # 4 owned + 2 overlap + assert rows[0]["state"].shape[0] == 6 + assert rows[1]["img"].shape[0] == 4 # 4 owned + 0 overlap (clipped) + assert rows[1]["state"].shape[0] == 4 + + +def test_overlap_requires_align_axis_0(aligned_zarrv2_store): + """``overlap`` in long-form (no ``align_axis_0``) is a clear error.""" + with pytest.raises(ValueError, match="overlap requires align_axis_0=True"): + zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + overlap=2, + ) + + +def test_overlap_rejects_negative_and_non_int(aligned_zarrv2_store): + bad_values: list[Any] = [-1, 1.5, "two"] + + for bad in bad_values: + with pytest.raises(ValueError, match="overlap must be a non-negative integer"): + zarrv2_datasource.ZarrV2Datasource( + str(aligned_zarrv2_store), + align_axis_0=True, + chunk_shapes=[4], + overlap=bad, + ) + + +def test_chunk_shapes_rejected_when_longer_than_smallest_array( + heterogeneous_zarrv2_store, +): + """A shared ``chunk_shapes`` override longer than a target rank is an error.""" + with pytest.raises( + ValueError, + match=r"chunk_shapes override for array .* has 2 axes but array of shape .* has rank 1", + ): + zarrv2_datasource.ZarrV2Datasource( + str(heterogeneous_zarrv2_store), + chunk_shapes=[2, 2], # OK for 2-D and 4-D, fails for 1-D episode_ends + ) + + +# --------------------------------------------------------------------------- +# Filesystem handling +# --------------------------------------------------------------------------- + + +def test_accepts_pyarrow_fs_filesystem(zarrv2_group_store): + """A pyarrow.fs.FileSystem passed in is wrapped into fsspec internally.""" + datasource = zarrv2_datasource.ZarrV2Datasource( + str(zarrv2_group_store), + filesystem=pyarrow.fs.LocalFileSystem(), + ) + from fsspec.spec import AbstractFileSystem + + assert isinstance(datasource._fs, AbstractFileSystem) + assert set(datasource._metadata_by_path) == {"images", "nested"} + + +def test_rejects_unsupported_filesystem_type(): + """Filesystem that's neither pyarrow.fs nor fsspec raises ``TypeError``.""" + with pytest.raises( + TypeError, + match=r"filesystem must be pyarrow\.fs\.FileSystem or", + ): + zarrv2_datasource.ZarrV2Datasource( + "/tmp/some.zarr", + filesystem="not-a-filesystem", + ) + + +# --------------------------------------------------------------------------- +# .zarr.zip URL support +# --------------------------------------------------------------------------- + + +def test_reads_zarr_zip_local_path(ray_start_regular_shared, zarr_zip_store): + """A local ``.zarr.zip`` path auto-wires fsspec's ZipFileSystem.""" + datasource = zarrv2_datasource.ZarrV2Datasource(str(zarr_zip_store)) + # The store has one array "data" of shape (6, 2) chunks (3, 2) -> 2 chunks. + df = _execute_read_tasks(datasource.get_read_tasks(parallelism=2)) + assert len(df) == 2 + assert set(df["array"]) == {"data"} + recon = _reconstruct_array(df, "data") + np.testing.assert_array_equal(recon, np.arange(12, dtype=" grid (3, 2) = 6 chunks. + _write_real_zarr_store( + store_path, {"a": (np.arange(6 * 4, dtype=" two flat-index ranges; concatenated they must be in order. + df = _execute_read_tasks(datasource.get_read_tasks(parallelism=2)) + got = [tuple(int(x) for x in ci) for ci in df["chunk_index"]] + assert got == list(product(range(3), range(2))) + + +def test_per_task_row_limit_caps_chunks_read( + ray_start_regular_shared, tmp_path, monkeypatch +): + """per_task_row_limit bounds how many chunks a task actually reads, so a + downstream ``limit`` doesn't pull the whole batch's I/O.""" + store_path = tmp_path / "limit.zarr" + _write_real_zarr_store(store_path, {"data": (np.arange(10, dtype=" one task batching all 10 chunks; cap it at 3. + tasks = datasource.get_read_tasks(parallelism=1, per_task_row_limit=3) + blocks = [block for task in tasks for block in task()] + + total_rows = sum(BlockAccessor.for_block(b).num_rows() for b in blocks) + assert total_rows == 3 + # The fix: only 3 chunks were actually read (not all 10, then truncated). + assert len(reads) == 3 + + +def test_read_chunk_retries_transient_io(monkeypatch): + """_read_chunk retries reads whose error matches retry_match (Ray Data's + DataContext.retried_io_errors), then succeeds.""" + monkeypatch.setattr("time.sleep", lambda *_: None) # no backoff in the test + + class _FlakyArray: + attempts = 0 + + def __getitem__(self, _idx): + type(self).attempts += 1 + if self.attempts < 3: + raise OSError("Connection reset by peer") + return np.arange(4, dtype="= 1 + + +def test_align_axis_0_rejects_scalar_array(tmp_path): + """align_axis_0=True with a 0-D (scalar) array must raise a clear error + rather than an IndexError when reading the (empty) axis-0 chunk size.""" + store_path = tmp_path / "scalar.zarr" + root = zarr.open_group(str(store_path), mode="w") + root.create_dataset("vec", data=np.arange(8, dtype=" 1 # actually exercise cross-block unification + schemas = [BlockAccessor.for_block(b).to_arrow().schema for b in blocks] + unified = unify_schemas(schemas) # must not raise + assert {"t_start", "t_stop", "img", "state", "label"}.issubset(set(unified.names)) + + +if __name__ == "__main__": + import sys + + sys.exit(pytest.main(["-v", __file__])) + + +# --------------------------------------------------------------------------- +# Custom codec registration in Ray workers +# --------------------------------------------------------------------------- + + +@pytest.fixture +def fresh_ray(): + """A clean Ray for a test that needs its own ``ray.init`` (e.g. a custom + ``runtime_env``). Unlike ``shutdown_only`` (teardown only), it also shuts + down any pre-existing cluster, so isolation doesn't depend on test order. + """ + if ray.is_initialized(): + ray.shutdown() + yield + if ray.is_initialized(): + ray.shutdown() + + +def test_custom_codec_succeeds_with_worker_setup_hook(fresh_ray, tmp_path): + """Test that we successfully register a custom codec. + + numcodecs' registry is process-local. + """ + import numcodecs + + def _register_codec(): + import numcodecs + import numpy as np + + class _RayZarrTestCodec(numcodecs.abc.Codec): + codec_id = "ray_zarr_test_codec" + + def encode(self, buf): + return bytes(buf) + + def decode(self, buf, out=None): + arr = np.frombuffer(buf, dtype=np.uint8) + if out is not None: + out[:] = arr.view(out.dtype) + return out + return arr.copy() + + numcodecs.register_codec(_RayZarrTestCodec) + + # Register driver-side so we can write the store. + _register_codec() + + store_path = tmp_path / "codec_test.zarr" + arr = zarr.open( + str(store_path), + mode="w", + shape=(8,), + chunks=(4,), + dtype="u1", + compressor=numcodecs.get_codec({"id": "ray_zarr_test_codec"}), + ) + arr[:] = np.arange(8, dtype="u1") + zarr.consolidate_metadata(zarr.DirectoryStore(str(store_path))) + + ray.init( + num_cpus=1, + logging_level=logging.ERROR, + log_to_driver=False, + runtime_env={"worker_process_setup_hook": _register_codec}, + ) + ds = ray.data.read_zarr(str(store_path)) + rows = sorted(ds.take_all(), key=lambda r: tuple(r["chunk_index"])) + recon = np.concatenate([r["chunk"] for r in rows]) + np.testing.assert_array_equal(recon, np.arange(8, dtype="u1")) diff --git a/python/requirements/ml/py313/data-test-requirements.txt b/python/requirements/ml/py313/data-test-requirements.txt index 57d29c6b7ef5..99864cde9979 100644 --- a/python/requirements/ml/py313/data-test-requirements.txt +++ b/python/requirements/ml/py313/data-test-requirements.txt @@ -37,3 +37,10 @@ tensorflow-metadata>=1.17.0 tf-keras torchvision==0.24.0 confluent-kafka +zarr<3 ; python_version >= '3.11' # zarr 2.18.4+ requires py3.11+ (v2 API) +zarr>=2.18,<2.18.4 ; python_version < '3.11' # 2.18.3: last v2 line supporting py3.10 +# numcodecs is zarr's codec dep; 0.14+ dropped py3.10. Pin per-Python with exact +# versions so the markers survive pip-compile -- the compiled-constraint pin must +# stay gated to py3.11+, otherwise the py3.10 data locks can't resolve zarr. +numcodecs==0.15.1 ; python_version >= '3.11' +numcodecs==0.13.1 ; python_version < '3.11' diff --git a/python/requirements_compiled_py3.13.txt b/python/requirements_compiled_py3.13.txt index 9158a652dc4c..306920343621 100644 --- a/python/requirements_compiled_py3.13.txt +++ b/python/requirements_compiled_py3.13.txt @@ -124,6 +124,8 @@ arro3-core==0.8.0 # via deltalake arrow==1.4.0 # via isoduration +asciitree==0.3.3 + # via zarr asgiref==3.9.2 # via # -r python/requirements/py313/test-requirements.txt @@ -462,6 +464,7 @@ deltalake==1.5.0 deprecated==1.3.1 # via # deltalake + # numcodecs # pymoo dill==0.4.1 # via @@ -541,6 +544,7 @@ fasteners==0.20 # via # google-apitools # gsutil + # zarr fastjsonschema==2.21.2 # via nbformat fastrlock==0.8.3 ; sys_platform != "darwin" @@ -1236,6 +1240,10 @@ numba==0.61.2 # via # -r python/requirements/py313/test-requirements.txt # statsforecast +numcodecs==0.15.1 ; python_version >= "3.11" + # via + # -r python/requirements/ml/py313/data-test-requirements.txt + # zarr numexpr==2.14.1 # via # -r python/requirements/ml/py313/rllib-test-requirements.txt @@ -1291,6 +1299,7 @@ numpy==2.2.6 # mujoco # nevergrad # numba + # numcodecs # numexpr # onnx # onnx-ir @@ -1333,6 +1342,7 @@ numpy==2.2.6 # utilsforecast # webdataset # xgboost + # zarr # zoopt nvidia-nccl-cu12==2.27.5 ; platform_system == "Linux" and platform_machine != "aarch64" # via @@ -2683,6 +2693,8 @@ yq==3.2.2 # via # -r python/requirements/lint-requirements.txt # -r python/requirements/py313/test-requirements.txt +zarr==2.18.7 ; python_version >= "3.11" + # via -r python/requirements/ml/py313/data-test-requirements.txt zict==3.0.0 # via distributed zipp==3.23.1