diff --git a/.python-version b/.python-version index 7c7a975f..fdcfcfdf 100644 --- a/.python-version +++ b/.python-version @@ -1 +1 @@ -3.10 \ No newline at end of file +3.12 \ No newline at end of file diff --git a/demos/satija-2020/Snakefile b/demos/satija-2020/Snakefile index c1bd0e25..418159c4 100644 --- a/demos/satija-2020/Snakefile +++ b/demos/satija-2020/Snakefile @@ -27,7 +27,9 @@ rule all: rule convert_to_zarr: input: cells_h5ad=(RAW_DIR / "uf-processed" / GLOBUS_ID / "cluster_marker_genes.h5ad"), - annotations_csv=(RAW_DIR / "annotations_spleen" / f"{GLOBUS_ID}.csv"), + # For some reasons f-strings in python 3.12 don't work as expected + # Reference: https://github.com/NeuralEnsemble/cobrawap/pull/105 + annotations_csv=(RAW_DIR / "annotations_spleen" / str(GLOBUS_ID + ".csv")), cl_obo=(RAW_DIR / "cl.obo") output: cells=directory(PROCESSED_DIR / "satija_2020.h5ad.zarr"), @@ -100,4 +102,4 @@ rule download_cl_obo: shell: ''' curl -L -o {output} {params.file_url} - ''' \ No newline at end of file + ''' diff --git a/docs/data_options.rst b/docs/data_options.rst index a6c15663..b4fd1ede 100644 --- a/docs/data_options.rst +++ b/docs/data_options.rst @@ -129,15 +129,18 @@ This is currently supported for the ``AnnDataWrapper`` class using the ``adata_s vc.widget() -Or, with a Zarr store instance (instead of a local string path to a DirectoryStore): +Or, with a Zarr store instance (instead of a local string path to a LocalStore/DirectoryStore): .. code-block:: python import zarr + from obstore.store import HTTPStore from vitessce import VitessceConfig, AnnDataWrapper - # ... - store = zarr.storage.FSStore("s3://my_bucket/path/to/my_store.adata.zarr") + obs_store = HTTPStore.from_url("https://my_bucket.example.com/path/to/my_store.adata.zarr") + store = zarr.storage.ObjectStore(obs_store, read_only=True) + # or + store = zarr.storage.LocalStore("./path/to/my_store.adata.zarr") vc = VitessceConfig(name="My Vitessce Configuration") vc.add_dataset(name="My Dataset").add_object(AnnDataWrapper( diff --git a/docs/notebooks/spatial_data_blobs.ipynb b/docs/notebooks/spatial_data_blobs.ipynb index 5d410ec1..5586b73e 100644 --- a/docs/notebooks/spatial_data_blobs.ipynb +++ b/docs/notebooks/spatial_data_blobs.ipynb @@ -115,7 +115,7 @@ ")\n", "# Add data to the configuration:\n", "wrapper = SpatialDataWrapper(\n", - " sdata_path=spatialdata_filepath,\n", + " sdata_store=spatialdata_filepath,\n", " # The following paths are relative to the root of the SpatialData zarr store on-disk.\n", " image_path=\"images/blobs_image\",\n", " obs_segmentations_path=\"labels/blobs_labels\",\n", @@ -129,7 +129,7 @@ " }\n", ")\n", "points_wrapper = SpatialDataWrapper(\n", - " sdata_path=spatialdata_filepath,\n", + " sdata_store=spatialdata_filepath,\n", " # The following paths are relative to the root of the SpatialData zarr store on-disk.\n", " obs_points_path=\"points/blobs_points\",\n", " obs_feature_matrix_path=\"tables/table_points/X\", # TODO\n", @@ -221,6 +221,27 @@ "outputs": [], "source": [] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null, @@ -245,7 +266,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/docs/notebooks/spatial_data_xenium_morton.ipynb b/docs/notebooks/spatial_data_xenium_morton.ipynb index e5efa995..9984d13a 100644 --- a/docs/notebooks/spatial_data_xenium_morton.ipynb +++ b/docs/notebooks/spatial_data_xenium_morton.ipynb @@ -47,7 +47,6 @@ "from vitessce.data_utils import (\n", " sdata_morton_sort_points,\n", " sdata_points_process_columns,\n", - " sdata_points_write_bounding_box_attrs,\n", " sdata_points_modify_row_group_size,\n", " sdata_morton_query_rect,\n", ")" @@ -59,7 +58,8 @@ "metadata": {}, "outputs": [], "source": [ - "from spatialdata import read_zarr" + "from spatialdata import read_zarr\n", + "from spatialdata.models import PointsModel" ] }, { @@ -82,7 +82,7 @@ "if not isdir(spatialdata_filepath):\n", " if not isfile(zip_filepath):\n", " os.makedirs(data_dir, exist_ok=True)\n", - " urlretrieve('https://s3.embl.de/spatialdata/spatialdata-sandbox/xenium_rep1_io.zip', zip_filepath)\n", + " urlretrieve('https://data-2.vitessce.io/sdata-datasets/xenium_rep1_io.spatialdata.zarr.zip', zip_filepath)\n", " with zipfile.ZipFile(zip_filepath,\"r\") as zip_ref:\n", " zip_ref.extractall(data_dir)\n", " os.rename(join(data_dir, \"data.zarr\"), spatialdata_filepath)\n", @@ -161,6 +161,15 @@ "sdata.points['transcripts'].head()" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "transformations = sdata['transcripts'].attrs['transform']" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -223,15 +232,7 @@ "metadata": {}, "outputs": [], "source": [ - "sdata[\"transcripts_with_morton_codes\"] = ddf\n", - "sdata.write_element(\"transcripts_with_morton_codes\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Step 4. Write bounding box metadata with `sdata_points_write_bounding_box_attrs`" + "del ddf.attrs['transform']" ] }, { @@ -240,14 +241,15 @@ "metadata": {}, "outputs": [], "source": [ - "sdata_points_write_bounding_box_attrs(sdata, \"transcripts_with_morton_codes\")" + "sdata[\"transcripts_with_morton_codes\"] = PointsModel.parse(ddf, feature_key=\"feature_name\", instance_key=\"cell_id\", transformations=transformations)\n", + "sdata.write_element(\"transcripts_with_morton_codes\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Step 5. Modify the row group sizes of the Parquet files with `sdata_points_modify_row_group_size`" + "### Step 4. Modify the row group sizes of the Parquet files with `sdata_points_modify_row_group_size`" ] }, { @@ -278,7 +280,7 @@ "import pyarrow.parquet as pq\n", "from os.path import join\n", "\n", - "parquet_file = pq.ParquetFile(join(sdata.path, \"points\", \"transcripts_with_morton_codes\", \"points.parquet\", \"part.0.parquet\"))\n", + "parquet_file = pq.ParquetFile(join(sdata.path, \"points\", \"transcripts_with_morton_codes\", \"points.parquet\", \"part.1.parquet\"))\n", "\n", "# Get the number of row groups in this part-0 file.\n", "num_groups = parquet_file.num_row_groups\n", @@ -308,17 +310,29 @@ "wrapper = SpatialDataWrapper(\n", " sdata_path=spatialdata_filepath,\n", " # The following paths are relative to the root of the SpatialData zarr store on-disk.\n", - " image_path=\"images/rasterized\",\n", + " image_path=\"images/morphology_focus\",\n", " table_path=\"tables/table\",\n", " obs_feature_matrix_path=\"tables/table/X\",\n", - " obs_spots_path=\"shapes/cells\",\n", + " #obs_spots_path=\"shapes/cell_circles\",\n", + " obs_segmentations_path=\"shapes/cell_boundaries\",\n", " coordinate_system=\"global\",\n", " coordination_values={\n", " # The following tells Vitessce to consider each observation as a \"spot\"\n", " \"obsType\": \"cell\",\n", " }\n", ")\n", - "dataset = vc.add_dataset(name='MERFISH').add_object(wrapper)\n", + "points_wrapper = SpatialDataWrapper(\n", + " sdata_path=spatialdata_filepath,\n", + " # The following paths are relative to the root of the SpatialData zarr store on-disk.\n", + " obs_points_path=\"points/transcripts_with_morton_codes\",\n", + " obs_feature_matrix_path=\"tables/dense_table/X\",\n", + " coordinate_system=\"global\",\n", + " coordination_values={\n", + " \"obsType\": \"point\",\n", + " \"featureType\": \"gene\",\n", + " }\n", + ")\n", + "dataset = vc.add_dataset(name='MERFISH').add_object(wrapper).add_object(points_wrapper)\n", "\n", "# Add views (visualizations) to the configuration:\n", "spatial = vc.add_view(\"spatialBeta\", dataset=dataset)\n", @@ -326,11 +340,27 @@ "layer_controller = vc.add_view(\"layerControllerBeta\", dataset=dataset)\n", "obs_sets = vc.add_view(\"obsSets\", dataset=dataset)\n", "\n", + "\"\"\"\n", "vc.link_views_by_dict([spatial, layer_controller], {\n", " 'spotLayer': CL([{\n", " 'obsType': 'cell',\n", " }]),\n", "}, scope_prefix=get_initial_coordination_scope_prefix(\"A\", \"obsSpots\"))\n", + "\"\"\"\n", + "\n", + "vc.link_views_by_dict([spatial, layer_controller], {\n", + " 'segmentationLayer': CL([{\n", + " 'segmentationChannel': CL([{\n", + " 'obsType': 'cell',\n", + " }]),\n", + " }]),\n", + "}, scope_prefix=get_initial_coordination_scope_prefix(\"A\", \"obsSegmentations\"))\n", + "\n", + "vc.link_views_by_dict([spatial, layer_controller], {\n", + " 'pointLayer': CL([{\n", + " 'obsType': 'point',\n", + " }]),\n", + "}, scope_prefix=get_initial_coordination_scope_prefix(\"A\", \"obsPoints\"))\n", "\n", "vc.link_views([spatial, layer_controller, feature_list, obs_sets], ['obsType'], [wrapper.obs_type_label])\n", "\n", @@ -379,7 +409,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.14" + "version": "3.12.5" } }, "nbformat": 4, diff --git a/pyproject.toml b/pyproject.toml index 3278718e..5304a161 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,14 +4,14 @@ build-backend = "hatchling.build" [project] name = "vitessce" -version = "3.8.4" +version = "3.9.0" authors = [ { name="Mark Keller", email="mark_keller@hms.harvard.edu" }, ] description = "Jupyter widget facilitating interactive visualization of spatial single-cell data with Vitessce" readme = "README.md" license = {file = "LICENSE"} -requires-python = ">=3.10" +requires-python = ">=3.12" keywords = ["ipython", "jupyter", "widgets"] classifiers = [ 'Development Status :: 4 - Beta', @@ -29,8 +29,7 @@ dependencies = [ 'pandas>=1.1.2', 'black>=21.11b1', 'numpy>=1.21.2', - 'zarr>=2.5.0,<3', - 'numcodecs>=0.5.7,<0.16.0', + 'zarr>=3.0.0', ] [project.optional-dependencies] @@ -61,22 +60,22 @@ docs = [ all = [ # For data_utils 'negspy>=0.2.24', - 'anndata>=0.7.8', + 'anndata>=0.12.10', # scanpy < 1.10.3 does not support numpy >= 2.0.0 and does not # Reference: https://github.com/scverse/scanpy/pull/3115/files 'scanpy>=1.10.2', - 'ome-zarr<0.10.3', + 'ome-zarr>=0.12.2', 'tifffile>=2020.10.1', 'jupyter-server-proxy>=1.5.2', 'oxc-py>=0.1.1', - 'anywidget>=0.9.10', + 'anywidget>=0.11.0', 'uvicorn>=0.17.0', 'ujson>=4.0.1', 'starlette>=0.14.0', 'generate-tiff-offsets>=0.1.9', 'kerchunk>=0.2.6', - 'fsspec', + 'obstore', # aiofiles is not explicitly referenced in our code, # but it is an implicit dependency of starlette==0.14.0. @@ -88,8 +87,7 @@ building = [] testing = [] linting = [] notebook = [ - 'spatialdata>=0.3.0', - 'dask[dataframe]==2024.11.1', + 'spatialdata>=0.7.3', 'marimo', 'starlette>=0.42.0', 'tqdm>=4.1.0', @@ -121,8 +119,7 @@ dev = [ 'boto3>=1.16.30', 'scikit-misc>=0.1.3', 'autopep8>=2.0.2', - 'spatialdata>=0.3.0', - 'dask[dataframe]==2024.11.1', + 'spatialdata>=0.7.3', ] [tool.uv] diff --git a/src/vitessce/data_utils/__init__.py b/src/vitessce/data_utils/__init__.py index 77490c90..7bf7360a 100644 --- a/src/vitessce/data_utils/__init__.py +++ b/src/vitessce/data_utils/__init__.py @@ -22,9 +22,9 @@ sdata_morton_sort_points, # Other helper functions sdata_points_process_columns, - sdata_points_write_bounding_box_attrs, sdata_points_modify_row_group_size, # Functions for querying sdata_morton_query_rect, row_ranges_to_row_indices, + MORTON_CODE_EXTREME_VALUE_INDICATOR, ) diff --git a/src/vitessce/data_utils/entities.py b/src/vitessce/data_utils/entities.py index aa32ddfb..971295b7 100644 --- a/src/vitessce/data_utils/entities.py +++ b/src/vitessce/data_utils/entities.py @@ -196,8 +196,6 @@ def __init__(self, f, profile_paths, assembly='hg38', starting_resolution=5000, num_profiles = len(profile_paths) - compressor = 'default' - chromosomes = [str(chr_name) for chr_name in nc.get_chromorder( assembly)[:25]] # TODO: should more than chr1-chrM be used? chroms_length_arr = np.array( @@ -217,8 +215,8 @@ def __init__(self, f, profile_paths, assembly='hg38', starting_resolution=5000, # Create each resolution group. for resolution in resolutions: chr_shape = (num_profiles, math.ceil(chr_len / resolution)) - chr_group.create_dataset(str( - resolution), shape=chr_shape, dtype="f4", fill_value=np.nan, compressor=compressor) + chr_group.create_array(str( + resolution), shape=chr_shape, dtype="f4", fill_value=np.nan) # f.attrs should contain the properties required for HiGlass's "tileset_info" requests. f.attrs['row_infos'] = [ diff --git a/src/vitessce/data_utils/ome.py b/src/vitessce/data_utils/ome.py index c8f5a5ae..31df691a 100644 --- a/src/vitessce/data_utils/ome.py +++ b/src/vitessce/data_utils/ome.py @@ -67,7 +67,7 @@ def multiplex_img_to_ome_tiff(img_arr, channel_names, output_path, axes="CYX"): tiff_writer.close() -def rgb_img_to_ome_zarr(img_arr, output_path, img_name="Image", chunks=(1, 256, 256), axes="cyx", **kwargs): +def rgb_img_to_ome_zarr(img_arr, output_path, img_name="Image", chunks=(1, 256, 256), axes="cyx", **write_image_kwargs): """ Convert an RGB image to OME-Zarr v0.3. @@ -94,35 +94,36 @@ def rgb_img_to_ome_zarr(img_arr, output_path, img_name="Image", chunks=(1, 256, group=z_root, axes=axes, storage_options=dict(chunks=chunks), - **kwargs, - ) - z_root.attrs["omero"] = { - "name": img_name, - "version": "0.3", - "rdefs": { - "model": "color", - }, - "channels": [ - { - "label": "R", - "color": "FF0000", - "window": default_window - }, - { - "label": "G", - "color": "00FF00", - "window": default_window - }, - { - "label": "B", - "color": "0000FF", - "window": default_window + **write_image_kwargs, + metadata={ + "omero": { + "name": img_name, + "rdefs": { + "model": "color", + }, + "channels": [ + { + "label": "R", + "color": "FF0000", + "window": default_window + }, + { + "label": "G", + "color": "00FF00", + "window": default_window + }, + { + "label": "B", + "color": "0000FF", + "window": default_window + } + ] } - ] - } + } + ) -def multiplex_img_to_ome_zarr(img_arr, channel_names, output_path, img_name="Image", chunks=(1, 256, 256), axes="cyx", channel_colors=None): +def multiplex_img_to_ome_zarr(img_arr, channel_names, output_path, img_name="Image", chunks=(1, 256, 256), axes="cyx", channel_colors=None, **write_image_kwargs): """ Convert a multiplexed image to OME-Zarr v0.3. @@ -153,21 +154,23 @@ def multiplex_img_to_ome_zarr(img_arr, channel_names, output_path, img_name="Ima image=img_arr, group=z_root, axes=axes, - storage_options=dict(chunks=chunks) - ) - z_root.attrs["omero"] = { - "name": img_name, - "version": "0.3", - "rdefs": { - "model": "greyscale", - }, - "channels": [ - { - "label": channel_name, - "color": channel_colors[channel_name] if channel_colors is not None else "FFFFFF", - "window": default_window + storage_options=dict(chunks=chunks), + **write_image_kwargs, + metadata={ + "omero": { + "name": img_name, + "rdefs": { + "model": "greyscale", + }, + "channels": [ + { + "label": channel_name, + "color": channel_colors[channel_name] if channel_colors is not None else "FFFFFF", + "window": default_window + } + for channel_name + in channel_names + ] } - for channel_name - in channel_names - ] - } + } + ) diff --git a/src/vitessce/data_utils/spatialdata_points_zorder.py b/src/vitessce/data_utils/spatialdata_points_zorder.py index b5aa9e4b..c94665c1 100644 --- a/src/vitessce/data_utils/spatialdata_points_zorder.py +++ b/src/vitessce/data_utils/spatialdata_points_zorder.py @@ -6,16 +6,22 @@ import pandas as pd import numpy as np - from spatialdata import get_element_annotators +from spatialdata.models import PointsModel import dask.dataframe as dd -import zarr MORTON_CODE_NUM_BITS = 32 # Resulting morton codes will be stored as uint32. MORTON_CODE_VALUE_MIN = 0 MORTON_CODE_VALUE_MAX = 2**(MORTON_CODE_NUM_BITS / 2) - 1 +# In the first few rows (up to four), we set the morton code +# value to a sentinel value for the rows that contain extrema, +# x_min, x_max, y_min, and/or y_max data extent. +# It is possible for the same row to contain extrema along more than one axis, +# in which case we will only use fewer than four rows for the extrema. +MORTON_CODE_EXTREME_VALUE_INDICATOR = 0 + # -------------------------- # Functions for computing Morton codes for SpatialData points (2D). # -------------------------- @@ -42,16 +48,6 @@ def norm_ddf_to_uint(ddf): [x_min, x_max, y_min, y_max] = [ddf["x"].min().compute(), ddf["x"].max().compute(), ddf["y"].min().compute(), ddf["y"].max().compute()] ddf["x_uint"] = norm_series_to_uint(ddf["x"], x_min, x_max) ddf["y_uint"] = norm_series_to_uint(ddf["y"], y_min, y_max) - - # Insert the bounding box as metadata for the sdata.points[element] Points element dataframe. - # TODO: does anything special need to be done to ensure this is saved to disk? - ddf.attrs["bounding_box"] = { - "x_min": float(x_min), - "x_max": float(x_max), - "y_min": float(y_min), - "y_max": float(y_max), - } - return ddf @@ -140,24 +136,57 @@ def morton_interleave(ddf): def sdata_morton_sort_points(sdata, element): ddf = sdata.points[element] - # Compute morton codes + # Add normalized integer columns and morton codes using the full dataset bounding box. ddf = norm_ddf_to_uint(ddf) ddf["morton_code_2d"] = morton_interleave(ddf) + # Identify the extreme coordinate values. + x_min_val = ddf["x"].min().compute() + x_max_val = ddf["x"].max().compute() + y_min_val = ddf["y"].min().compute() + y_max_val = ddf["y"].max().compute() + + # Build a boolean mask for the extreme rows using coordinate value equality. + is_extreme = ( + (ddf["x"] == x_min_val) | (ddf["x"] == x_max_val) + | (ddf["y"] == y_min_val) | (ddf["y"] == y_max_val) + ) + + extreme_pdf = ddf.loc[is_extreme].compute().reset_index(drop=True) + + # Checking based on the row indices would be cleaner, but dask does not + # guarantee unique indices across partitions. + # Reference: https://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.reset_index.html + # Instead, our `is_extreme` uses the min/max values themselves to identify the matching rows, + # which can potentially return more than four rows if multiple rows match these extreme values. + # TODO: in that case, reorder the extreme rows so the first four form the bounding box, + # and any additional rows should be moved into `rest_df` before the morton-based sorting occurs. + assert extreme_pdf.shape[0] <= 4 + + # Mark sentinel rows with morton_code_2d = MORTON_CODE_EXTREME_VALUE_INDICATOR so the reader can identify them without + # relying on a hard-coded count or fixed row positions. + extreme_pdf["morton_code_2d"] = np.uint32(MORTON_CODE_EXTREME_VALUE_INDICATOR) + + # Exclude the extreme rows from the rest. + rest_ddf = ddf.loc[~is_extreme] + if "z" in ddf.columns: num_unique_z = ddf["z"].unique().shape[0].compute() if num_unique_z < 100: # Heuristic for interpreting the 3D data as 2.5D # Reference: https://github.com/scverse/spatialdata/issues/961 - sorted_ddf = ddf.sort_values(by=["z", "morton_code_2d"], ascending=True) + sorted_rest = rest_ddf.sort_values(by=["z", "morton_code_2d"], ascending=True) else: # TODO: include z as a dimension in the morton code in the 3D case? - - # For now, just return the data sorted by 2D code. - sorted_ddf = ddf.sort_values(by="morton_code_2d", ascending=True) + sorted_rest = rest_ddf.sort_values(by="morton_code_2d", ascending=True) else: - sorted_ddf = ddf.sort_values(by="morton_code_2d", ascending=True) - sdata.points[element] = sorted_ddf + sorted_rest = rest_ddf.sort_values(by="morton_code_2d", ascending=True) + + # Prepend the sentinel rows (morton_code_2d == MORTON_CODE_EXTREME_VALUE_INDICATOR) before the morton-sorted data. + sentinel_ddf = dd.from_pandas(extreme_pdf, npartitions=1) + result_ddf = dd.concat([sentinel_ddf, sorted_rest], ignore_index=True, interleave_partitions=True) + + sdata.points[element] = PointsModel.parse(result_ddf) # annotating_tables = get_element_annotators(sdata, element) @@ -175,15 +204,14 @@ def sdata_morton_query_rect_aux(sdata, element, orig_rect): sorted_ddf = sdata.points[element] - # TODO: fail if no morton_code_2d column - # TODO: fail if not sorted as expected - # TODO: fail if no bounding box metadata - - bounding_box = sorted_ddf.attrs["bounding_box"] - x_min = bounding_box["x_min"] - x_max = bounding_box["x_max"] - y_min = bounding_box["y_min"] - y_max = bounding_box["y_max"] + # Sentinel rows are identified by morton_code_2d == 0 and always precede the z-order data. + # There are at most 4 (one per axis extreme), fewer when a single point covers multiple extremes. + head = sorted_ddf.head(4) + sentinel_rows = head[head["morton_code_2d"] == MORTON_CODE_EXTREME_VALUE_INDICATOR] + x_min = float(sentinel_rows["x"].min()) + x_max = float(sentinel_rows["x"].max()) + y_min = float(sentinel_rows["y"].min()) + y_max = float(sentinel_rows["y"].max()) norm_rect = [ orig_coord_to_norm_coord(orig_rect[0], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max), @@ -210,13 +238,24 @@ def sdata_morton_query_rect(sdata, element, orig_rect): morton_intervals = sdata_morton_query_rect_aux(sdata, element, orig_rect) - # Get morton code column as a list of integers + # Get morton code column as a list of integers. morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist() + # Count leading sentinel rows (morton_code_2d == 0); stop at the first non-zero code. + sentinel_count = 0 + for code in morton_sorted[:4]: + if code == MORTON_CODE_EXTREME_VALUE_INDICATOR: + sentinel_count += 1 + else: + break + # Get a list of row ranges that match the morton intervals. # (This uses binary searches internally to find the matching row indices). # [ (row_start, row_end), ... ] - matching_row_ranges = zquery_rows(morton_sorted, morton_intervals, merge=True) + matching_row_ranges = zquery_rows(morton_sorted[sentinel_count:], morton_intervals, merge=True) + + # Offset ranges to account for the sentinel rows at the start of the table. + matching_row_ranges = [(i + sentinel_count, j + sentinel_count) for i, j in matching_row_ranges] return matching_row_ranges @@ -228,7 +267,14 @@ def sdata_morton_query_rect_debug(sdata, element, orig_rect): sorted_ddf = sdata.points[element] morton_intervals = sdata_morton_query_rect_aux(sdata, element, orig_rect) morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist() - matching_row_ranges, rows_checked = zquery_rows_aux(morton_sorted, morton_intervals, merge=True) + sentinel_count = 0 + for code in morton_sorted[:4]: + if code == MORTON_CODE_EXTREME_VALUE_INDICATOR: + sentinel_count += 1 + else: + break + matching_row_ranges, rows_checked = zquery_rows_aux(morton_sorted[sentinel_count:], morton_intervals, merge=True) + matching_row_ranges = [(i + sentinel_count, j + sentinel_count) for i, j in matching_row_ranges] return matching_row_ranges, rows_checked # -------------------------- @@ -470,28 +516,6 @@ def try_index(gene_name): return ddf -def sdata_points_write_bounding_box_attrs(sdata, element) -> dd.DataFrame: - ddf = sdata.points[element] - - [x_min, x_max, y_min, y_max] = [ddf["x"].min().compute(), ddf["x"].max().compute(), ddf["y"].min().compute(), ddf["y"].max().compute()] - bounding_box = { - "x_min": float(x_min), - "x_max": float(x_max), - "y_min": float(y_min), - "y_max": float(y_max), - } - - sdata_path = sdata.path - # TODO: error if no path - - # Insert the bounding box as metadata for the sdata.points[element] Points element dataframe. - z = zarr.open(sdata_path, mode='a') - group = z[f'points/{element}'] - group.attrs['bounding_box'] = bounding_box - - # TODO: does anything special need to be done to ensure this is saved to disk? - - def sdata_points_modify_row_group_size(sdata, element, row_group_size: int = 50_000): import pyarrow.parquet as pq diff --git a/src/vitessce/widget.py b/src/vitessce/widget.py index af7a8401..e0757c6d 100644 --- a/src/vitessce/widget.py +++ b/src/vitessce/widget.py @@ -7,6 +7,10 @@ import anywidget from traitlets import Unicode, Dict, List, Int, Bool +import asyncio +from zarr.abc.store import RangeByteRequest, SuffixByteRequest +from zarr.core.buffer.core import default_buffer_prototype + MAX_PORT_TRIES = 1000 DEFAULT_PORT = 8000 @@ -318,6 +322,45 @@ def get_uid_str(uid): }; } +// Fallback UUID for non-secure (http://) contexts where crypto.randomUUID is unavailable. +// Reference: https://stackoverflow.com/a/8809472 +function generateId() { + let d = new Date().getTime(), + d2 = ((typeof performance !== 'undefined') && performance.now && (performance.now() * 1000)) || 0; + return 'xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx'.replace(/[xy]/g, c => { + let r = Math.random() * 16; + if (d > 0) { + r = (d + r) % 16 | 0; + d = Math.floor(d / 16); + } else { + r = (d2 + r) % 16 | 0; + d2 = Math.floor(d2 / 16); + } + return (c == 'x' ? r : (r & 0x3 | 0x8)).toString(16); + }); +} + +// Custom invoke matching the anywidget-command protocol implemented on the Python side. +function invoke(model, name, msg, buffers, signal) { + const id = generateId(); + const abortSignal = signal ?? AbortSignal.timeout(30000); + return new Promise((resolve, reject) => { + if (abortSignal.aborted) { reject(abortSignal.reason); return; } + abortSignal.addEventListener("abort", () => { + model.off("msg:custom", handler); + reject(abortSignal.reason); + }); + function handler(responseMsg, responseBuffers) { + if (!responseMsg || responseMsg.id !== id) return; + model.off("msg:custom", handler); + resolve([responseMsg.response, responseBuffers]); + } + model.on("msg:custom", handler); + model.send({ id, kind: "anywidget-command", name, msg }, undefined, buffers ?? []); + }); +} + + async function render(view) { const cssUid = view.model.get('uid'); const jsDevMode = view.model.get('js_dev_mode'); @@ -382,9 +425,13 @@ def get_uid_str(uid): let batchId = 0; async function processBatch(prevPendingArr) { - const [dataArr, buffersArr] = await view.experimental.invoke("_zarr_get_multi", prevPendingArr.map(d => d.params), { - signal: AbortSignal.timeout(invokeTimeout), - }); + const [dataArr, buffersArr] = await invoke( + view.model, + "_zarr_get_multi", + prevPendingArr.map(d => d.params), + null, + AbortSignal.timeout(invokeTimeout), + ); prevPendingArr.forEach((prevPendingItem, i) => { const data = dataArr[i]; const bufferData = buffersArr[i]; @@ -428,9 +475,13 @@ def get_uid_str(uid): return enqueue([storeUrl, key]); } else { // Do not submit zarr gets in batches. Instead, submit individually. - const [data, buffers] = await view.experimental.invoke("_zarr_get", [storeUrl, key], { - signal: AbortSignal.timeout(invokeTimeout), - }); + const [data, buffers] = await invoke( + view.model, + "_zarr_get", + [storeUrl, key], + null, + AbortSignal.timeout(invokeTimeout), + ); if (!data.success) return undefined; if (ArrayBuffer.isView(buffers[0])) { @@ -444,9 +495,13 @@ def get_uid_str(uid): return enqueue([storeUrl, key, rangeQuery]); } else { // Do not submit zarr gets in batches. Instead, submit individually. - const [data, buffers] = await view.experimental.invoke("_zarr_get_range", [storeUrl, key, rangeQuery], { - signal: AbortSignal.timeout(invokeTimeout), - }); + const [data, buffers] = await invoke( + view.model, + "_zarr_get_range", + [storeUrl, key, rangeQuery], + null, + AbortSignal.timeout(invokeTimeout), + ); if (!data.success) return undefined; if (ArrayBuffer.isView(buffers[0])) { @@ -460,10 +515,13 @@ def get_uid_str(uid): ); function invokePluginCommand(commandName, commandParams, commandBuffers) { - return view.experimental.invoke("_plugin_command", [commandName, commandParams], { - signal: AbortSignal.timeout(invokeTimeout), - ...(commandBuffers ? { buffers: commandBuffers } : {}), - }); + return invoke( + view.model, + "_plugin_command", + [commandName, commandParams], + commandBuffers ?? null, + AbortSignal.timeout(invokeTimeout), + ); } for (const pluginEsm of pluginEsmArr) { @@ -738,7 +796,7 @@ class VitessceWidget(anywidget.AnyWidget): next_port = DEFAULT_PORT - js_package_version = Unicode('3.9.9').tag(sync=True) + js_package_version = Unicode('3.9.10').tag(sync=True) js_dev_mode = Bool(False).tag(sync=True) custom_js_url = Unicode('').tag(sync=True) plugin_esm = List(trait=Unicode(''), default_value=[]).tag(sync=True) @@ -751,7 +809,7 @@ class VitessceWidget(anywidget.AnyWidget): store_urls = List(trait=Unicode(''), default_value=[]).tag(sync=True) - def __init__(self, config, height=600, theme='auto', uid=None, port=None, proxy=False, js_package_version='3.9.9', js_dev_mode=False, custom_js_url='', plugins=None, remount_on_uid_change=True, prefer_local=True, invoke_timeout=300000, invoke_batched=True, page_mode=False, page_esm=None, prevent_scroll=True, server_host=None): + def __init__(self, config, height=600, theme='auto', uid=None, port=None, proxy=False, js_package_version='3.9.10', js_dev_mode=False, custom_js_url='', plugins=None, remount_on_uid_change=True, prefer_local=True, invoke_timeout=300000, invoke_batched=True, page_mode=False, page_esm=None, prevent_scroll=True, server_host=None): """ Construct a new Vitessce widget. Not intended to be instantiated directly; instead, use ``VitessceConfig.widget``. @@ -823,6 +881,9 @@ def handle_config_change(change): serve_routes(config, routes, use_port, server_host) + # Set up custom ipywidgets message handlers for invoking commands RPC-style. + self.on_msg(self._handle_msg) + def _get_coordination_value(self, coordination_type, coordination_scope): obj = self._config['coordinationSpace'][coordination_type] obj_scopes = list(obj.keys()) @@ -850,45 +911,61 @@ def close(self): self.config.stop_server(self.port) super().close() - @anywidget.experimental.command - def _zarr_get(self, params, buffers): + # @anywidget.experimental.command + async def _zarr_get(self, params, buffers): [store_url, key] = params store = self._stores[store_url] try: - buffers = [store[key.lstrip("/")]] + result = await store.get(key.lstrip("/"), prototype=default_buffer_prototype()) + if result is None: + buffers = [] + else: + buffers = [result.to_bytes()] except KeyError: buffers = [] + # TODO: catch other types of errors here? return {"success": len(buffers) == 1}, buffers - @anywidget.experimental.command - def _zarr_get_range(self, params, buffers): + # @anywidget.experimental.command + async def _zarr_get_range(self, params, buffers): [store_url, key, range_query] = params store = self._stores[store_url] try: - full_value = store[key.lstrip("/")] + range_param = None # Reference: https://github.com/manzt/zarrita.js/blob/f63a2521e2b46b22aa26af4146822e4d827dff83/packages/%40zarrita-storage/src/types.ts#L3 if "suffixLength" in range_query: suffix_length = range_query["suffixLength"] - buffers = [full_value[-suffix_length:]] + range_param = SuffixByteRequest(suffix=suffix_length) elif "offset" in range_query and "length" in range_query: offset = range_query["offset"] length = range_query["length"] - buffers = [full_value[offset:offset + length]] + range_param = RangeByteRequest(start=offset, end=(offset + length)) + else: + raise ValueError(f"Invalid range query: {range_query}. Must contain either 'suffixLength' or both 'offset' and 'length'.") + + result = await store.get(key, byte_range=range_param, prototype=default_buffer_prototype()) + if result is None: + buffers = [] + else: + buffers = [result.to_bytes()] except KeyError: buffers = [] + # TODO: catch other types of errors here? return {"success": len(buffers) == 1}, buffers - @anywidget.experimental.command - def _zarr_get_multi(self, params_arr, buffers): + # @anywidget.experimental.command + async def _zarr_get_multi(self, params_arr, buffers): # This variant of _zarr_get and _zarr_get_range supports batching. result_dicts = [] result_buffers = [] for params in params_arr: + result_dict, result_buffer_arr = {}, [] if len(params) == 2: - result_dict, result_buffer_arr = self._zarr_get(params, buffers) + result_dict, result_buffer_arr = await self._zarr_get(params, buffers) elif len(params) == 3: - result_dict, result_buffer_arr = self._zarr_get_range(params, buffers) - + result_dict, result_buffer_arr = await self._zarr_get_range(params, buffers) + else: + raise ValueError("Expected params to have len 2 or 3 in _zarr_get_multi") if result_dict["success"] and len(result_buffer_arr) == 1: result_dicts.append(result_dict) result_buffers.append(result_buffer_arr[0]) @@ -897,16 +974,57 @@ def _zarr_get_multi(self, params_arr, buffers): result_buffers.append(b'') return result_dicts, result_buffers - @anywidget.experimental.command + # @anywidget.experimental.command def _plugin_command(self, params, buffers): [command_name, command_params] = params command_func = self._plugin_commands[command_name] return command_func(command_params, buffers) + def _handle_msg(self, msg: dict) -> None: + content = msg.get("content", {}).get("data", {}).get("content", {}) + buffers = msg.get("buffers", []) + if content.get("kind") != "anywidget-command": + super()._handle_msg(msg) + return + try: + loop = asyncio.get_event_loop() + except RuntimeError: + return + if loop.is_running(): + loop.create_task(self._dispatch_command(content, buffers)) + else: + loop.run_until_complete(self._dispatch_command(content, buffers)) + + async def _dispatch_command(self, msg: dict, buffers: list[bytes]) -> None: + name = msg.get("name") + params = msg.get("msg") + msg_id = msg.get("id") + try: + if name == "_zarr_get": + response, result_buffers = await self._zarr_get(params, buffers) + elif name == "_zarr_get_range": + response, result_buffers = await self._zarr_get_range(params, buffers) + elif name == "_zarr_get_multi": + response, result_buffers = await self._zarr_get_multi(params, buffers) + elif name == "_plugin_command": + response, result_buffers = await self._plugin_command(params, buffers) + else: + return + except Exception as exc: # noqa: BLE001 + self.send( + {"id": msg_id, "kind": "anywidget-command-response", "response": {"error": repr(exc)}}, + [], + ) + return + self.send( + {"id": msg_id, "kind": "anywidget-command-response", "response": response}, + result_buffers, + ) + # Launch Vitessce using plain HTML representation (no ipywidgets) -def ipython_display(config, height=600, theme='auto', base_url=None, host_name=None, uid=None, port=None, proxy=False, js_package_version='3.9.9', js_dev_mode=False, custom_js_url='', plugins=None, remount_on_uid_change=True, page_mode=False, page_esm=None, server_host=None): +def ipython_display(config, height=600, theme='auto', base_url=None, host_name=None, uid=None, port=None, proxy=False, js_package_version='3.9.10', js_dev_mode=False, custom_js_url='', plugins=None, remount_on_uid_change=True, page_mode=False, page_esm=None, server_host=None): from IPython.display import display, HTML uid_str = "vitessce" + get_uid_str(uid) diff --git a/src/vitessce/widget_plugins/spatial_query.py b/src/vitessce/widget_plugins/spatial_query.py index 7a941baa..6bf88ed1 100644 --- a/src/vitessce/widget_plugins/spatial_query.py +++ b/src/vitessce/widget_plugins/spatial_query.py @@ -286,7 +286,7 @@ def _render_heatmap_base64(self, fp_tree, query_type, cell_type_of_interest): enrich = fp_tree.copy() enrich["frequency"] = enrich["n_center_motif"] / enrich["n_center"] enrich = enrich.sort_values(by="frequency", ascending=False) - enrich["motif_group"] = [f"motif_{i+1}" for i in range(len(enrich))] + enrich["motif_group"] = [f"motif_{i + 1}" for i in range(len(enrich))] expanded = enrich.explode("motifs") heatmap_data = expanded.pivot_table( index="motifs", columns="motif_group", values="frequency", aggfunc="first" @@ -303,7 +303,7 @@ def _render_heatmap_base64(self, fp_tree, query_type, cell_type_of_interest): else: fp = fp_tree.copy() fp = fp.sort_values(by="support", ascending=False).reset_index(drop=True) - fp["motif_group"] = [f"motif_{i+1}" for i in range(len(fp))] + fp["motif_group"] = [f"motif_{i + 1}" for i in range(len(fp))] expanded = fp.explode("itemsets") heatmap_data = expanded.pivot_table( index="itemsets", columns="motif_group", values="support", aggfunc="first" diff --git a/src/vitessce/wrappers.py b/src/vitessce/wrappers.py index 77547e95..b72d35b1 100644 --- a/src/vitessce/wrappers.py +++ b/src/vitessce/wrappers.py @@ -136,7 +136,7 @@ def try_getting_artifact_stores(self): # Reference: https://github.com/laminlabs/lamindb/blob/58ee954c9f363524e2c47b358727f0b921467078/lamindb/models/save.py#L167 local_path = artifact._local_filepath if local_path is not None and os.path.isdir(local_path): - store = zarr.DirectoryStore(local_path) + store = zarr.storage.LocalStore(local_path) artifact_stores[artifact_url] = store return artifact_stores @@ -205,7 +205,7 @@ def register_zarr_store(self, dataset_uid, obj_i, store_or_local_dir_path, local # Set up `store` and `local_dir_path` variables. if isinstance(store_or_local_dir_path, str): # TODO: use zarr.FSStore if fsspec is installed? - store = zarr.DirectoryStore(store_or_local_dir_path) + store = zarr.storage.LocalStore(store_or_local_dir_path) local_dir_path = store_or_local_dir_path else: # TODO: check that store_or_local_dir_path is a zarr.Store or StoreLike? @@ -1199,7 +1199,7 @@ def __init__(self, adata_path=None, adata_url=None, adata_store=None, adata_arti :param str adata_path: A path to an AnnData object written to a Zarr store containing single-cell experiment data. :param str adata_url: A remote url pointing to a zarr-backed AnnData store. - :param adata_store: A path to pass to zarr.DirectoryStore, or an existing store instance. + :param adata_store: A path to pass to zarr.storage.LocalStore, or an existing store instance. :type adata_store: str or zarr.Storage :param adata_artifact: A lamindb Artifact corresponding to the AnnData object. :type adata_artifact: lamindb.Artifact diff --git a/tests/test_ome_utils.py b/tests/test_ome_utils.py index 85020be9..ef1c008d 100644 --- a/tests/test_ome_utils.py +++ b/tests/test_ome_utils.py @@ -50,49 +50,50 @@ def setUp(self): def test_rgb_img_to_ome_zarr(self): img_arr = self.img_arr out_path = data_path / "rgb_out.ome.zarr" - rgb_img_to_ome_zarr(img_arr, out_path, img_name="Test", axes="cyx", chunks=(1, 3, 3), scaler=None) + rgb_img_to_ome_zarr(img_arr, out_path, img_name="Test", axes="cyx", chunks=(1, 3, 3), scale_factors=[]) z_root = zarr.open(out_path, mode="r") assert dict(z_root.attrs) == { - 'multiscales': [ - { - 'axes': [ - {'name': 'c', 'type': 'channel'}, - {'name': 'y', 'type': 'space'}, - {'name': 'x', 'type': 'space'} - ], - 'datasets': [ - { - 'coordinateTransformations': [ - {'scale': [1.0, 1.0, 1.0], 'type': 'scale'} - ], - 'path': '0' - } + 'ome': { + 'version': '0.5', + 'multiscales': [ + { + 'axes': [ + {'name': 'c', 'type': 'channel'}, + {'name': 'y', 'type': 'space'}, + {'name': 'x', 'type': 'space'} + ], + 'datasets': [ + { + 'coordinateTransformations': [ + {'scale': [1.0, 1.0, 1.0], 'type': 'scale'} + ], + 'path': 's0' + } + ], + 'name': '/', + } + ], + 'omero': { + 'channels': [ + {'color': 'FF0000', + 'label': 'R', + 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} + }, + {'color': '00FF00', + 'label': 'G', + 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} + }, + {'color': '0000FF', + 'label': 'B', + 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} + } ], - 'name': '/', - 'version': '0.4' + 'name': 'Test', + 'rdefs': { + "model": "color", + } } - ], - 'omero': { - 'channels': [ - {'color': 'FF0000', - 'label': 'R', - 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} - }, - {'color': '00FF00', - 'label': 'G', - 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} - }, - {'color': '0000FF', - 'label': 'B', - 'window': {'end': 255, 'max': 255, 'min': 0, 'start': 0} - } - ], - 'name': 'Test', - 'rdefs': { - "model": "color", - }, - 'version': '0.3' } } diff --git a/tests/test_sdata_points_zorder.py b/tests/test_sdata_points_zorder.py index 783569de..59b00406 100644 --- a/tests/test_sdata_points_zorder.py +++ b/tests/test_sdata_points_zorder.py @@ -9,6 +9,7 @@ sdata_morton_query_rect_debug, row_ranges_to_row_indices, orig_coord_to_norm_coord, + MORTON_CODE_EXTREME_VALUE_INDICATOR ) @@ -30,11 +31,22 @@ def test_zorder_sorting(sdata_with_points): sdata_morton_sort_points(sdata, "transcripts") - # Check that the morton codes are sorted sorted_ddf = sdata.points["transcripts"] morton_sorted = sorted_ddf["morton_code_2d"].compute().values.tolist() - assert _is_sorted(morton_sorted) + # Count leading sentinel rows, identified by morton_code_2d == MORTON_CODE_EXTREME_VALUE_INDICATOR + sentinel_count = 0 + for code in morton_sorted[:4]: + if code == MORTON_CODE_EXTREME_VALUE_INDICATOR: + sentinel_count += 1 + else: + break + + assert sentinel_count >= 2 # at least the x_min and x_max extremes are distinct + assert all(code == MORTON_CODE_EXTREME_VALUE_INDICATOR for code in morton_sorted[:sentinel_count]) + + # Z-order sorted data begins after the sentinel rows + assert _is_sorted(morton_sorted[sentinel_count:]) def test_zorder_query(sdata_with_points): @@ -55,6 +67,13 @@ def test_zorder_query(sdata_with_points): assert df.shape[0] == 213191 + # Read bounding box from sentinel rows (morton_code_2d == 0) at the start of the table + sentinel_rows = df.iloc[:4][df.iloc[:4]["morton_code_2d"] == MORTON_CODE_EXTREME_VALUE_INDICATOR] + x_min = float(sentinel_rows["x"].min()) + x_max = float(sentinel_rows["x"].max()) + y_min = float(sentinel_rows["y"].min()) + y_max = float(sentinel_rows["y"].max()) + # Do the same query the "dumb" way, by checking all points # We need an epsilon for the "dumb" query since the normalization @@ -86,12 +105,6 @@ def test_zorder_query(sdata_with_points): # Do a second check, this time against x_uint/y_uint (the normalized coordinates) # TODO: does this ensure that estimated == exact? - - bounding_box = ddf.attrs["bounding_box"] - x_min = bounding_box["x_min"] - x_max = bounding_box["x_max"] - y_min = bounding_box["y_min"] - y_max = bounding_box["y_max"] norm_rect = [ orig_coord_to_norm_coord(orig_rect[0], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max), orig_coord_to_norm_coord(orig_rect[1], orig_x_min=x_min, orig_x_max=x_max, orig_y_min=y_min, orig_y_max=y_max)