From 4c0be0004568dedfc828c09f5aa30a9db591b5ba Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 10 Jun 2025 11:55:42 -0700 Subject: [PATCH 1/2] Minor touchups to formatting and text, fix links. --- docs/site/API-key.md | 1 + docs/site/tutorial.md | 34 ++++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/docs/site/API-key.md b/docs/site/API-key.md index 01d1940..c3e4421 100644 --- a/docs/site/API-key.md +++ b/docs/site/API-key.md @@ -21,6 +21,7 @@ This line can be added to your shell configuration (e.g. ``.bashrc``, ``.zshrc`` ``.bash_profile``, etc.) to automatically grant access to DeepCell models/data upon login. +(download_models)= Models ------ diff --git a/docs/site/tutorial.md b/docs/site/tutorial.md index c21a202..514d652 100644 --- a/docs/site/tutorial.md +++ b/docs/site/tutorial.md @@ -24,7 +24,7 @@ Each of these components will be covered in further detail in this tutorial. ## Example datasets -This tutorial will make use of the spatial proteomic data provided by the +This tutorial will make use of the spatial proteomic data available on the [HuBMAP data portal][hubmap-data-portal]. Users are encouraged to explore the portal for data of interest. For convenience, a subset of the publicly-available spatial proteomic data @@ -100,6 +100,8 @@ As noted above, the cell-type prediction pipeline requires the multiplexed image the channel-name mapping, and a segmentation mask for the image. The multiplexed image is stored in the `image` array for each dataset, and the channel mapping is stored under the key `"channels"` in the image metadata. +Note that these two inputs are derived directly from the corresponding datasets +on the HuBMAP data portal. ```{code-cell} ds = z[k] @@ -135,9 +137,11 @@ The final input is a segmentation mask. to better integrate into existing spatial-omics workflows. 
However, for convenience, several pre-computed segmentation masks are stored in the data archive: one computed by [Mesmer](https://www.nature.com/articles/s41587-021-01094-0) -and a second by [CellSAM](https://www.biorxiv.org/content/10.1101/2023.11.17.567630v4). +(available at `ds["segmentations/torch_mesmer"]`) +and a second by [CellSAM](https://www.biorxiv.org/content/10.1101/2023.11.17.567630v4) +(available at `ds["segmentations/cellsam"]`). -For illustration purposes however, we will demonstrate how to use one of these +In this tutorial, we will demonstrate how to use one of these models to construct a full cell-type inference pipeline. ### Cell segmentation with `cellSAM` @@ -157,9 +161,12 @@ For convenience, channels corresponding to nuclear markers and a whole-cell mark are stored in the dataset metadata. ```{note} -While the nuclear channel is unambiguous, the whole-cell channel selection is -arbitrary. Users are encourage to try different channels or combinations of +Nuclear markers are typically unambiguous. The whole-cell channel selection, +on the other hand, is less well-defined. +Users are encouraged to try different channels or combinations of channels for improved whole-cell segmentation results. +The `membrane_channel` selection in the metadata is arbitrary and provided +for convenience. ``` ```{code-cell} @@ -183,6 +190,8 @@ seg_img[..., 1:] = im Finally, run the segmentation pipeline: ```{code-cell} +:tags: [hide-output] + mask = cellsam_pipeline( seg_img, block_size=512, @@ -190,7 +199,9 @@ mask = cellsam_pipeline( use_wsi=True, gauge_cell_size=False, ) +``` +```{code-cell} # Sanity check: the segmentation mask should have the same W, H dimensions as # the input image mask.shape == img.shape[1:] @@ -260,7 +271,7 @@ import deepcell_types ``` To run the inference pipeline, you will need to download a trained model. -See {ref}`models` for details. +See {ref}`download_models` for details. 
```{code-cell} # Model & system-specific configuration @@ -298,11 +309,14 @@ mapping explicit: ```{code-cell} idx_to_pred = dict(enumerate(cell_types, start=1)) -df = pd.DataFrame.from_dict( # For nice table rendering +pd.DataFrame.from_dict( # For nice table rendering idx_to_pred, orient="index", columns=["Cell type"] ) ``` +Depending on the subsequent analysis you wish to perform, it may be convenient +to group the cells by their predicted cell-type: + ```{code-cell} from collections import defaultdict @@ -316,8 +330,8 @@ Here's the distribution of predicted cell types for this tissue: ```{code-cell} from pprint import pprint -num_cells = np.max(mask) -print(f"Total number of cells: {num_cells}") + +print(f"Total number of cells: {(num_cells := np.max(mask))}") pprint( { @@ -333,7 +347,7 @@ pprint( There are many ways to visualize the cell-type prediction data, each with their own advantages and disadvantages. One way is to add an independent layer for each predicted cell type. -The advantage of this approach is that individual layers can be toggled off to focus +The advantage of this approach is that individual layers can be toggled to focus on a particular cell type during interactive visualization. ```{code-cell} From 4956947fad05b0f242b86731fcabb65739faf961 Mon Sep 17 00:00:00 2001 From: Ross Barnowski Date: Tue, 10 Jun 2025 15:19:51 -0700 Subject: [PATCH 2/2] API: rename tissue_exclude -> tissue in predict. 
--- deepcell_types/predict.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/deepcell_types/predict.py b/deepcell_types/predict.py index b01bd51..dfa48e5 100644 --- a/deepcell_types/predict.py +++ b/deepcell_types/predict.py @@ -33,7 +33,7 @@ def get_result(self): return cell_type_str_pred, top_probs, cell_index -def predict(raw, mask, channel_names, mpp, model_name, device_num, batch_size=256, num_workers=24, tissue_exclude=None): +def predict(raw, mask, channel_names, mpp, model_name, device_num, batch_size=256, num_workers=24, tissue=None): device = torch.device(device_num) embedding_model_name = "deepseek-r1-70b-llama-distill-q4_K_M" @@ -95,8 +95,8 @@ def predict(raw, mask, channel_names, mpp, model_name, device_num, batch_size=25 with torch.no_grad(): for sample, ch_idx, attn_mask, cell_index in tqdm(data_loader, desc=f"(inference)"): ct_exclude = None - if tissue_exclude: - ct_exclude = [[i for i in range(len(ct_embeddings)) if i not in [dct_config.ct2idx[i] for i in tct[tissue_exclude]]] for _ in range(len(sample))] + if tissue: + ct_exclude = [[i for i in range(len(ct_embeddings)) if i not in [dct_config.ct2idx[i] for i in tct[tissue]]] for _ in range(len(sample))] _, _, _, _, probs, _ = model( sample.to(device), ch_idx.to(device),