From 773766002bee23d7e248246c4a40e4480583ee39 Mon Sep 17 00:00:00 2001 From: JoneSu1 Date: Wed, 6 May 2026 16:15:49 +0200 Subject: [PATCH] Add AlphaGenome adapter and multi-track support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add AlphaGenomeAdapter (src/deepISA/modeling/alpha_genome_adapter.py): single API call for multi-track/multi-biosample configs, sequence cache, probe-based n_tracks, backward-compatible single-track YAML format - Add configs/ag_config_template.yaml with single- and multi-track examples - Add notebooks/ag_biosample_reference.csv (714 biosamples × 10 assay types) - Update tutorial notebook: local paths, AlphaGenome cells, multi-track docs - Add tests: test_adapter.py, test_attr_filter_pipeline.py - Update pyproject.toml and modeling __init__.py --- configs/ag_config_template.yaml | 42 ++ notebooks/ag_biosample_reference.csv | 715 +++++++++++++++++++ notebooks/deepISA_tutorial.ipynb | 174 ++++- pyproject.toml | 5 +- src/deepISA/modeling/__init__.py | 5 + src/deepISA/modeling/alpha_genome_adapter.py | 217 ++++++ tests/test_adapter.py | 284 ++++++++ tests/test_attr_filter_pipeline.py | 100 +++ 8 files changed, 1509 insertions(+), 33 deletions(-) create mode 100644 configs/ag_config_template.yaml create mode 100644 notebooks/ag_biosample_reference.csv create mode 100644 src/deepISA/modeling/alpha_genome_adapter.py create mode 100644 tests/test_adapter.py create mode 100644 tests/test_attr_filter_pipeline.py diff --git a/configs/ag_config_template.yaml b/configs/ag_config_template.yaml new file mode 100644 index 0000000..144f664 --- /dev/null +++ b/configs/ag_config_template.yaml @@ -0,0 +1,42 @@ +# AlphaGenome backend config for deepISA +# ───────────────────────────────────────────────────────────────────────────── +# Usage: +# from deepISA.modeling.alpha_genome_adapter import AlphaGenomeAdapter +# adapter = AlphaGenomeAdapter("configs/ag_config_template.yaml") +# +# Browse available biosamples 
and assay types: +# notebooks/ag_biosample_reference.csv (714 biosamples × 10 assay types) +# +# ───────────────────────────────────────────────────────────────────────────── +# Option A — Single track (most common, backward-compatible format) +# ───────────────────────────────────────────────────────────────────────────── +api_key: YOUR_API_KEY_HERE # AlphaGenome API key +output_type: DNASE # DNASE | ATAC | CAGE | RNA_SEQ | CHIP_TF | CHIP_HISTONE | PROCAP +biosample_name: GM12878 # exact string from ag_biosample_reference.csv + +# Optional — defaults shown: +context_len: 16384 # AlphaGenome input length (do not change) +seq_len: 600 # deepISA region length (match your training) +aggregation: sum # how to aggregate positions within the seq_len window (600 bp here) + + +# ───────────────────────────────────────────────────────────────────────────── +# Option B — Multi-track (combine assays / cell lines; still ONE API call) +# ───────────────────────────────────────────────────────────────────────────── +# Uncomment below and remove / comment out the single-track keys above. +# +# api_key: YOUR_API_KEY_HERE +# context_len: 16384 +# seq_len: 600 +# aggregation: sum +# tracks: +# - output_type: DNASE +# biosample_name: GM12878 # col 0 in output → isa_t0 +# - output_type: CAGE +# biosample_name: GM12878 # cols 1-2 → isa_t1, isa_t2 +# - output_type: ATAC +# biosample_name: K562 # col 3 → isa_t3 +# +# n_tracks = total number of tracks summed over all (output_type, biosample_name) pairs (1 + 2 + 1 = 4 in this example). +# Use adapter.n_tracks to get the value at runtime. +# Use calc_coop_score(..., track_idx=i) to analyse each track separately. 
diff --git a/notebooks/ag_biosample_reference.csv b/notebooks/ag_biosample_reference.csv new file mode 100644 index 0000000..ae8b89d --- /dev/null +++ b/notebooks/ag_biosample_reference.csv @@ -0,0 +1,715 @@ +biosample_name,CHIP_TF,CHIP_HISTONE,SPLICE_SITE_USAGE,RNA_SEQ,CAGE,SPLICE_JUNCTIONS,DNASE,ATAC,CONTACT_MAPS,PROCAP +22Rv1,1,1,0,0,0,0,0,0,0,0 +A172,0,0,2,2,2,1,1,0,0,0 +A375,0,0,2,2,0,1,0,0,0,0 +A549,33,4,2,2,2,1,1,1,0,0 +A673,1,3,2,2,0,1,1,0,0,2 +ACC112,0,7,0,0,0,0,0,0,0,0 +ACHN,0,0,0,0,0,0,1,0,0,0 +AG04449,1,1,0,0,0,0,1,0,0,0 +AG04450,1,4,2,2,0,1,1,0,0,0 +AG08395,0,0,0,0,0,0,1,0,0,0 +AG08396,0,0,0,0,0,0,1,0,0,0 +AG09309,1,1,0,0,0,0,1,0,0,0 +AG09319,1,1,0,0,0,0,1,0,0,0 +AG10803,1,1,0,0,0,0,1,0,0,0 +AG20443,0,0,0,0,0,0,1,0,0,0 +Ammon's horn,0,0,2,1,2,1,1,0,0,0 +B cell,1,10,4,4,0,2,1,1,0,0 +BE2C,1,4,2,1,0,1,1,0,0,0 +BJ,1,3,2,2,0,1,1,0,0,0 +BLaER1,1,9,2,2,0,1,0,0,0,0 +C1 segment of cervical spinal cord,0,0,2,1,0,1,0,0,0,0 +C4-2B,2,1,0,0,0,0,0,0,0,0 +CD14-positive monocyte,1,11,2,3,0,1,1,0,0,0 +"CD14-positive, CD16-negative classical monocyte",0,0,0,0,2,0,0,0,0,0 +"CD14-positive, CD16-positive monocyte",0,0,0,0,4,0,0,0,0,0 +"CD4-positive, CD25-positive, alpha-beta regulatory T cell",0,6,2,3,0,1,1,1,0,0 +"CD4-positive, alpha-beta T cell",1,6,4,3,0,2,1,1,0,0 +"CD4-positive, alpha-beta memory T cell",0,4,4,3,0,2,1,0,0,0 +"CD8-positive, alpha-beta T cell",1,7,4,3,0,2,1,1,0,0 +"CD8-positive, alpha-beta memory T cell",0,4,4,3,0,2,1,0,0,0 +CMK,0,0,0,0,0,0,1,0,0,0 +COLO829,0,0,0,0,0,0,1,0,0,0 +Caco-2,1,2,2,2,2,1,1,0,0,2 +Caki2,0,0,2,2,0,1,1,0,0,0 +Calu3,1,0,2,2,0,1,1,0,0,2 +CyT49,0,0,0,0,0,0,0,0,4,0 +D721Med,0,0,0,0,0,0,1,0,0,0 +DND-41,1,9,0,0,0,0,1,1,0,0 +DOHH2,1,9,0,0,0,0,0,0,0,0 +DU 145,0,0,0,0,2,0,0,0,0,0 +Daoy,0,0,2,2,0,1,1,0,0,0 +EH,0,0,0,0,0,0,1,0,0,0 +EL,0,0,0,0,0,0,1,0,0,0 +ELF-1,0,0,0,0,0,0,1,0,0,0 +ELR,0,0,0,0,0,0,1,0,0,0 +ES-I3,0,3,0,0,0,0,0,0,0,0 +G401,0,0,2,2,2,1,1,0,0,0 +GM03348,0,0,0,0,0,0,1,0,0,0 +GM04503,0,0,0,0,0,0,1,0,0,0 +GM04504,0,0,0,0,0,0,1,0,0,0 
+GM06990,1,2,0,0,0,0,1,0,0,0 +GM08714,0,1,0,0,0,0,0,0,0,0 +GM10847,1,0,0,0,0,0,0,0,0,0 +GM12864,1,1,0,0,0,0,1,0,0,0 +GM12865,0,1,0,0,0,0,1,0,0,0 +GM12872,1,0,0,0,0,0,0,0,0,0 +GM12873,1,0,0,0,0,0,0,0,0,0 +GM12874,1,0,0,0,0,0,0,0,0,0 +GM12875,1,1,0,0,0,0,0,0,0,0 +GM12878,101,8,4,5,2,2,1,1,1,0 +GM12891,6,0,2,1,0,1,1,0,0,0 +GM12892,4,0,2,1,0,1,0,0,0,0 +GM13976,0,0,0,0,0,0,1,0,0,0 +GM15510,1,0,0,0,0,0,0,0,0,0 +GM18498,0,0,0,0,0,0,0,1,0,0 +GM18499,0,0,0,0,0,0,0,1,0,0 +GM18505,0,0,0,0,0,0,0,1,0,0 +GM18508,0,0,0,0,0,0,0,1,0,0 +GM18511,0,0,0,0,0,0,0,1,0,0 +GM18517,0,0,0,0,0,0,0,1,0,0 +GM18519,0,0,0,0,0,0,0,1,0,0 +GM18520,0,0,0,0,0,0,0,1,0,0 +GM18526,1,0,0,0,0,0,0,0,0,0 +GM18858,0,0,0,0,0,0,0,1,0,0 +GM18861,0,0,0,0,0,0,0,1,0,0 +GM18867,0,0,0,0,0,0,0,1,0,0 +GM18868,0,0,0,0,0,0,0,1,0,0 +GM18870,0,0,0,0,0,0,0,1,0,0 +GM18873,0,0,0,0,0,0,0,1,0,0 +GM18907,0,0,0,0,0,0,0,1,0,0 +GM18951,1,0,0,0,0,0,0,0,0,0 +GM19023,0,0,0,0,0,0,0,1,0,0 +GM19025,0,0,0,0,0,0,0,1,0,0 +GM19035,0,0,0,0,0,0,0,1,0,0 +GM19043,0,0,0,0,0,0,0,1,0,0 +GM19099,1,0,0,0,0,0,0,0,0,0 +GM19193,1,0,0,0,0,0,0,0,0,0 +GM19238,0,0,0,0,0,0,1,0,0,0 +GM19239,0,0,0,0,0,0,1,0,0,0 +GM19240,0,0,0,0,0,0,1,0,0,0 +GM19324,0,0,0,0,0,0,0,1,0,0 +GM19328,0,0,0,0,0,0,0,1,0,0 +GM19351,0,0,0,0,0,0,0,1,0,0 +GM19372,0,0,0,0,0,0,0,1,0,0 +GM19395,0,0,0,0,0,0,0,1,0,0 +GM19397,0,0,0,0,0,0,0,1,0,0 +GM19438,0,0,0,0,0,0,0,1,0,0 +GM19452,0,0,0,0,0,0,0,1,0,0 +GM19455,0,0,0,0,0,0,0,1,0,0 +GM19463,0,0,0,0,0,0,0,1,0,0 +GM19467,0,0,0,0,0,0,0,1,0,0 +GM20000,0,0,0,0,0,0,1,0,0,0 +GM21360,0,0,0,0,0,0,0,1,0,0 +GM21367,0,0,0,0,0,0,0,1,0,0 +GM21381,0,0,0,0,0,0,0,1,0,0 +GM21390,0,0,0,0,0,0,0,1,0,0 +GM21423,0,0,0,0,0,0,0,1,0,0 +GM21447,0,0,0,0,0,0,0,1,0,0 +GM21515,0,0,0,0,0,0,0,1,0,0 +GM21526,0,0,0,0,0,0,0,1,0,0 +GM21528,0,0,0,0,0,0,0,1,0,0 +GM21529,0,0,0,0,0,0,0,1,0,0 +GM21576,0,0,0,0,0,0,0,1,0,0 +GM21619,0,0,0,0,0,0,0,1,0,0 +GM21717,0,0,0,0,0,0,0,1,0,0 +GM21723,0,0,0,0,0,0,0,1,0,0 +GM21737,0,0,0,0,0,0,0,1,0,0 +GM21786,0,0,0,0,0,0,0,1,0,0 
+GM21825,0,0,0,0,0,0,0,1,0,0 +GM23248,1,10,4,3,0,2,1,0,0,0 +GM23338,10,9,4,3,0,2,1,1,0,0 +GM25256,0,0,0,0,0,0,0,0,2,0 +H1,39,14,4,5,0,2,1,0,0,0 +H1-hESC,0,0,0,0,0,0,0,0,6,0 +H4,0,0,2,2,0,1,1,0,0,0 +H54,2,0,0,0,0,0,0,0,0,0 +H7,0,3,2,2,0,1,1,0,0,0 +H9,1,6,2,2,0,1,1,0,6,0 +HAP-1,0,4,0,0,0,0,1,0,0,0 +HCEC 1CT,0,0,0,0,0,0,1,0,0,0 +HCT116,25,8,4,3,0,2,1,1,2,0 +HEK293,106,3,0,0,2,0,0,0,0,0 +HEK293T,5,0,0,0,0,0,0,0,0,0 +HFF-Myc,0,1,0,0,0,0,1,0,0,0 +HFFc6,1,0,2,2,0,1,1,0,2,0 +HG02571,0,0,0,0,0,0,0,1,0,0 +HG02588,0,0,0,0,0,0,0,1,0,0 +HG02610,0,0,0,0,0,0,0,1,0,0 +HG02623,0,0,0,0,0,0,0,1,0,0 +HG02678,0,0,0,0,0,0,0,1,0,0 +HG02759,0,0,0,0,0,0,0,1,0,0 +HG02763,0,0,0,0,0,0,0,1,0,0 +HG02840,0,0,0,0,0,0,0,1,0,0 +HG02852,0,0,0,0,0,0,0,1,0,0 +HG02870,0,0,0,0,0,0,0,1,0,0 +HG02884,0,0,0,0,0,0,0,1,0,0 +HG02938,0,0,0,0,0,0,0,1,0,0 +HG02943,0,0,0,0,0,0,0,1,0,0 +HG02970,0,0,0,0,0,0,0,1,0,0 +HG02973,0,0,0,0,0,0,0,1,0,0 +HG02981,0,0,0,0,0,0,0,1,0,0 +HG03025,0,0,0,0,0,0,0,1,0,0 +HG03039,0,0,0,0,0,0,0,1,0,0 +HG03045,0,0,0,0,0,0,0,1,0,0 +HG03060,0,0,0,0,0,0,0,1,0,0 +HG03064,0,0,0,0,0,0,0,1,0,0 +HG03066,0,0,0,0,0,0,0,1,0,0 +HG03095,0,0,0,0,0,0,0,1,0,0 +HG03097,0,0,0,0,0,0,0,1,0,0 +HG03103,0,0,0,0,0,0,0,1,0,0 +HG03108,0,0,0,0,0,0,0,1,0,0 +HG03135,0,0,0,0,0,0,0,1,0,0 +HG03139,0,0,0,0,0,0,0,1,0,0 +HG03159,0,0,0,0,0,0,0,1,0,0 +HG03175,0,0,0,0,0,0,0,1,0,0 +HG03196,0,0,0,0,0,0,0,1,0,0 +HG03280,0,0,0,0,0,0,0,1,0,0 +HG03342,0,0,0,0,0,0,0,1,0,0 +HG03378,0,0,0,0,0,0,0,1,0,0 +HG03432,0,0,0,0,0,0,0,1,0,0 +HG03439,0,0,0,0,0,0,0,1,0,0 +HG03442,0,0,0,0,0,0,0,1,0,0 +HG03457,0,0,0,0,0,0,0,1,0,0 +HG03460,0,0,0,0,0,0,0,1,0,0 +HG03469,0,0,0,0,0,0,0,1,0,0 +HG03520,0,0,0,0,0,0,0,1,0,0 +HG03521,0,0,0,0,0,0,0,1,0,0 +HG03565,0,0,0,0,0,0,0,1,0,0 +HG03571,0,0,0,0,0,0,0,1,0,0 +HK-2,0,0,0,0,0,0,1,0,0,0 +HL-60,4,2,0,0,2,0,1,0,0,0 +HS-27A,0,0,0,0,0,0,1,0,0,0 +HS-5,0,0,0,0,0,0,1,0,0,0 +HT-29,0,0,2,2,0,1,1,0,0,0 +HT1080,0,0,4,4,2,2,1,0,0,0 +HTR-8/SVneo,0,0,0,0,0,0,1,0,0,0 +HUES48,0,7,0,0,0,0,0,0,0,0 +HUES6,0,7,0,0,0,0,0,0,0,0 
+HUES64,0,7,2,2,0,1,0,0,0,0 +HeLa-S3,40,9,2,3,2,1,1,0,1,0 +HepG2,539,10,4,5,2,2,1,1,1,0 +HuH-7,0,0,0,0,0,0,1,0,0,0 +HuH-7.5,0,0,0,0,0,0,1,0,0,0 +IMR-90,12,26,4,4,0,2,1,1,1,0 +IgD-negative memory B cell,0,0,2,1,0,1,0,0,0,0 +Ishikawa,16,0,0,0,0,0,0,0,0,0 +Jurkat,0,0,0,0,2,0,0,0,0,0 +"Jurkat, Clone E6-1",0,1,2,1,0,1,1,0,0,0 +K562,306,10,4,5,2,2,1,1,0,2 +KBM-7,0,0,0,0,0,0,1,0,1,0 +KMS-11,1,6,0,0,0,0,0,0,0,0 +KOPT-K1,0,3,0,0,0,0,0,0,0,0 +Karpas-422,0,9,2,2,0,1,1,0,0,0 +L1-S8,0,0,0,0,0,0,1,0,0,0 +L1-S8R,0,0,0,0,0,0,1,0,0,0 +LHCN-M2,0,0,4,3,0,2,1,0,0,0 +LNCAP,1,0,0,0,0,0,0,0,0,0 +LNCaP clone FGC,1,1,0,0,0,0,1,0,0,0 +Langerhans cell,0,0,0,0,2,0,0,0,0,0 +LoVo,0,0,0,0,0,0,1,0,0,0 +Loucy,1,7,0,0,0,0,0,0,0,0 +M059J,0,0,2,2,0,1,1,0,0,0 +MCF 10A,1,0,2,2,0,1,1,0,0,2 +MCF-7,71,8,4,5,0,2,1,1,0,0 +MG63,0,2,2,2,0,1,1,0,0,0 +MM.1S,1,9,0,0,0,0,1,0,0,0 +NAMALWA,0,0,0,0,0,0,1,0,0,0 +NB4,4,1,0,0,0,0,1,0,0,0 +NCI-H226,0,0,0,0,2,0,1,0,0,0 +NCI-H460,0,0,4,4,2,2,1,0,0,0 +NCI-H929,0,8,0,0,0,0,1,1,0,0 +NT2/D1,1,6,0,0,0,0,1,0,0,0 +OCI-LY1,2,7,0,0,0,0,0,0,0,0 +OCI-LY3,1,10,0,0,0,0,0,0,0,0 +OCI-LY7,1,7,2,2,0,1,1,0,0,0 +PC-3,1,9,2,2,2,1,1,1,0,0 +PC-9,1,12,2,2,0,1,1,0,0,0 +PFSK-1,4,0,2,1,0,1,0,0,0,0 +Panc1,5,4,4,3,0,2,1,1,0,0 +Peyer's patch,2,5,4,3,0,2,1,0,0,0 +Purkinje cell,0,0,2,1,0,1,0,0,0,0 +RCC,0,0,0,0,0,0,1,0,0,0 +RCC 7860,0,0,0,0,0,0,1,0,0,0 +RKO,0,0,0,0,0,0,1,0,0,0 +RPMI7951,0,0,2,2,0,1,1,0,0,0 +RPMI8226,0,0,0,0,0,0,1,0,0,0 +RWPE1,1,1,0,0,0,0,1,0,0,0 +RWPE2,1,3,0,0,0,0,1,1,0,0 +Raji,1,0,0,0,2,0,0,0,0,0 +Right ventricle myocardium inferior,0,0,2,2,0,1,1,1,0,0 +Right ventricle myocardium superior,0,0,2,2,0,1,1,1,0,0 +SH-SY5Y,2,0,0,0,0,0,0,0,0,0 +SJCRH30,0,4,2,2,0,1,1,0,0,0 +SJSA1,0,4,2,2,0,1,1,0,0,0 +SK-MEL-5,0,0,4,4,0,2,1,0,0,0 +SK-N-DZ,0,0,2,2,0,1,1,0,0,0 +SK-N-MC,1,5,0,0,2,0,1,0,0,0 +SK-N-SH,31,10,2,3,0,1,1,0,0,0 +SU-DHL-6,0,6,0,0,0,0,0,0,0,0 +SW480,0,0,0,0,0,0,1,0,0,0 +Schwann cell,0,0,0,0,2,0,0,0,0,0 +Sertoli cell,0,0,0,0,2,0,0,0,0,0 +T follicular helper cell,0,0,0,0,0,0,1,0,0,0 
+T-cell,0,6,4,4,0,2,1,1,0,0 +T-helper 1 cell,0,0,0,0,0,0,1,0,0,0 +T-helper 17 cell,0,0,0,2,0,0,1,1,0,0 +T-helper 2 cell,0,0,0,0,0,0,1,0,0,0 +T-helper 22 cell,0,0,0,0,0,0,1,0,0,0 +T-helper 9 cell,0,0,0,0,0,0,1,0,0,0 +T47D,1,0,0,0,0,0,1,0,0,0 +THP-1,0,0,0,0,2,0,0,0,0,0 +U-87 MG,0,0,2,1,0,1,0,0,0,0 +UCSF-4,0,5,0,0,0,0,0,0,0,0 +VCaP,1,1,0,0,0,0,0,0,0,0 +WERI-Rb-1,1,4,0,0,0,0,1,0,0,0 +WI38,1,1,0,0,0,0,0,0,0,0 +WTC11,34,0,2,3,0,1,1,1,0,0 +acinar cell of salivary gland,0,0,0,0,2,0,0,0,0,0 +acinar cell of sebaceous gland,0,0,0,0,2,0,0,0,0,0 +adipocyte,0,6,0,0,0,0,1,0,0,0 +adipocyte of breast,0,0,0,0,2,0,0,0,0,0 +adipocyte of omentum tissue,0,0,0,0,2,0,0,0,0,0 +adipose tissue,0,0,2,2,2,1,0,0,0,0 +adrenal gland,2,6,6,6,2,3,1,1,0,0 +adult organism,0,0,0,0,2,0,0,0,0,0 +airway epithelial cell,0,0,2,2,0,1,0,0,0,0 +amnion,0,3,2,2,2,1,0,0,0,0 +amnion mesenchymal stem cell,0,0,0,0,2,0,0,0,0,0 +amniotic epithelial cell,0,0,0,0,2,0,1,0,0,0 +amygdala,0,0,2,1,2,1,0,0,0,0 +angular gyrus,0,5,0,0,0,0,0,0,0,0 +annulus pulposus cell,0,0,0,0,2,0,0,0,0,0 +anterior cingulate cortex,0,0,2,1,0,1,0,0,0,0 +anterior lingual gland,0,0,2,1,0,1,0,0,0,0 +aorta,0,3,4,4,2,2,1,0,0,0 +aortic endothelial cell,0,0,0,0,2,0,0,0,0,0 +aortic smooth muscle cell,0,0,2,2,2,1,0,0,0,0 +artery,0,0,0,0,2,0,0,0,0,0 +articular chondrocyte of knee joint,0,0,2,2,0,1,0,0,0,0 +ascending aorta,1,5,4,3,0,2,1,0,0,0 +astrocyte,2,10,2,2,0,1,1,0,0,0 +astrocyte of the cerebellum,0,1,0,0,2,0,1,0,0,0 +astrocyte of the cerebral cortex,0,0,0,0,2,0,0,0,0,0 +astrocyte of the hippocampus,0,0,0,0,0,0,1,0,0,0 +astrocyte of the spinal cord,1,1,0,0,0,0,1,0,0,0 +basophil,0,0,0,0,4,0,0,0,0,0 +bile duct,0,0,0,0,0,0,0,1,0,0 +bladder microvascular endothelial cell,0,0,2,2,0,1,0,0,0,0 +blood,0,0,0,0,2,0,0,0,0,0 +body of pancreas,2,3,4,3,0,2,1,1,0,0 +bone marrow,0,0,0,0,2,0,0,0,0,0 +bone marrow cell,0,0,0,0,2,0,0,0,0,0 +brain,1,5,2,2,2,1,1,0,0,0 +brain microvascular endothelial cell,1,3,0,0,0,0,1,0,0,0 +brain pericyte,0,0,0,0,0,0,1,0,0,0 
+breast,0,0,0,0,2,0,0,0,0,0 +breast epithelium,3,5,4,3,0,2,1,1,0,0 +bronchial epithelial cell,1,2,2,2,2,1,1,0,0,0 +bronchial smooth muscle cell,0,0,2,2,2,1,0,0,0,0 +bronchus fibroblast of lung,0,0,2,2,0,1,0,0,0,0 +calcaneal tendon,0,0,0,0,2,0,0,0,0,0 +camera-type eye,0,0,2,2,2,1,0,0,0,0 +capillary endothelial cell,0,0,0,0,2,0,0,0,0,0 +cardiac atrium fibroblast,0,0,2,2,0,1,0,0,0,0 +cardiac fibroblast,1,1,0,0,2,0,1,0,0,0 +cardiac mesenchymal cell,0,0,0,0,2,0,0,0,0,0 +cardiac muscle cell,1,5,4,3,2,2,1,0,0,0 +cardiac septum,0,0,2,2,0,1,1,1,0,0 +cardiac ventricle fibroblast,0,0,2,2,0,1,0,0,0,0 +caudate nucleus,0,5,2,1,2,1,0,0,0,0 +cell of skeletal muscle,0,0,0,0,2,0,0,0,0,0 +"central memory CD4-positive, alpha-beta T cell",0,0,0,0,0,0,1,0,0,0 +"central memory CD8-positive, alpha-beta T cell",0,0,0,0,0,0,1,1,0,0 +central nervous system pericyte,0,0,0,0,2,0,0,0,0,0 +cerebellar cortex,0,0,0,0,0,0,1,0,0,0 +cerebellar hemisphere,0,0,2,1,0,1,0,0,0,0 +cerebellum,0,1,6,5,2,3,1,0,0,0 +cerebrospinal fluid,0,0,0,0,2,0,0,0,0,0 +chondrocyte,1,0,2,2,2,1,1,0,0,0 +chorion,0,4,2,2,0,1,1,0,0,0 +chorionic villus,0,1,2,2,0,1,0,0,0,0 +choroid plexus epithelial cell,1,1,0,0,0,0,1,0,0,0 +cingulate gyrus,0,5,0,0,0,0,0,0,0,0 +colon,0,0,0,0,2,0,0,0,0,0 +colonic mucosa,1,6,2,2,0,1,1,1,0,0 +"common myeloid progenitor, CD34-positive",0,6,2,1,0,1,1,0,0,0 +corneal epithelial cell,0,0,0,0,2,0,0,0,0,0 +coronary artery,1,2,2,1,0,1,1,0,0,0 +corpus callosum,0,0,0,0,2,0,0,0,0,0 +cortex of kidney,0,0,2,1,0,1,0,0,0,0 +cranial nerve II,0,0,0,0,2,0,0,0,0,0 +cruciate ligament of knee,0,0,0,0,2,0,0,0,0,0 +dark melanocyte,0,0,0,0,2,0,0,0,0,0 +dermis blood vessel endothelial cell,0,0,2,2,0,1,1,0,0,0 +dermis lymphatic vessel endothelial cell,0,0,2,2,0,1,0,0,0,0 +dermis microvascular lymphatic vessel endothelial cell,0,0,2,2,0,1,1,0,0,0 +diaphragm,0,0,0,0,2,0,0,0,0,0 +diencephalon,0,0,2,2,2,1,0,0,0,0 +dorsal plus ventral thalamus,0,0,0,0,2,0,0,0,0,0 +dorsolateral prefrontal cortex,1,6,4,3,0,2,1,0,0,0 +duodenal 
mucosa,0,7,0,0,0,0,0,0,0,0 +duodenum,0,0,0,0,2,0,0,0,0,0 +dura mater,0,0,0,0,2,0,0,0,0,0 +ecto neural progenitor cell,0,0,0,0,0,0,1,0,0,0 +ectocervix,0,0,2,1,0,1,0,0,0,0 +ectodermal cell,0,4,2,2,0,1,0,0,0,0 +"effector CD4-positive, alpha-beta T cell",0,0,0,0,0,0,1,0,0,0 +"effector memory CD4-positive, alpha-beta T cell",0,3,0,0,0,0,1,0,0,0 +"effector memory CD8-positive, alpha-beta T cell",0,0,0,0,0,0,1,1,0,0 +egg chorion,0,0,0,0,2,0,0,0,0,0 +embryonic uterus,0,0,0,0,2,0,0,0,0,0 +endocervix,0,0,2,1,0,1,0,0,0,0 +endocrine pancreas,0,5,2,2,0,1,0,0,0,0 +endodermal cell,1,6,4,4,0,2,1,0,0,0 +endothelial cell,1,0,2,2,0,1,1,0,0,0 +endothelial cell of artery,0,0,0,0,2,0,0,0,0,0 +endothelial cell of coronary artery,0,0,2,2,0,1,0,0,0,0 +endothelial cell of lymphatic vessel,0,0,0,0,2,0,0,0,0,0 +endothelial cell of umbilical vein,8,9,4,5,2,2,1,0,0,2 +enteric smooth muscle cell,0,0,0,0,2,0,0,0,0,0 +eosinophil,0,0,0,0,2,0,0,0,0,0 +epidermal melanocyte,0,0,0,0,0,0,1,0,0,0 +epididymis,0,0,0,0,2,0,0,0,0,0 +epithelial cell of Malassez,0,0,0,0,4,0,0,0,0,0 +epithelial cell of alveolus of lung,0,0,2,2,0,1,0,0,0,0 +epithelial cell of esophagus,0,1,0,0,2,0,1,0,0,0 +epithelial cell of prostate,1,1,0,0,2,0,1,0,0,0 +epithelial cell of proximal tubule,1,1,2,2,2,1,1,0,0,0 +epithelial cell of umbilical artery,0,0,2,2,0,1,0,0,0,0 +erythroblast,1,0,0,0,0,0,0,0,0,0 +esophagus,0,5,2,2,2,1,0,0,0,0 +esophagus mucosa,0,0,2,2,0,1,0,1,0,0 +esophagus muscularis mucosa,3,6,4,3,0,2,0,0,0,0 +esophagus squamous epithelium,2,6,4,3,0,2,0,0,0,0 +eye,0,0,0,0,0,0,1,0,0,0 +fallopian tube,0,0,2,1,0,1,0,1,0,0 +fat cell,0,0,0,0,2,0,0,0,0,0 +female gonad,0,0,0,0,2,0,0,0,0,0 +femur,0,0,0,0,0,0,1,0,0,0 +fibroblast derived cell line,0,0,2,1,0,1,0,0,0,0 +fibroblast of breast,0,4,0,0,0,0,0,0,0,0 +fibroblast of choroid plexus,0,0,0,0,2,0,0,0,0,0 +fibroblast of dermis,1,7,2,2,2,1,1,0,0,0 +fibroblast of gingiva,0,0,0,0,2,0,1,0,0,0 +fibroblast of lung,1,8,4,5,0,2,1,0,0,0 +fibroblast of lymphatic vessel,0,0,0,0,2,0,0,0,0,0 
+fibroblast of mammary gland,1,1,0,0,2,0,1,0,0,0 +fibroblast of peridontal ligament,0,0,0,0,0,0,1,0,0,0 +fibroblast of periodontium,0,0,0,0,2,0,0,0,0,0 +fibroblast of pulmonary artery,1,1,0,0,2,0,1,0,0,0 +fibroblast of skin of abdomen,0,0,2,1,0,1,1,0,0,0 +fibroblast of skin of back,0,0,2,1,0,1,1,0,0,0 +fibroblast of skin of left biceps,0,0,0,0,0,0,1,0,0,0 +fibroblast of skin of left quadriceps,0,0,0,0,0,0,1,0,0,0 +fibroblast of skin of right biceps,0,0,0,0,0,0,1,0,0,0 +fibroblast of skin of right quadriceps,0,0,0,0,0,0,1,0,0,0 +fibroblast of skin of scalp,0,0,2,1,0,1,1,0,0,0 +fibroblast of the aortic adventitia,1,1,2,2,2,1,1,0,0,0 +fibroblast of the conjuctiva,0,0,0,0,2,0,0,0,0,0 +fibroblast of the conjunctiva,0,0,0,0,0,0,1,0,0,0 +fibroblast of upper back skin,0,0,0,0,0,0,1,0,0,0 +fibroblast of villous mesenchyme,1,1,2,2,2,1,1,0,0,0 +forelimb muscle,0,0,2,1,0,1,1,0,0,0 +foreskin fibroblast,1,6,2,2,0,1,1,0,0,0 +foreskin keratinocyte,1,7,4,4,0,2,1,1,0,0 +foreskin melanocyte,0,7,2,2,0,1,1,0,0,0 +frontal cortex,0,0,4,3,2,2,1,0,0,0 +fungiform papilla,0,0,0,0,2,0,0,0,0,0 +gallbladder,0,0,0,0,2,0,0,0,0,0 +gamma-delta T cell,0,0,0,0,2,0,0,0,0,0 +gastrocnemius medialis,2,5,4,3,0,2,1,1,0,0 +gastroesophageal sphincter,4,6,4,3,0,2,0,1,0,0 +germinal matrix,0,5,0,0,0,0,0,0,0,0 +gingival epithelial cell,0,0,0,0,2,0,0,0,0,0 +globus pallidus,0,0,0,0,2,0,1,0,0,0 +glomerular endothelial cell,0,0,2,2,2,1,0,0,0,0 +glomerular visceral epithelial cell,0,0,0,0,0,0,1,0,0,0 +glutamatergic neuron,5,0,2,2,0,1,1,0,0,0 +hair follicle dermal papilla cell,0,0,2,2,0,1,0,0,0,0 +hair follicular keratinocyte,0,0,2,2,0,1,0,0,0,0 +head of caudate nucleus,0,0,0,0,0,0,1,0,0,0 +heart,0,4,4,5,2,2,1,0,0,0 +heart left ventricle,2,6,4,4,2,2,1,1,0,0 +heart right ventricle,1,6,4,4,0,2,1,1,0,0 +hematopoietic multipotent progenitor cell,0,0,2,2,0,1,1,0,0,0 +hepatic mesenchymal stem cell,0,0,0,0,2,0,0,0,0,0 +hepatic stellate cell,0,0,0,0,0,0,1,0,0,0 +hepatocyte,1,7,4,3,2,2,1,0,0,0 +hindlimb 
muscle,0,0,2,1,0,1,1,0,0,0 +hypothalamus,0,0,2,1,0,1,0,0,0,0 +iPS DF 19.11,0,4,0,0,0,0,0,0,0,0 +iPS DF 6.9,0,2,0,0,0,0,1,0,0,0 +iPS-11a,0,1,0,0,0,0,0,0,0,0 +iPS-15b,0,5,0,0,0,0,0,0,0,0 +iPS-18a,0,6,0,0,0,0,0,0,0,0 +iPS-18c,0,2,0,0,0,0,0,0,0,0 +iPS-20b,0,7,0,0,0,0,0,0,0,0 +iPS-NIHi11,0,0,0,0,0,0,1,0,0,0 +immature CD1a-positive Langerhans cell,0,0,0,0,2,0,0,0,0,0 +immature conventional dendritic cell,0,0,0,0,4,0,0,0,0,0 +immature natural killer cell,0,0,2,1,0,1,1,0,0,0 +inferior rectus extraocular muscle,0,0,0,0,2,0,0,0,0,0 +inflammatory macrophage,0,0,0,0,0,0,1,0,0,0 +insula,0,0,0,0,2,0,0,0,0,0 +intestinal epithelial cell,0,0,0,0,2,0,0,0,0,0 +iris pigment epithelial cell,0,0,0,0,2,0,1,0,0,0 +islet precursor cell,0,0,0,0,0,0,1,0,0,0 +keratinocyte,2,8,2,3,2,1,1,0,0,0 +keratocyte,0,0,0,0,2,0,0,0,0,0 +kidney,0,5,4,5,2,2,1,1,0,0 +kidney capillary endothelial cell,0,0,0,0,0,0,1,0,0,0 +kidney epithelial cell,1,2,2,2,0,1,1,0,0,0 +kidney tubule cell,0,0,0,0,0,0,1,0,0,0 +large intestine,0,6,0,1,0,0,1,0,0,0 +lateral rectus extra-ocular muscle,0,0,0,0,2,0,0,0,0,0 +layer of hippocampus,0,5,0,0,0,0,0,0,0,0 +left cardiac atrium,0,0,2,2,2,1,1,1,0,0 +left colon,0,0,2,2,0,1,1,1,0,0 +left forelimb,0,0,0,0,0,0,1,0,0,0 +left hindlimb,0,0,0,0,0,0,1,0,0,0 +left kidney,0,0,2,1,0,1,1,0,0,0 +left lobe of liver,0,0,2,2,0,1,1,1,0,0 +left lung,1,6,4,3,0,2,1,1,0,0 +left ovary,0,0,0,0,2,0,0,0,0,0 +left renal cortex interstitium,0,0,2,1,0,1,1,0,0,0 +left renal pelvis,0,0,2,1,0,1,1,0,0,0 +left ventricle myocardium,0,0,2,1,0,1,0,0,0,0 +left ventricle myocardium inferior,1,6,2,2,0,1,1,0,0,0 +left ventricle myocardium superior,0,0,2,2,0,1,1,1,0,0 +lens epithelial cell,0,0,0,0,2,0,0,0,0,0 +leptomeningeal cell,0,0,0,0,2,0,0,0,0,0 +light melanocyte,0,0,0,0,2,0,0,0,0,0 +liver,18,6,4,5,2,2,1,1,0,0 +locus ceruleus,0,0,0,0,2,0,0,0,0,0 +lower leg skin,2,0,4,3,0,2,1,0,0,0 +lower lobe of left lung,1,6,2,2,0,1,1,1,0,0 +lower lobe of right lung,1,5,2,2,2,1,1,1,0,0 +luminal epithelial cell of mammary 
gland,0,4,2,2,0,1,0,0,0,0 +lung,0,7,4,4,2,2,1,1,0,0 +lung fibroblast,0,0,0,0,2,0,0,0,0,0 +lung microvascular endothelial cell,0,0,2,2,0,1,1,0,0,0 +lymph node,0,0,0,0,2,0,0,0,0,0 +lymphoblast,0,0,2,1,0,1,0,0,0,0 +macrophage,0,0,0,0,2,0,0,0,0,0 +mammary epithelial cell,2,10,4,4,0,2,1,0,0,0 +mammary gland epithelial cell,0,0,0,0,2,0,0,0,0,0 +mammary microvascular endothelial cell,0,0,2,2,0,1,0,0,0,0 +mammary stem cell,0,0,2,2,0,1,0,0,0,0 +mast cell,0,0,0,0,2,0,0,0,0,0 +medial rectus extraocular muscle,0,0,0,0,2,0,0,0,0,0 +medulla oblongata,0,0,0,0,2,0,0,0,0,0 +melanocyte of skin,0,0,2,2,0,1,0,0,0,0 +memory B cell,0,0,0,0,0,0,1,1,0,0 +meninx,0,0,0,0,2,0,0,0,0,0 +mesangial cell,0,0,2,2,2,1,0,0,0,0 +mesenchymal cell,0,0,0,0,2,0,0,0,0,0 +mesenchymal stem cell,0,14,2,2,0,1,1,0,0,0 +mesenchymal stem cell of Wharton's jelly,0,0,2,2,2,1,0,0,0,0 +mesenchymal stem cell of adipose,0,0,2,2,2,1,0,0,0,0 +mesenchymal stem cell of the bone marrow,0,0,2,2,2,1,0,0,0,0 +mesenchymal stem cell of umbilical cord,0,0,0,0,2,0,0,0,0,0 +mesendoderm,0,7,2,2,0,1,1,0,0,0 +mesenteric fat pad,0,0,2,2,0,1,1,1,0,0 +mesodermal cell,0,5,2,2,0,1,0,0,0,0 +mesothelial cell,0,0,0,0,4,0,0,0,0,0 +mesothelial cell of epicardium,1,0,2,2,0,1,1,0,0,0 +metanephros,0,0,2,2,0,1,0,0,0,0 +middle frontal gyrus,0,0,0,0,2,0,0,0,0,0 +middle temporal gyrus,0,0,0,0,2,0,0,0,0,0 +mitral valve,0,0,0,0,2,0,0,0,0,0 +mole,0,0,0,2,0,0,0,0,0,0 +monocyte,0,0,0,0,2,0,0,0,0,0 +mononuclear cell,0,3,2,2,0,1,0,0,0,0 +motor neuron,0,3,2,1,0,1,0,1,0,0 +mouth mucosa,0,0,0,0,2,0,0,0,0,0 +mucosa of descending colon,1,3,2,2,0,1,1,1,0,0 +mucosa of gallbladder,0,0,2,2,0,1,1,1,0,0 +mucosa of rectum,0,5,0,0,0,0,0,0,0,0 +mucosa of stomach,0,4,0,0,0,0,0,0,0,0 +mucosa of urinary bladder,0,0,0,0,0,0,0,1,0,0 +muscle layer of colon,0,5,0,0,0,0,0,0,0,0 +muscle layer of duodenum,0,5,0,0,0,0,0,0,0,0 +muscle of arm,0,0,2,1,0,1,1,0,0,0 +muscle of back,0,0,2,1,0,1,1,0,0,0 +muscle of leg,0,6,2,3,0,1,1,0,0,0 +muscle of trunk,0,6,2,1,0,1,1,0,0,0 
+myelocyte,0,0,0,0,2,0,0,0,0,0 +myocyte,0,0,4,3,0,2,1,0,0,0 +myoepithelial cell of mammary gland,0,6,2,2,0,1,0,0,0,0 +myometrial cell,0,0,2,2,0,1,0,0,0,0 +myotube,0,9,2,2,2,1,1,0,0,0 +nail plate,0,0,0,0,2,0,0,0,0,0 +naive B cell,0,0,2,1,0,1,1,1,0,0 +naive regulatory T cell,0,0,0,0,4,0,0,0,0,0 +"naive thymus-derived CD4-positive, alpha-beta T cell",0,4,4,3,2,2,1,1,0,0 +"naive thymus-derived CD8-positive, alpha-beta T cell",0,0,4,3,0,2,1,1,0,0 +nasal cavity respiratory epithelium epithelial cell of viscerocranial mucosa,0,0,2,2,0,1,0,0,0,0 +natural killer cell,1,5,4,4,2,2,1,1,0,0 +nephron,1,0,0,0,0,0,0,0,0,0 +nephron progenitor cell,1,0,0,0,0,0,1,0,0,0 +nephron tubule epithelial cell,0,0,0,0,2,0,0,0,0,0 +neural cell,5,4,2,2,0,1,0,0,0,0 +neural crest cell,1,0,2,2,0,1,1,0,0,0 +neural progenitor cell,3,9,2,2,0,1,1,0,0,0 +neuron,0,5,0,0,2,0,0,0,0,0 +neuronal stem cell,0,11,2,2,2,1,1,0,0,0 +neurosphere,0,6,2,2,0,1,0,0,0,0 +neutrophil,1,6,0,0,4,0,0,0,0,0 +non-pigmented ciliary epithelial cell,0,0,0,0,2,0,1,0,0,0 +nucleus accumbens,0,0,2,1,2,1,0,0,0,0 +nucleus pulposus cell of intervertebral disc,0,0,0,0,2,0,0,0,0,0 +occipital lobe,0,0,2,2,2,1,0,0,0,0 +occipital pole,0,0,0,0,2,0,0,0,0,0 +olfactory epithelial cell,0,0,0,0,4,0,0,0,0,0 +olfactory region,0,0,0,0,2,0,0,0,0,0 +oligodendrocyte precursor cell,0,0,0,0,2,0,0,0,0,0 +omental fat pad,1,0,4,3,0,2,1,1,0,0 +omentum preadipocyte,0,0,0,0,2,0,0,0,0,0 +osteoblast,1,9,2,2,2,1,0,0,0,0 +osteocyte,1,0,2,2,0,1,0,0,0,0 +outer medulla of kidney,0,0,2,1,0,1,0,0,0,0 +outer root sheath cell,0,0,0,0,2,0,0,0,0,0 +ovary,2,6,6,6,0,3,1,1,0,0 +pancreas,1,5,4,4,2,2,1,1,0,0 +paracentral gyrus,0,0,0,0,2,0,0,0,0,0 +parathyroid adenoma,1,6,0,0,0,0,0,0,0,0 +parietal lobe,0,0,2,2,2,1,0,0,0,0 +parotid gland,0,0,0,0,2,0,0,0,0,0 +penis,0,0,0,0,2,0,0,0,0,0 +pericardium fibroblast,0,0,2,2,0,1,0,0,0,0 +perineural cell,0,0,0,0,2,0,0,0,0,0 +peripheral blood mononuclear cell,0,7,2,2,0,1,0,0,0,0 +perirenal adipocyte cell,0,0,0,0,2,0,0,0,0,0 +perirenal 
preadipocyte,0,0,0,0,2,0,0,0,0,0 +pineal body,0,0,0,0,2,0,0,0,0,0 +pituitary gland,0,0,2,1,2,1,0,0,0,0 +placenta,1,6,4,4,2,2,1,0,0,0 +placental basal plate,0,1,2,2,0,1,0,0,0,0 +placental epithelial cell,0,0,2,2,2,1,0,0,0,0 +placental pericyte,0,0,2,2,0,1,0,0,0,0 +plasmacytoid dendritic cell,0,0,0,0,4,0,0,0,0,0 +pneumocyte,0,0,0,0,2,0,0,0,0,0 +pons,0,0,0,0,2,0,1,0,0,0 +postcentral gyrus,0,0,0,0,2,0,0,0,0,0 +posterior cingulate gyrus,0,0,0,0,0,0,1,0,0,0 +posterior vena cava,0,0,2,2,0,1,1,1,0,0 +preadipocyte of the breast,0,0,0,0,2,0,0,0,0,0 +progenitor cell of endocrine pancreas,1,0,2,2,0,1,1,0,0,0 +prostate gland,3,3,4,3,2,2,1,0,0,0 +prostate stromal cell,0,0,0,0,2,0,0,0,0,0 +psoas muscle,1,5,4,4,0,2,1,1,0,0 +pulmonary artery endothelial cell,0,0,2,2,0,1,1,0,0,0 +pulmonary valve,0,0,0,0,2,0,0,0,0,0 +putamen,0,0,2,1,2,1,1,0,0,0 +rectal smooth muscle tissue,0,4,0,0,0,0,0,0,0,0 +rectum,0,0,0,0,2,0,0,0,0,0 +regular cardiac myocyte,0,0,2,2,0,1,0,0,0,0 +renal cortex interstitium,0,0,2,1,0,1,1,0,0,0 +renal cortical epithelial cell,0,0,2,2,2,1,1,0,0,0 +renal pelvis,0,0,2,1,0,1,1,0,0,0 +respiratory epithelial cell,0,0,0,0,2,0,0,0,0,0 +respiratory system smooth muscle,0,0,0,0,2,0,0,0,0,0 +reticulocyte,0,0,0,0,2,0,0,0,0,0 +retina,0,0,0,0,2,0,1,0,0,0 +retinal pigment epithelial cell,1,1,0,0,2,0,1,0,0,0 +right atrium auricular region,1,5,4,3,0,2,1,0,0,0 +right cardiac atrium,0,3,4,4,0,2,1,1,0,0 +right forelimb,0,0,0,0,0,0,1,0,0,0 +right hindlimb,0,0,0,0,0,0,1,0,0,0 +right kidney,0,0,0,0,0,0,1,0,0,0 +right lobe of liver,2,4,4,3,0,2,1,1,0,0 +right lung,0,0,2,1,0,1,1,0,0,0 +right ovary,0,0,0,0,2,0,0,0,0,0 +right renal cortex interstitium,0,0,2,1,0,1,1,0,0,0 +right renal pelvis,0,0,2,1,0,1,1,0,0,0 +saliva-secreting gland,0,0,0,0,2,0,0,0,0,0 +sciatic nerve,0,0,2,2,0,1,1,1,0,0 +seminal vesicle,0,0,0,0,2,0,0,0,0,0 +sigmoid colon,4,5,6,5,0,3,1,1,0,0 +skeletal muscle cell,0,1,0,0,0,0,1,0,0,0 +skeletal muscle myoblast,1,7,4,5,2,2,1,0,0,0 +skeletal muscle satellite cell,0,6,2,2,2,1,0,0,0,0 
+skeletal muscle tissue,0,6,2,2,2,1,0,0,0,0 +skin epidermis,0,2,0,0,0,0,0,0,0,0 +skin fibroblast,0,0,0,0,2,0,0,0,1,0 +skin of body,0,0,2,2,0,1,0,0,0,0 +skin of palm of manus,0,0,0,0,2,0,0,0,0,0 +small intestine,0,6,2,3,2,1,1,0,0,0 +smooth muscle cell,0,9,4,3,0,2,0,0,0,0 +smooth muscle cell of bladder,0,0,2,2,2,1,0,0,0,0 +smooth muscle cell of colon,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of prostate,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the brachiocephalic vasculature,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the brain vasculature,0,0,0,0,2,0,1,0,0,0 +smooth muscle cell of the carotid artery,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the coronary artery,0,0,2,2,2,1,0,0,0,0 +smooth muscle cell of the esophagus,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the internal thoracic artery,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the pulmonary artery,0,0,2,2,2,1,0,0,0,0 +smooth muscle cell of the subclavian artery,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of the umbilical artery,0,0,2,2,2,1,0,0,0,0 +smooth muscle cell of the umbilical vein,0,0,0,0,2,0,0,0,0,0 +smooth muscle cell of trachea,0,0,2,2,2,1,0,0,0,0 +smooth muscle tissue,0,0,0,0,2,0,0,0,0,0 +soleus muscle,0,0,0,0,2,0,0,0,0,0 +spinal cord,0,5,4,5,2,2,1,0,0,0 +spleen,3,6,6,6,2,3,1,1,0,0 +stem cell of epidermis,0,0,0,0,2,0,0,0,0,0 +stomach,3,6,6,6,2,3,1,1,0,0 +stomach smooth muscle,0,6,0,0,0,0,0,0,0,0 +stromal cell of bone marrow,0,0,0,0,0,0,1,0,0,0 +stromal cell of pancreas,0,0,0,0,2,0,0,0,0,0 +subcutaneous abdominal adipose tissue,0,7,0,0,0,0,0,0,0,0 +subcutaneous adipose tissue,0,0,4,3,0,2,0,1,0,0 +subcutaneous fat cell,0,0,0,0,2,0,0,0,0,0 +subcutaneous preadipocyte,0,0,2,2,2,1,0,0,0,0 +submandibular gland,0,0,0,0,2,0,0,0,0,0 +substantia nigra,0,5,2,1,2,1,0,0,0,0 +superior rectus extraocular muscle,0,0,0,0,2,0,0,0,0,0 +suppressor macrophage,0,0,0,0,0,0,1,0,0,0 +suprapubic skin,3,5,4,3,0,2,1,0,0,0 +synovial cell,0,0,0,0,2,0,0,0,0,0 +temporal lobe,0,5,2,2,2,1,0,0,0,0 +tendon cell,0,0,0,0,2,0,0,0,0,0 
+testis,1,3,6,6,2,3,1,0,0,0 +thoracic aorta,1,4,2,2,0,1,1,0,0,0 +thoracic aorta endothelial cell,0,0,2,2,2,1,0,0,0,0 +throat,0,0,0,0,2,0,0,0,0,0 +thymus,0,6,2,3,2,1,1,0,0,0 +thyroid gland,2,4,4,3,2,2,1,1,0,0 +tibial artery,1,6,2,1,0,1,1,1,0,0 +tibial nerve,3,6,4,3,0,2,1,0,0,0 +tongue,0,0,2,2,2,1,1,0,0,0 +tonsil,0,0,0,0,2,0,0,0,0,0 +trabecular meshwork cell,0,0,0,0,2,0,0,0,0,0 +trachea,0,0,0,0,2,0,0,0,0,0 +tracheal epithelial cell,0,0,2,2,2,1,0,0,0,0 +transverse colon,4,5,4,3,0,2,1,1,0,0 +tricuspid valve,0,0,0,0,2,0,0,0,0,0 +trophoblast,0,3,2,2,0,1,0,0,0,0 +trophoblast cell,0,14,2,2,0,1,1,0,0,0 +type B pancreatic cell,1,0,2,2,0,1,0,0,0,0 +umbilical cord,0,0,2,2,2,1,1,0,0,0 +umbilical cord blood,0,0,0,0,2,0,0,0,0,0 +upper lobe of left lung,4,4,4,3,0,2,1,1,0,0 +upper lobe of right lung,1,4,2,2,0,1,1,1,0,0 +ureter,0,0,2,2,0,1,1,1,0,0 +urethra,0,0,0,0,2,0,0,0,0,0 +urinary bladder,0,2,6,5,2,3,0,0,0,0 +urothelial cell,0,0,0,0,2,0,0,0,0,0 +urothelium cell line,0,0,0,0,0,0,1,0,0,0 +uterine cervix,0,0,0,0,2,0,0,0,0,0 +uterine smooth muscle cell,0,0,2,2,2,1,0,0,0,0 +uterus,2,2,4,3,2,2,1,1,0,0 +vagina,3,4,4,3,2,2,0,0,0,0 +vas deferens,0,0,0,0,2,0,0,0,0,0 +vein,0,0,0,0,2,0,0,0,0,0 +vein endothelial cell,0,0,2,2,2,1,0,0,0,0 +venous blood,0,0,2,1,0,1,0,0,0,0 +vermiform appendix,0,0,0,0,2,0,0,0,0,0 +vertebral mesenchymal stem cell,0,0,0,0,2,0,0,0,0,0 +visceral preadipocyte,0,0,0,0,2,0,0,0,0,0 +vitreous humor,0,0,0,0,2,0,0,0,0,0 +zone of skin,0,0,0,0,2,0,0,0,0,0 diff --git a/notebooks/deepISA_tutorial.ipynb b/notebooks/deepISA_tutorial.ipynb index 83304df..c08c943 100644 --- a/notebooks/deepISA_tutorial.ipynb +++ b/notebooks/deepISA_tutorial.ipynb @@ -482,35 +482,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "from deepISA.scoring.mapper import map_motifs\n", - "import torch\n", - "#TODO: write preprocess_remap(cell_type). 
\n", - "\n", - "# load the best model\n", - "model_path = os.path.join(RESULTS_DIR, \"Model\", \"model_best.pt\")\n", - "model.load_state_dict(torch.load(model_path, weights_only=True))\n", - "\n", - "map_motifs(\n", - " regions_df=df_pos, \n", - " fasta_path=FASTA_PATH,\n", - " jaspar_path=JASPAR_BB,\n", - " outpath=os.path.join(RESULTS_DIR,\"motif_locs.csv\"),\n", - " model=model,\n", - " device=DEVICE,\n", - " tracks=[0],\n", - " expressed_tfs=None, # set to None for tutorial, but providing expressed_tf_list is highly recommended for real analysis to reduce false positives.\n", - " motif_score_thresh=500,\n", - " remap_path=None,\n", - " attr_percentile=70,\n", - " attr_batch_size=1024\n", - ")\n", - "\n", - "\n", - "# read the mapped motif locations\n", - "df_motif_locs = pd.read_csv(os.path.join(RESULTS_DIR,\"motif_locs.csv\"))\n", - "df_motif_locs.head()" - ] + "source": "from deepISA.scoring.mapper import map_motifs\nimport torch\n\n# load the best model (or the pretrained model if you skipped training)\nmodel_path = os.path.join(RESULTS_DIR, \"Model\", \"model_best.pt\")\nmodel.load_state_dict(torch.load(model_path, weights_only=True))\n\n# Note: for local testing with mini_jaspar.bb, use motif_score_thresh=200\n# (mini_jaspar.bb max score is 494). 
For production with full JASPAR use 500.\nmap_motifs(\n regions_df=df_pos, \n fasta_path=FASTA_PATH,\n jaspar_path=JASPAR_BB,\n outpath=os.path.join(RESULTS_DIR,\"motif_locs.csv\"),\n model=model,\n device=DEVICE,\n tracks=[0],\n expressed_tfs=None, # set to None for tutorial; provide expressed_tf_list for real analysis\n motif_score_thresh=200, # use 500 for full JASPAR; 200 for mini_jaspar.bb test\n remap_path=None,\n attr_percentile=70,\n attr_batch_size=1024\n)\n\n# read the mapped motif locations\ndf_motif_locs = pd.read_csv(os.path.join(RESULTS_DIR,\"motif_locs.csv\"))\ndf_motif_locs.head()" }, { "cell_type": "markdown", @@ -1180,6 +1152,148 @@ "source": [ "plot_cell_specificity(df_coop_tf)" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 8: AlphaGenome Backend\n", + "\n", + "Use the AlphaGenome API as a drop-in model backend for ISA scoring —\n", + "the same pipeline (motif mapping → attr_filter → ISA) runs unchanged;\n", + "only the prediction model is swapped.\n", + "\n", + "**Prerequisites:**\n", + "- AlphaGenome API key (apply at https://github.com/google-deepmind/alphagenome)\n", + "- `pip install alphagenome pyyaml` (already in `deepisa_ag_env`)\n", + "- Parts 1–3 of this tutorial must have been run first: `FASTA_PATH`, `RESULTS_DIR`,\n", + " and `motif_locs.csv` produced by `map_motifs()` are all required.\n", + "\n", + "---\n", + "\n", + "### Step 0: Choose your biosample and assay type\n", + "\n", + "AlphaGenome supports **714 human cell lines / tissues** and **10 assay types**.\n", + "Before writing the config, open the reference table to find the right combination:\n", + "\n", + "```\n", + "notebooks/ag_biosample_reference.csv\n", + "```\n", + "\n", + "Each row is a biosample, each column is an assay type, and the value is the\n", + "number of tracks available (0 = not available for that combination).\n", + "\n", + "| Assay | What it measures | Typical use |\n", + "|---|---|---|\n", + "| **DNASE** | DNase-seq 
chromatin accessibility | default for regulatory elements |\n", + "| **ATAC** | ATAC-seq open chromatin | alternative to DNASE |\n", + "| **CAGE** | CAGE-seq promoter activity | promoter / TSS analysis |\n", + "| **RNA_SEQ** | RNA-seq gene expression | gene-level signal |\n", + "| **CHIP_TF** | TF ChIP-seq occupancy | TF-centric analysis (many tracks) |\n", + "| **CHIP_HISTONE** | Histone mark ChIP-seq | chromatin state |\n", + "| PROCAP | PRO-cap nascent transcription | high-res TRE activity |\n", + "| SPLICE_SITE_USAGE | Splicing | splicing QTL analysis |\n", + "\n", + "The code cell below loads the table and filters it — run it to find your cell line.\n", + "Then copy the exact `biosample_name` string into the config cell." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": "import os\n\n# ── Paste your AlphaGenome API key here ──────────────────────────────────────\nALPHA_GENOME_API_KEY = \"YOUR_API_KEY_HERE\" # ← replace with your key\n\n# Alternative: load from a .env file (one line: just the key)\n# with open(\"/path/to/.env\") as f:\n# ALPHA_GENOME_API_KEY = f.read().strip()\n\nos.environ[\"ALPHA_GENOME_API_KEY\"] = ALPHA_GENOME_API_KEY\nprint(\"API key loaded:\", ALPHA_GENOME_API_KEY[:8] + \"...\" if len(ALPHA_GENOME_API_KEY) > 8 else \"(not set)\")", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": null, + "metadata": {}, + "source": [ + "import pandas as pd\n", + "\n", + "# Load the reference table (714 biosamples × 10 assay types)\n", + "ref = pd.read_csv(\"ag_biosample_reference.csv\", index_col=0)\n", + "\n", + "# ── Search by cell-line name (case-insensitive substring match) ──────────────\n", + "SEARCH = \"GM12878\" # ← change to your cell line\n", + "hits = ref[ref.index.str.contains(SEARCH, case=False)]\n", + "print(f\"Matches for '{SEARCH}':\")\n", + "print(hits.to_string())\n", + "\n", + "# ── Or: list all biosamples that have DNASE data ─────────────────────────────\n", + "# 
print(ref[ref['DNASE'] > 0].index.tolist())\n", + "\n", + "# ── Or: show the full table in a scrollable view ─────────────────────────────\n", + "# from IPython.display import display\n", + "# display(ref.style.highlight_max(axis=0))" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": {}, + "source": "from alphagenome.models import dna_client\nfrom alphagenome.models.dna_output import OutputType\n\ndna_model_meta = dna_client.create(ALPHA_GENOME_API_KEY)\nmeta = dna_model_meta.output_metadata(dna_client.Organism.HOMO_SAPIENS).concatenate()\n\noutput_type = \"DNASE\" # change to ATAC, CAGE, RNA_SEQ, CHIP_TF, etc.\noutput_type_enum = OutputType[output_type]\nbiosamples = sorted(\n meta[meta[\"output_type\"] == output_type_enum][\"biosample_name\"]\n .dropna().unique()\n)\nprint(f\"Available biosamples for {output_type}: {len(biosamples)} total\")\nprint(\"First 20:\", biosamples[:20])", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import yaml\n\n# Pick a biosample matching your cell type; browse the list above.\nBIOSAMPLE = biosamples[0] # replace with e.g. 
\"GM12878\" for B-lymphocytes\nprint(f\"Using biosample: '{BIOSAMPLE}'\")\n\nAG_CONFIG_PATH = os.path.join(RESULTS_DIR, \"ag_config.yaml\")\n# ── Single-track config (default) ───────────────────────────────────────────\n# config = {\n# 'api_key': ALPHA_GENOME_API_KEY, 'output_type': output_type,\n# 'biosample_name': BIOSAMPLE, 'context_len': 16384, 'seq_len': 600, 'aggregation': 'sum',\n# }\n\n# ── Multi-track config: combine assays / cell lines in ONE API call ──────────\n# config = {\n# 'api_key': ALPHA_GENOME_API_KEY, 'context_len': 16384, 'seq_len': 600, 'aggregation': 'sum',\n# 'tracks': [\n# {'output_type': 'DNASE', 'biosample_name': 'GM12878'},\n# {'output_type': 'CAGE', 'biosample_name': 'GM12878'},\n# {'output_type': 'ATAC', 'biosample_name': 'K562'},\n# ]\n# }\n# n_tracks = DNASE(1) + CAGE(2) + ATAC(1) = 4 → isa_t0 .. isa_t3\n# Use calc_coop_score(..., track_idx=i) to analyse each track separately.\n\nconfig = {\n \"api_key\": ALPHA_GENOME_API_KEY,\n \"output_type\": output_type,\n \"biosample_name\": BIOSAMPLE,\n \"context_len\": 16384,\n \"seq_len\": 600,\n \"aggregation\": \"sum\",\n}\nwith open(AG_CONFIG_PATH, \"w\") as f:\n yaml.dump(config, f, default_flow_style=False)\nprint(f\"Config written → {AG_CONFIG_PATH}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": null, + "metadata": {}, + "source": [ + "### Choosing tracks and handling multi-track assays\n", + "\n", + "The number of tracks (`n_tracks`) depends on your biosample + assay combination:\n", + "\n", + "| Assay | Typical n_tracks | Notes |\n", + "|---|---|---|\n", + "| DNASE | 1–3 | one experiment per biosample; easy to interpret |\n", + "| ATAC | 1–2 | same as DNASE |\n", + "| CAGE | 2–4 | plus/minus strand or replicate tracks |\n", + "| CHIP_TF | 1–100+ | **one track per TF** — see note below |\n", + "| CHIP_HISTONE | 1–10 | one per histone mark |\n", + "\n", + "**What happens with multiple tracks:**\n", + "- `adapter._ontology_terms` lists 
every track for your biosample.\n", + "- `run_single_isa` outputs columns `isa_t0, isa_t1, ..., isa_t{n-1}` — one per track.\n", + "- `calc_coop_score(..., track_idx=i)` lets you analyse one track at a time.\n", + " Run it once per track of interest, or loop over `range(n_tracks)`.\n", + "\n", + "**CHIP_TF note:** each track is a different TF's ChIP-seq experiment.\n", + "With 30+ tracks the ISA output is wide but each column is interpretable independently.\n", + "Use `track_idx` to pick the column matching the TF you care about.\n", + "\n", + "**Recommended starting point:** use `DNASE` or `ATAC` (n_tracks ≤ 3).\n", + "The ISA scores will reflect overall chromatin accessibility — equivalent to\n", + "the Conv model's regression head." + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ + "import torch\nfrom deepISA.modeling.alpha_genome_adapter import AlphaGenomeAdapter\nfrom deepISA.scoring.single_isa import run_single_isa\n\n# motif_locs.csv was produced (and already filtered) by map_motifs() in Part 3\nMOTIF_LOCS = os.path.join(RESULTS_DIR, \"motif_locs.csv\")\nAG_ISA_PATH = os.path.join(RESULTS_DIR, \"motif_single_isa_ag.csv\")\n\nadapter = AlphaGenomeAdapter(AG_CONFIG_PATH)\nn_tracks = adapter.n_tracks # true output width from probe call (may differ from len(_ontology_terms))\ndevice = torch.device(\"cpu\")\nprint(f\"Adapter ready — biosample '{BIOSAMPLE}', n_tracks={n_tracks}\")\n\nrun_single_isa(\n model = adapter,\n fasta_path = FASTA_PATH,\n motif_locs_path = MOTIF_LOCS,\n outpath = AG_ISA_PATH,\n device = device,\n tracks = list(range(n_tracks)),\n num_regions_per_batch = 10,\n pred_batch_size = 1, # one API call per sequence\n)\nprint(f\"AlphaGenome single ISA complete → {AG_ISA_PATH}\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "metadata": {}, + "source": "from deepISA.scoring.combi_isa import run_combi_isa\n\nAG_COMBI_PATH = os.path.join(RESULTS_DIR, 
\"motif_combi_isa_ag.csv\")\n\nrun_combi_isa(\n model = adapter,\n fasta_path = FASTA_PATH,\n motif_locs_path = MOTIF_LOCS, # same filtered motif locs as single ISA\n outpath = AG_COMBI_PATH,\n device = device,\n inde_dist_max = 255,\n tracks = list(range(n_tracks)),\n num_regions_per_batch = 5,\n pred_batch_size = 1,\n)\nprint(f\"AlphaGenome combinatorial ISA complete → {AG_COMBI_PATH}\")", + "outputs": [], + "execution_count": null } ], "metadata": { @@ -1204,4 +1318,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 9d3341f..43419e3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,9 +37,8 @@ dependencies = [ ] [project.optional-dependencies] -dev = [ - "pytest>=7.0", # For the Phase 1/2 validation logic -] +dev = ["pytest>=7.0"] +alphagenome = ["alphagenome", "pyyaml"] [tool.setuptools.packages.find] where = ["src"] diff --git a/src/deepISA/modeling/__init__.py b/src/deepISA/modeling/__init__.py index e69de29..0ede3a6 100644 --- a/src/deepISA/modeling/__init__.py +++ b/src/deepISA/modeling/__init__.py @@ -0,0 +1,5 @@ +# Alpha-Genome backend (optional — requires: pip install alphagenome pyyaml) +try: + from deepISA.modeling.alpha_genome_adapter import AlphaGenomeAdapter +except ImportError: + pass # alphagenome not installed; ConvModel path unaffected diff --git a/src/deepISA/modeling/alpha_genome_adapter.py b/src/deepISA/modeling/alpha_genome_adapter.py new file mode 100644 index 0000000..7872d75 --- /dev/null +++ b/src/deepISA/modeling/alpha_genome_adapter.py @@ -0,0 +1,217 @@ +"""AlphaGenome adapter — drop-in nn.Module backend for deepISA. 
def load_config(path: str) -> dict[str, Any]:
    """Load and validate an AlphaGenome adapter config from a YAML file.

    Two layouts are accepted:

    * single-track (legacy): top-level ``output_type`` and ``biosample_name``;
      these are normalised into a one-element ``tracks`` list.
    * multi-track: a ``tracks`` list whose entries each carry ``output_type``
      and ``biosample_name``.

    Keys absent from the file are filled in from ``_DEFAULTS``.

    Args:
        path: Location of the YAML config file.

    Returns:
        The config mapping with defaults applied and a ``tracks`` list present.

    Raises:
        KeyError: ``api_key`` is missing, or a required track key is missing.
        TypeError: The YAML document is not a mapping.
        ValueError: ``tracks`` is given but is not a non-empty list.
    """
    # The shipped template contains non-ASCII comment characters, so decode
    # explicitly as UTF-8 instead of relying on the platform default.
    with open(path, encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    # safe_load yields None for an empty document; treat that as "no keys" so
    # the caller gets the same KeyError as for any other missing api_key.
    if cfg is None:
        cfg = {}
    if not isinstance(cfg, dict):
        raise TypeError(
            f"alpha_genome config must be a YAML mapping, got {type(cfg).__name__}"
        )

    if "api_key" not in cfg:
        raise KeyError("alpha_genome config missing required key: 'api_key'")

    required = ("output_type", "biosample_name")
    if "tracks" not in cfg:
        # Normalise old single-track format → new tracks list
        for key in required:
            if key not in cfg:
                raise KeyError(f"alpha_genome config missing required key: '{key}'")
        tracks = [{"output_type": cfg["output_type"],
                   "biosample_name": cfg["biosample_name"]}]
    else:
        tracks = cfg["tracks"]
        if not isinstance(tracks, list) or not tracks:
            raise ValueError(
                "alpha_genome config key 'tracks' must be a non-empty list"
            )
        # Fail here with a precise message rather than later, mid-construction.
        for i, track in enumerate(tracks):
            for key in required:
                if not isinstance(track, dict) or key not in track:
                    raise KeyError(
                        f"alpha_genome config track #{i} missing required key: '{key}'"
                    )

    return {**_DEFAULTS, **cfg, "tracks": tracks}
class AlphaGenomeAdapter(nn.Module):
    """
    Drop-in nn.Module replacement for deepISA's Conv model.

    Supports one or more (output_type, biosample_name) track combinations via
    the config file.  Every sequence prediction is a single API call; columns
    are concatenated in the order the tracks appear in the config.

    Returns (N, n_tracks) float32 tensor compatible with run_single_isa /
    run_combi_isa.  Use adapter.n_tracks to know the output width.

    The config key ``aggregation`` chooses how the per-position signal inside
    the central ``seq_len`` window is reduced to one value per track:
    ``sum`` (default), ``mean`` or ``max``.

    Predictions are memoised per raw input sequence in an unbounded dict;
    call ``clear_cache()`` to release it.
    """

    # Reducers selectable through the config's ``aggregation`` key.
    _AGGREGATORS = {"sum": np.sum, "mean": np.mean, "max": np.max}

    def __init__(self, config_path: str) -> None:
        """Build the client, resolve the configured tracks and probe the API.

        Args:
            config_path: Path to a YAML file accepted by ``load_config``.

        Raises:
            ValueError: Unsupported ``aggregation``; a biosample that does not
                exist for the requested output type; or a probe response with
                no column for a configured biosample.
        """
        super().__init__()
        cfg = load_config(config_path)
        self._cfg = cfg
        # Reject an unsupported aggregation before spending any API call.
        self._resolve_aggregator()

        self._dna_model = dna_client.create(cfg["api_key"])

        meta = self._dna_model.output_metadata(
            dna_client.Organism.HOMO_SAPIENS
        ).concatenate()

        ctx = cfg["context_len"]
        sl = cfg["seq_len"]
        self._context_len = ctx
        self._seq_len = sl
        # Row range of the centred seq_len window inside the padded context.
        self._start_idx = (ctx - sl) // 2
        self._end_idx = self._start_idx + sl

        # ── Resolve each (output_type, biosample) track ───────────────────────
        tracks_cfg = cfg["tracks"]
        all_terms: list[str] = []
        all_output_type_enums: list[OutputType] = []

        for track in tracks_cfg:
            ot_str = track["output_type"]
            bio = track["biosample_name"]
            ot_enum = OutputType[ot_str]
            matched = meta[
                (meta["output_type"] == ot_enum) &
                (meta["biosample_name"] == bio)
            ]
            if matched.empty:
                available = sorted(
                    meta[meta["output_type"] == ot_enum]
                    ["biosample_name"].dropna().unique()
                )[:15]
                raise ValueError(
                    f"biosample_name='{bio}' not found for output_type='{ot_str}'.\n"
                    f"Available (first 15): {available}\n"
                    f"Browse notebooks/ag_biosample_reference.csv to find valid names."
                )
            terms = matched["ontology_curie"].dropna().unique().tolist()
            all_terms.extend(terms)
            if ot_enum not in all_output_type_enums:
                all_output_type_enums.append(ot_enum)

        self._all_output_type_enums: list[OutputType] = all_output_type_enums
        self._all_terms: list[str] = list(dict.fromkeys(all_terms))  # dedup, keep order

        # Keep _ontology_terms as alias for backward compatibility
        self._ontology_terms = self._all_terms

        # ── Probe call: learn exact column indices for each desired track ─────
        # One call with all output types + all terms reveals which columns
        # belong to which biosample via TrackData.metadata.biosample_name.
        probe_out = self._dna_model.predict_sequence(
            sequence="N" * ctx,
            requested_outputs=self._all_output_type_enums,
            ontology_terms=self._all_terms,
        )

        # _extraction_plan: ordered list of (attr_name, col_indices_array)
        # Each entry corresponds to one desired (output_type, biosample) pair.
        self._extraction_plan: list[tuple[str, np.ndarray]] = []
        for track in tracks_cfg:
            ot_str = track["output_type"]
            bio = track["biosample_name"]
            attr = ot_str.lower()  # "DNASE" → "dnase", "RNA_SEQ" → "rna_seq"
            track_data = getattr(probe_out, attr)
            tmeta = track_data.metadata.reset_index(drop=True)
            col_idx = np.where(tmeta["biosample_name"] == bio)[0]
            if len(col_idx) == 0:
                raise ValueError(
                    f"Probe returned no columns for biosample='{bio}' in {ot_str}. "
                    f"Available in probe: {tmeta['biosample_name'].tolist()}"
                )
            self._extraction_plan.append((attr, col_idx))

        self._n_tracks: int = sum(len(idx) for _, idx in self._extraction_plan)

        # Sequence-level cache: raw seq_len-long string → list[float] (n_tracks,)
        self._cache: dict[str, list[float]] = {}

    # ── Public properties ─────────────────────────────────────────────────────

    @property
    def n_tracks(self) -> int:
        """Total number of output tracks across all configured (output_type, biosample) pairs."""
        return self._n_tracks

    @property
    def cache_size(self) -> int:
        """Number of distinct sequences currently held in the prediction cache."""
        return len(self._cache)

    def clear_cache(self) -> None:
        """Drop every cached prediction; later calls hit the API again."""
        self._cache.clear()

    # ── nn.Module interface ───────────────────────────────────────────────────

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x      : (N, 4, seq_len) one-hot tensor from compute_predictions
        returns: (N, n_tracks) float32 tensor

        Raises ValueError when the last dimension of ``x`` is not ``seq_len``:
        the window indices are derived from ``seq_len``, so any other length
        would be read from the wrong positions.
        """
        if x.shape[-1] != self._seq_len:
            raise ValueError(
                f"Expected sequences of length {self._seq_len}, got {x.shape[-1]}"
            )
        seqs = _tensor_to_seqs(x)
        seqs_padded = _pad_seqs(seqs, self._context_len, self._seq_len)
        scalars = self._predict_sequential(seqs, seqs_padded)
        # The explicit reshape keeps the (0, n_tracks) shape for an empty batch,
        # where torch.tensor([]) alone would be one-dimensional.
        return torch.tensor(scalars, dtype=torch.float32).reshape(
            len(seqs), self._n_tracks
        )

    def _resolve_aggregator(self):
        """Return the reducer named by the config's ``aggregation`` key.

        Raises:
            ValueError: The configured name is not one of ``_AGGREGATORS``.
        """
        name = self._cfg.get("aggregation", "sum")
        try:
            return self._AGGREGATORS[name]
        except (KeyError, TypeError):
            raise ValueError(
                f"Unsupported aggregation {name!r}; "
                f"expected one of {sorted(self._AGGREGATORS)}"
            ) from None

    def _predict_sequential(
        self, seqs: list[str], seqs_padded: list[str]
    ) -> list[list[float]]:
        """One API call per unique sequence; cache hits skip the API entirely.

        Args:
            seqs: Raw sequences, used as cache keys.
            seqs_padded: The same sequences padded to the context length; this
                is what is sent to the API.

        Returns:
            One list of ``n_tracks`` floats per input sequence, in input order.
        """
        reduce = self._resolve_aggregator()
        result = []
        for raw, padded in zip(seqs, seqs_padded):
            if raw not in self._cache:
                output = self._dna_model.predict_sequence(
                    sequence=padded,
                    requested_outputs=self._all_output_type_enums,
                    ontology_terms=self._all_terms,
                )
                parts = []
                for attr, col_idx in self._extraction_plan:
                    track_data = getattr(output, attr)
                    # Keep only the rows covering the unpadded sequence.
                    window = track_data.values[self._start_idx:self._end_idx, :]
                    parts.append(reduce(window[:, col_idx], axis=0))  # (n_cols_for_this_bio,)
                self._cache[raw] = np.concatenate(parts).tolist()
            result.append(self._cache[raw])
        return result
# The helpers under test live in the module this change adds,
# deepISA.modeling.alpha_genome_adapter; import them from there.

def test_tensor_to_seqs_n_positions():
    """A column with no base set (all zeros) decodes to 'N'."""
    from deepISA.modeling.alpha_genome_adapter import _tensor_to_seqs
    x = torch.zeros(1, 4, 4)  # all-zero → 'N'
    assert _tensor_to_seqs(x)[0] == "NNNN"


def test_pad_seqs_total_length():
    """Padding a 600 bp sequence yields exactly context_len characters."""
    from deepISA.modeling.alpha_genome_adapter import _pad_seqs
    padded = _pad_seqs(["ACGT" * 150], context_len=16384, seq_len=600)
    assert len(padded[0]) == 16384


def test_pad_seqs_centre_preserved():
    """The original sequence sits unchanged at the centre of the padded string."""
    from deepISA.modeling.alpha_genome_adapter import _pad_seqs
    seq = "ACGT" * 150
    padded = _pad_seqs([seq], context_len=16384, seq_len=600)[0]
    pad_left = (16384 - 600) // 2
    assert padded[pad_left: pad_left + 600] == seq


def test_pad_seqs_flanks_are_n():
    """Everything outside the central window is 'N'."""
    from deepISA.modeling.alpha_genome_adapter import _pad_seqs
    padded = _pad_seqs(["A" * 600], context_len=16384, seq_len=600)[0]
    pad_left = (16384 - 600) // 2
    assert set(padded[:pad_left]) == {"N"}
    assert set(padded[pad_left + 600:]) == {"N"}
setattr(out, output_attr, _fake_track_output(16384, 1, value, biosample)) + return out + + +def _make_adapter(tmp_path, biosample="GM12878", output_type="DNASE", mock_dc=None): + cfg = {"api_key": "k", "output_type": output_type, "biosample_name": biosample, + "context_len": 16384, "seq_len": 600, "aggregation": "sum"} + (tmp_path / "cfg.yaml").write_text(yaml.dump(cfg)) + mock_dc.create.return_value.output_metadata.return_value.concatenate.return_value = ( + _fake_metadata(biosample, output_type)) + from deepisa_ag.adapter import AlphaGenomeAdapter + return AlphaGenomeAdapter(str(tmp_path / "cfg.yaml")) + + +def test_adapter_forward_returns_n_by_n_tracks(tmp_path): + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.predict_sequence.return_value = ( + _fake_predict_output(1.0)) + adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + + from deepISA.utils import one_hot_encode + x = torch.from_numpy(one_hot_encode(["ACGT" * 150])) # (1, 4, 600) + out = adapter(x) + + assert out.shape == (1, 1) # 1 seq × 1 track (mock has 1 track) + assert out.dtype == torch.float32 + + +def test_adapter_col0_equals_signal_sum(tmp_path): + """col 0 = sum of central 600 bp × 1 track × signal_value.""" + signal_value = 0.5 + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.predict_sequence.return_value = ( + _fake_predict_output(signal_value)) + adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + + from deepISA.utils import one_hot_encode + x = torch.from_numpy(one_hot_encode(["ACGT" * 150])) + out = adapter(x) + + expected = signal_value * 600 * 1 # sum over 600 positions × 1 track + assert float(out[0, 0]) == pytest.approx(expected) + + +def test_adapter_cache_deduplicates_api_calls(tmp_path): + """Identical sequences must produce only one API call, not two.""" + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.predict_sequence.return_value = ( + _fake_predict_output(1.0)) + 
adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + + calls_after_init = mock_dc.create.return_value.predict_sequence.call_count + + from deepISA.utils import one_hot_encode + x = torch.from_numpy(one_hot_encode(["ACGT" * 150])) + + adapter(x) # first call → API hit, stored in cache + adapter(x) # second call → cache hit, no API call + + assert mock_dc.create.return_value.predict_sequence.call_count == calls_after_init + 1 + assert adapter.cache_size == 1 + + +def test_adapter_clear_cache(tmp_path): + """clear_cache() resets the cache so the next call hits the API again.""" + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.predict_sequence.return_value = ( + _fake_predict_output(1.0)) + adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + + calls_after_init = mock_dc.create.return_value.predict_sequence.call_count + + from deepISA.utils import one_hot_encode + x = torch.from_numpy(one_hot_encode(["ACGT" * 150])) + + adapter(x) + assert adapter.cache_size == 1 + adapter.clear_cache() + assert adapter.cache_size == 0 + adapter(x) # cache was cleared → one more API call + assert mock_dc.create.return_value.predict_sequence.call_count == calls_after_init + 2 + + +def test_adapter_bad_biosample_raises(tmp_path): + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.output_metadata.return_value.concatenate.return_value = ( + _fake_metadata("GM12878", "DNASE")) + cfg = {"api_key": "k", "output_type": "DNASE", "biosample_name": "NonExistent", + "context_len": 16384, "seq_len": 600, "aggregation": "sum"} + (tmp_path / "cfg.yaml").write_text(yaml.dump(cfg)) + + from deepisa_ag import AlphaGenomeAdapter + with pytest.raises(ValueError, match="not found"): + AlphaGenomeAdapter(str(tmp_path / "cfg.yaml")) + + +# ── Task 4: Full-chain integration test ────────────────────────────────────── + +def test_full_chain_compute_predictions(tmp_path): + """adapter works as model arg in deepISA's compute_predictions 
— zero ISA code changes.""" + from deepISA.modeling.predict import compute_predictions + + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.output_metadata.return_value.concatenate.return_value = ( + _fake_metadata("GM12878", "DNASE")) + mock_dc.create.return_value.predict_sequence.side_effect = [ + _fake_predict_output(1.0), # probe in __init__ + _fake_predict_output(2.0), # seq 1 original + _fake_predict_output(1.0), # seq 1 ablated + ] + adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + + device = torch.device("cpu") + seqs_orig = ["ACGT" * 150] + seqs_ablat = ["NNNN" * 150] + + preds_orig = compute_predictions(adapter, seqs_orig, device, batch_size=1) + preds_ablat = compute_predictions(adapter, seqs_ablat, device, batch_size=1) + + isa = preds_orig[:, 0] - preds_ablat[:, 0] + assert preds_orig.shape == (1, 1) + assert preds_ablat.shape == (1, 1) + assert float(isa[0]) == pytest.approx(2.0 * 600 - 1.0 * 600) # 600.0 + + +# ── Task 5: multi-track config ──────────────────────────────────────────────── + +def _fake_metadata_multi(pairs: list) -> pd.DataFrame: + """pairs = [(biosample, output_type_str), ...]""" + return pd.DataFrame({ + "biosample_name": [b for b, _ in pairs], + "output_type": [OutputType[ot] for _, ot in pairs], + "ontology_curie": [f"CL:{i:07d}" for i in range(len(pairs))], + }) + + +def test_multi_track_config_new_format(tmp_path): + """tracks: list config → correct n_tracks and output shape.""" + biosample_a, biosample_b = "GM12878", "K562" + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.output_metadata.return_value.concatenate.return_value = ( + _fake_metadata_multi([ + (biosample_a, "DNASE"), + (biosample_b, "ATAC"), + ]) + ) + # probe + forward calls: each returns dnase(1 col for A) + atac(1 col for B) + def make_output(): + out = MagicMock() + out.dnase = _fake_track_output(16384, 1, 1.0, biosample_a) + out.atac = _fake_track_output(16384, 1, 2.0, biosample_b) 
+ return out + mock_dc.create.return_value.predict_sequence.return_value = make_output() + + cfg = {"api_key": "k", + "tracks": [{"output_type": "DNASE", "biosample_name": biosample_a}, + {"output_type": "ATAC", "biosample_name": biosample_b}], + "context_len": 16384, "seq_len": 600} + (tmp_path / "cfg.yaml").write_text(yaml.dump(cfg)) + from deepisa_ag.adapter import AlphaGenomeAdapter + adapter = AlphaGenomeAdapter(str(tmp_path / "cfg.yaml")) + + assert adapter.n_tracks == 2 + + from deepISA.utils import one_hot_encode + x = torch.from_numpy(one_hot_encode(["ACGT" * 150])) + out = adapter(x) + assert out.shape == (1, 2) + # col 0 = DNASE signal (1.0 × 600), col 1 = ATAC signal (2.0 × 600) + assert float(out[0, 0]) == pytest.approx(600.0) + assert float(out[0, 1]) == pytest.approx(1200.0) + + +def test_single_track_old_format_still_works(tmp_path): + """Old output_type / biosample_name keys still accepted (backward compat).""" + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.predict_sequence.return_value = ( + _fake_predict_output(1.0)) + adapter = _make_adapter(tmp_path, mock_dc=mock_dc) + assert adapter.n_tracks == 1 diff --git a/tests/test_attr_filter_pipeline.py b/tests/test_attr_filter_pipeline.py new file mode 100644 index 0000000..b6614ab --- /dev/null +++ b/tests/test_attr_filter_pipeline.py @@ -0,0 +1,100 @@ +import sys, os, pytest, yaml, torch +import pandas as pd +import numpy as np +from unittest.mock import patch, MagicMock + +sys.path.insert(0, "deepISA/src") + + +def _make_filtered_motif_csv(tmp_path): + """Minimal motif CSV matching attr_filter output: 3 motifs, 1 region.""" + df = pd.DataFrame({ + "chrom": ["chr1", "chr1", "chr1"], + "start": [1010, 1030, 1050], + "end": [1025, 1045, 1065], + "tf": ["NFKB1","SP1","IRF1"], + "score": [900, 850, 800], + "strand": ["+", "+", "-"], + "region": ["chr1:1000-1600"] * 3, + "start_rel": [10, 30, 50], + "end_rel": [25, 45, 65], + "second_max_t0": [0.9, 0.7, 0.85], + 
"pass_threshold_t0": [1, 1, 1], + }) + p = tmp_path / "motif_filtered.csv" + df.to_csv(p, index=False) + return str(p) + + +def _make_fasta(tmp_path): + """Write a minimal FASTA for chr1 (2000 bp of A) with index.""" + fa = tmp_path / "mini.fa" + seq = "A" * 2000 + fa.write_text(f">chr1\n{seq}\n") + fai = tmp_path / "mini.fa.fai" + fai.write_text(f"chr1\t2000\t6\t2000\t2001\n") + return str(fa) + + +def test_two_stage_pipeline_isa_cols(tmp_path): + """ + Verifies that run_single_isa preserves the pass_threshold_t0 column from + a pre-filtered motif CSV (as attr_filter would produce) and writes correct + isa_t0 values. + """ + filtered_path = _make_filtered_motif_csv(tmp_path) + fasta_path = _make_fasta(tmp_path) + out_path = str(tmp_path / "isa_out.csv") + + from alphagenome.models.dna_output import OutputType + + cfg = {"api_key": "k", "output_type": "DNASE", + "biosample_name": "GM12878", "context_len": 16384, + "seq_len": 600, "aggregation": "sum"} + (tmp_path / "cfg.yaml").write_text(yaml.dump(cfg)) + + fake_meta = pd.DataFrame({ + "biosample_name": ["GM12878"], + "output_type": [OutputType["DNASE"]], + "ontology_curie": ["EFO:0002784"], + }) + + def _fake_output(val): + out = MagicMock() + track = MagicMock() + track.values = np.full((16384, 1), val, dtype=np.float32) + # metadata must be real DataFrame so probe col-index extraction works + track.metadata = pd.DataFrame({"biosample_name": ["GM12878"]}) + out.dnase = track + return out + + with patch("deepisa_ag.adapter.dna_client") as mock_dc: + mock_dc.create.return_value.output_metadata.return_value.concatenate.return_value = fake_meta + # 1 probe (__init__) + 1 orig + 3 ablated = 5 calls; extra entries are unused + mock_dc.create.return_value.predict_sequence.side_effect = [ + _fake_output(1.0), # probe in __init__ + _fake_output(1.0), _fake_output(0.5), + _fake_output(1.0), _fake_output(0.5), + _fake_output(1.0), _fake_output(0.5), + ] + from deepisa_ag import AlphaGenomeAdapter + adapter = 
AlphaGenomeAdapter(str(tmp_path / "cfg.yaml")) + + from deepISA.scoring.single_isa import run_single_isa + run_single_isa( + model = adapter, + fasta_path = fasta_path, + motif_locs_path = filtered_path, + outpath = out_path, + device = torch.device("cpu"), + tracks = [0], + num_regions_per_batch = 10, + pred_batch_size = 1, + ) + + result = pd.read_csv(out_path) + assert "isa_t0" in result.columns + assert "pass_threshold_t0" in result.columns # pass-through from filter + assert len(result) == 3 + # orig sum = 1.0 * 600 = 600, mut sum = 0.5 * 600 = 300 → isa = 300 + assert float(result["isa_t0"].iloc[0]) == pytest.approx(300.0, rel=1e-3)