Skip to content

Commit 3c40f8c

Browse files
IanShi1996, phil-fradkin, and TaykhoomDalal
authored
V1.2.1: Adding datasets and models. Switched license to AGPL. (#19)
Non-API-breaking update.

Added additional datasets:
- GO Cellular Component
- GO Biological Process
- mRNA localization (Fazal et al.)
- PERSIST-Seq paired MRL and HL dataset (Leppek et al.)

Model changes:
- Added NaiveMamba model (unpretrained Mamba)
- Added context window to HyenaDNA
- Updated datasets to use Morrislab HF links

Temporarily removed essentiality datasets due to data quality issues.
Updated license to AGPL for compatibility with multimolecule.
Removed AIDO.RNA due to license compatibility issues.

---------

Co-authored-by: Fradkin <fradkinp@islogin01.mskcc.org>
Co-authored-by: Taykhoom Dalal <tid4007@med.cornell.edu>
1 parent 396f3da commit 3c40f8c

26 files changed

Lines changed: 1196 additions & 653 deletions

LICENSE

Lines changed: 661 additions & 21 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -36,18 +36,16 @@ I had to manually move the checkpoint into its corresponding snapshot directory.
3636
/hub/models--arcinstitute-evo2_40b*/snapshots/snapshot_name/
3737

3838
```bash
39-
conda create --name evo_bench python=3.11
39+
conda create --name evo_bench -c conda-forge python=3.11 gxx=12.2.0 -y
4040
conda activate evo_bench
4141

42-
conda install conda-forge::gcc # need updated gcc version
42+
pip install torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
43+
pip install vtx==1.0.4
44+
pip install evo2==0.2.0
45+
pip install flash-attn==2.7.4.post1
4346

4447
cd path/to/mRNA/bench
4548
pip install -e .
46-
47-
git clone --recurse-submodules git@github.com:ArcInstitute/evo2.git
48-
cd path/to/evo2
49-
pip install .
50-
pip install transformer_engine[pytorch]==1.13
5149
```
5250

5351
### Post-install

experiments/overall_df.parquet

5.34 MB
Binary file not shown.

mrna_bench/datasets/dataset_catalog.py

Lines changed: 38 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1,26 +1,14 @@
11
from collections.abc import Callable
22

33
from .benchmark_dataset import BenchmarkDataset
4+
from .go_bio_proc import GOBiologicalProcess
5+
from .go_cell_comp import GOCellularComponent
46
from .go_mol_func import GOMolecularFunction
5-
from .pcg_essentiality import (
6-
PCGEssHAP1,
7-
PCGEssHEK293FT,
8-
PCGEssK562,
9-
PCGEssMDA_MB_231,
10-
PCGEssTHP1,
11-
PCGEssShared
12-
)
13-
from .lncrna_essentiality import (
14-
LNCRNAEssHAP1,
15-
LNCRNAEssHEK293FT,
16-
LNCRNAEssK562,
17-
LNCRNAEssMDA_MB_231,
18-
LNCRNAEssTHP1,
19-
LNCRNAEssShared
20-
)
217
from .rna_hl_human import RNAHalfLifeHuman
228
from .rna_hl_mouse import RNAHalfLifeMouse
9+
from .rna_loc_fazal import RNALocalizationFazal
2310
from .rna_loc_ietswaart import RNALocalizationIetswaart
11+
from .mrl_hl_lbkwk import MRLHLLBKWK
2412
from .prot_loc import ProteinLocalization
2513
from .mrl_sugimoto import MRLSugimoto
2614
from .mrl_sample import (
@@ -41,23 +29,15 @@
4129
DATASET_CATALOG: dict[str, Callable[..., BenchmarkDataset]] = {
4230
"eclip-binding-k562": eCLIPBindingK562,
4331
"eclip-binding-hepg2": eCLIPBindingHepG2,
32+
"go-bp": GOBiologicalProcess,
33+
"go-cc": GOCellularComponent,
4434
"go-mf": GOMolecularFunction,
45-
"pcg-ess-hap1": PCGEssHAP1,
46-
"pcg-ess-hek293ft": PCGEssHEK293FT,
47-
"pcg-ess-k562": PCGEssK562,
48-
"pcg-ess-mda-mb-231": PCGEssMDA_MB_231,
49-
"pcg-ess-thp1": PCGEssTHP1,
50-
"pcg-ess-shared": PCGEssShared,
51-
"lncrna-ess-hap1": LNCRNAEssHAP1,
52-
"lncrna-ess-hek293ft": LNCRNAEssHEK293FT,
53-
"lncrna-ess-k562": LNCRNAEssK562,
54-
"lncrna-ess-mda-mb-231": LNCRNAEssMDA_MB_231,
55-
"lncrna-ess-thp1": LNCRNAEssTHP1,
56-
"lncrna-ess-shared": LNCRNAEssShared,
5735
"rnahl-human": RNAHalfLifeHuman,
5836
"rnahl-mouse": RNAHalfLifeMouse,
37+
"rna-loc-fazal": RNALocalizationFazal,
5938
"rna-loc-ietswaart": RNALocalizationIetswaart,
6039
"prot-loc": ProteinLocalization,
40+
"mrl-hl-lbkwk": MRLHLLBKWK,
6141
"mrl-sugimoto": MRLSugimoto,
6242
"mrl-sample-egfp": MRLSampleEGFP,
6343
"mrl-sample-mcherry": MRLSampleMCherry,
@@ -80,12 +60,36 @@
8060
"target_col": eCLIP_HepG2_TOP_RBPS_LIST,
8161
"split_type": "homology",
8262
},
63+
"go-bp": {
64+
"dataset": "go-bp",
65+
"task": "multilabel",
66+
"target_col": "target",
67+
"split_type": "homology",
68+
},
69+
"go-cc": {
70+
"dataset": "go-cc",
71+
"task": "multilabel",
72+
"target_col": "target",
73+
"split_type": "homology",
74+
},
8375
"go-mf": {
8476
"dataset": "go-mf",
8577
"task": "multilabel",
8678
"target_col": "target",
8779
"split_type": "homology",
8880
},
81+
"mrl-hl-lbkwk-hl": {
82+
"dataset": "mrl-hl-lbkwk",
83+
"task": "reg_ridge",
84+
"target_col": "target_in_cell_half_life",
85+
"split_type": "default",
86+
},
87+
"mrl-hl-lbkwk-mrl": {
88+
"dataset": "mrl-hl-lbkwk",
89+
"task": "reg_ridge",
90+
"target_col": "target_ribosome_load",
91+
"split_type": "default",
92+
},
8993
"mrl-sugimoto": {
9094
"dataset": "mrl-sugimoto",
9195
"task": "reg_ridge",
@@ -146,6 +150,12 @@
146150
"target_col": "target",
147151
"split_type": "homology",
148152
},
153+
"rna-loc-fazal": {
154+
"dataset": "rna-loc-fazal",
155+
"task": "multilabel",
156+
"target_col": "target",
157+
"split_type": "homology",
158+
},
149159
"rna-loc-ietswaart": {
150160
"dataset": "rna-loc-ietswaart",
151161
"task": "multilabel",
@@ -165,24 +175,3 @@
165175
"split_type": "homology",
166176
},
167177
}
168-
169-
for ttype in ["pcg", "lncrna"]:
170-
split_type = "homology" if ttype == "pcg" else "default"
171-
for cell in ["hap1", "hek293ft", "k562", "mda-mb-231", "thp1", "shared"]:
172-
173-
cell_upper = cell.upper()
174-
175-
DATASET_INFO[f"{ttype}-ess-{cell}"] = {
176-
"dataset": f"{ttype}-ess-{cell}",
177-
"task": "classification",
178-
"target_col": f"target_essential_{cell_upper}",
179-
"split_type": split_type,
180-
}
181-
182-
if cell != "shared":
183-
DATASET_INFO[f"{ttype}-ess-{cell}-day14-log2fc"] = {
184-
"dataset": f"{ttype}-ess-{cell}",
185-
"task": "reg_ridge",
186-
"target_col": f"target_day14_log2fc_{cell_upper}",
187-
"split_type": split_type,
188-
}

mrna_bench/datasets/eclip_binding.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ def __init__(self, force_redownload=False):
104104
"eclip-binding-k562",
105105
force_redownload,
106106
hf_url=(
107-
"https://huggingface.co/datasets/quietflamingo/"
107+
"https://huggingface.co/datasets/morrislab/"
108108
"eclip/resolve/main/eclip-k562.parquet"
109109
)
110110
)
@@ -125,7 +125,7 @@ def __init__(self, force_redownload=False):
125125
"eclip-binding-hepg2",
126126
force_redownload,
127127
hf_url=(
128-
"https://huggingface.co/datasets/quietflamingo/"
128+
"https://huggingface.co/datasets/morrislab/"
129129
"eclip/resolve/main/eclip-hepg2.parquet"
130130
)
131131
)

mrna_bench/datasets/go_bio_proc.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
import pandas as pd
2+
3+
from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
4+
5+
6+
class GOBiologicalProcess(BenchmarkDataset):
7+
"""GO Biological Process Dataset."""
8+
9+
def __init__(self, force_redownload: bool = False):
10+
"""Initialize GO Biological Process dataset.
11+
12+
Args:
13+
force_redownload: Force raw data download even if pre-existing.
14+
"""
15+
super().__init__(
16+
dataset_name="go-bp",
17+
species="human",
18+
force_redownload=force_redownload,
19+
hf_url=(
20+
"https://huggingface.co/datasets/morrislab/"
21+
"go-bp/resolve/main/go_dna_dataset_bp.parquet"
22+
)
23+
)
24+
25+
def _get_data_from_raw(self) -> pd.DataFrame:
26+
raise NotImplementedError(
27+
"Code documenting GO Biological Process data is still in progress."
28+
)
mrna_bench/datasets/go_cell_comp.py

Lines changed: 28 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,28 @@
1+
import pandas as pd
2+
3+
from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
4+
5+
6+
class GOCellularComponent(BenchmarkDataset):
7+
"""GO Cellular Component Dataset."""
8+
9+
def __init__(self, force_redownload: bool = False):
10+
"""Initialize GO Cellular Component dataset.
11+
12+
Args:
13+
force_redownload: Force raw data download even if pre-existing.
14+
"""
15+
super().__init__(
16+
dataset_name="go-cc",
17+
species="human",
18+
force_redownload=force_redownload,
19+
hf_url=(
20+
"https://huggingface.co/datasets/morrislab/"
21+
"go-cc/resolve/main/go_dna_dataset_cc.parquet"
22+
)
23+
)
24+
25+
def _get_data_from_raw(self) -> pd.DataFrame:
26+
raise NotImplementedError(
27+
"Code documenting GO Cellular Component data is still in progress."
28+
)

mrna_bench/datasets/go_mol_func.py

Lines changed: 5 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,6 @@
1-
import numpy as np
21
import pandas as pd
32

43
from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
5-
from mrna_bench.datasets.dataset_utils import ohe_to_str
6-
from mrna_bench.utils import download_file
7-
8-
9-
GOMF_URL = "https://zenodo.org/records/14708163/files/go_dna_dataset.npz"
104

115

126
class GOMolecularFunction(BenchmarkDataset):
@@ -23,48 +17,12 @@ def __init__(self, force_redownload: bool = False):
2317
species="human",
2418
force_redownload=force_redownload,
2519
hf_url=(
26-
"https://huggingface.co/datasets/quietflamingo/"
27-
"go-mf/resolve/main/go-mf.parquet"
20+
"https://huggingface.co/datasets/morrislab/"
21+
"go-mf/resolve/main/go_dna_dataset_mf.parquet"
2822
)
2923
)
3024

3125
def _get_data_from_raw(self) -> pd.DataFrame:
32-
"""Process raw data into Pandas dataframe.
33-
34-
Returns:
35-
Pandas dataframe of processed sequences.
36-
"""
37-
try:
38-
import genome_kit as gk
39-
hg_genes = gk.Genome("gencode.v41").genes
40-
except ImportError:
41-
print("GenomeKit is required for raw processing. See README.")
42-
raise
43-
44-
print("Downloading raw data...")
45-
self.raw_data_path = download_file(GOMF_URL, self.raw_data_dir)
46-
data = np.load(self.raw_data_path)
47-
X = data["X"]
48-
49-
print("Processing raw data...")
50-
seq_str = ohe_to_str(X[:, :, :4])
51-
lens = [len(s) for s in seq_str]
52-
cds = [X[i, :lens[i], 4] for i in range(len(X))]
53-
splice = [X[i, :lens[i], 5] for i in range(len(X))]
54-
55-
chrs = []
56-
for gene in data["genes"]:
57-
transcript_chr = hg_genes.first_by_name(gene).chromosome
58-
transcript_chr = transcript_chr.replace("chr", "")
59-
chrs.append(transcript_chr)
60-
61-
df = pd.DataFrame({
62-
"gene": data["genes"],
63-
"chromosome": chrs,
64-
"sequence": seq_str,
65-
"cds": cds,
66-
"splice": splice,
67-
"target": [y for y in data["y"]]
68-
})
69-
70-
return df
26+
raise NotImplementedError(
27+
"Code documenting GO Molecular Function data is still in progress."
28+
)

0 commit comments

Comments
 (0)