morrislab
diff --git a/LICENSE b/LICENSE
Lines changed: 661 additions & 21 deletions
diff --git a/README.md b/README.md
Lines changed: 5 additions & 7 deletions
diff --git a/experiments/overall_df.parquet b/experiments/overall_df.parquet
5.34 MB
diff --git a/mrna_bench/datasets/dataset_catalog.py b/mrna_bench/datasets/dataset_catalog.py
Lines changed: 38 additions & 49 deletions
diff --git a/mrna_bench/datasets/eclip_binding.py b/mrna_bench/datasets/eclip_binding.py
Lines changed: 2 additions & 2 deletions
diff --git a/mrna_bench/datasets/go_bio_proc.py b/mrna_bench/datasets/go_bio_proc.py
Lines changed: 28 additions & 0 deletions
diff --git a/mrna_bench/datasets/go_cell_comp.py b/mrna_bench/datasets/go_cell_comp.py
Lines changed: 28 additions & 0 deletions
diff --git a/mrna_bench/datasets/go_mol_func.py b/mrna_bench/datasets/go_mol_func.py
Lines changed: 5 additions & 47 deletions
@@ -36,18 +36,16 @@ I had to manually move the checkpoint into its corresponding snapshot directory.
 /hub/models--arcinstitute-evo2_40b*/snapshots/snapshot_name/
 
 ```bash
-conda create --name evo_bench python=3.11
+conda create --name evo_bench -c conda-forge python=3.11 gxx=12.2.0 -y
 conda activate evo_bench
 
-conda install conda-forge::gcc # need updated gcc version
+pip install torch==2.6.0+cu124 --index-url https://download.pytorch.org/whl/cu124
+pip install vtx==1.0.4
+pip install evo2==0.2.0
+pip install flash-attn==2.7.4.post1
 
 cd path/to/mRNA/bench
 pip install -e .
-
-git clone --recurse-submodules git@github.com:ArcInstitute/evo2.git
-cd path/to/evo2
-pip install .
-pip install transformer_engine[pytorch]==1.13
 ```
 
 ### Post-install
 
@@ -1,26 +1,14 @@
 from collections.abc import Callable
 
 from .benchmark_dataset import BenchmarkDataset
+from .go_bio_proc import GOBiologicalProcess
+from .go_cell_comp import GOCellularComponent
 from .go_mol_func import GOMolecularFunction
-from .pcg_essentiality import (
-    PCGEssHAP1,
-    PCGEssHEK293FT,
-    PCGEssK562,
-    PCGEssMDA_MB_231,
-    PCGEssTHP1,
-    PCGEssShared
-)
-from .lncrna_essentiality import (
-    LNCRNAEssHAP1,
-    LNCRNAEssHEK293FT,
-    LNCRNAEssK562,
-    LNCRNAEssMDA_MB_231,
-    LNCRNAEssTHP1,
-    LNCRNAEssShared
-)
 from .rna_hl_human import RNAHalfLifeHuman
 from .rna_hl_mouse import RNAHalfLifeMouse
+from .rna_loc_fazal import RNALocalizationFazal
 from .rna_loc_ietswaart import RNALocalizationIetswaart
+from .mrl_hl_lbkwk import MRLHLLBKWK
 from .prot_loc import ProteinLocalization
 from .mrl_sugimoto import MRLSugimoto
 from .mrl_sample import (
@@ -41,23 +29,15 @@
 DATASET_CATALOG: dict[str, Callable[..., BenchmarkDataset]] = {
     "eclip-binding-k562": eCLIPBindingK562,
     "eclip-binding-hepg2": eCLIPBindingHepG2,
+    "go-bp": GOBiologicalProcess,
+    "go-cc": GOCellularComponent,
     "go-mf": GOMolecularFunction,
-    "pcg-ess-hap1": PCGEssHAP1,
-    "pcg-ess-hek293ft": PCGEssHEK293FT,
-    "pcg-ess-k562": PCGEssK562,
-    "pcg-ess-mda-mb-231": PCGEssMDA_MB_231,
-    "pcg-ess-thp1": PCGEssTHP1,
-    "pcg-ess-shared": PCGEssShared,
-    "lncrna-ess-hap1": LNCRNAEssHAP1,
-    "lncrna-ess-hek293ft": LNCRNAEssHEK293FT,
-    "lncrna-ess-k562": LNCRNAEssK562,
-    "lncrna-ess-mda-mb-231": LNCRNAEssMDA_MB_231,
-    "lncrna-ess-thp1": LNCRNAEssTHP1,
-    "lncrna-ess-shared": LNCRNAEssShared,
     "rnahl-human": RNAHalfLifeHuman,
     "rnahl-mouse": RNAHalfLifeMouse,
+    "rna-loc-fazal": RNALocalizationFazal,
     "rna-loc-ietswaart": RNALocalizationIetswaart,
     "prot-loc": ProteinLocalization,
+    "mrl-hl-lbkwk": MRLHLLBKWK,
     "mrl-sugimoto": MRLSugimoto,
     "mrl-sample-egfp": MRLSampleEGFP,
     "mrl-sample-mcherry": MRLSampleMCherry,
@@ -80,12 +60,36 @@
         "target_col": eCLIP_HepG2_TOP_RBPS_LIST,
         "split_type": "homology",
     },
+    "go-bp": {
+        "dataset": "go-bp",
+        "task": "multilabel",
+        "target_col": "target",
+        "split_type": "homology",
+    },
+    "go-cc": {
+        "dataset": "go-cc",
+        "task": "multilabel",
+        "target_col": "target",
+        "split_type": "homology",
+    },
     "go-mf": {
         "dataset": "go-mf",
         "task": "multilabel",
         "target_col": "target",
         "split_type": "homology",
     },
+    "mrl-hl-lbkwk-hl": {
+        "dataset": "mrl-hl-lbkwk",
+        "task": "reg_ridge",
+        "target_col": "target_in_cell_half_life",
+        "split_type": "default",
+    },
+    "mrl-hl-lbkwk-mrl": {
+        "dataset": "mrl-hl-lbkwk",
+        "task": "reg_ridge",
+        "target_col": "target_ribosome_load",
+        "split_type": "default",
+    },
     "mrl-sugimoto": {
         "dataset": "mrl-sugimoto",
         "task": "reg_ridge",
@@ -146,6 +150,12 @@
         "target_col": "target",
         "split_type": "homology",
     },
+    "rna-loc-fazal": {
+        "dataset": "rna-loc-fazal",
+        "task": "multilabel",
+        "target_col": "target",
+        "split_type": "homology",
+    },
     "rna-loc-ietswaart": {
         "dataset": "rna-loc-ietswaart",
         "task": "multilabel",
@@ -165,24 +175,3 @@
         "split_type": "homology",
     },
 }
-
-for ttype in ["pcg", "lncrna"]:
-    split_type = "homology" if ttype == "pcg" else "default"
-    for cell in ["hap1", "hek293ft", "k562", "mda-mb-231", "thp1", "shared"]:
-
-        cell_upper = cell.upper()
-
-        DATASET_INFO[f"{ttype}-ess-{cell}"] = {
-            "dataset": f"{ttype}-ess-{cell}",
-            "task": "classification",
-            "target_col": f"target_essential_{cell_upper}",
-            "split_type": split_type,
-        }
-
-        if cell != "shared":
-            DATASET_INFO[f"{ttype}-ess-{cell}-day14-log2fc"] = {
-                "dataset": f"{ttype}-ess-{cell}",
-                "task": "reg_ridge",
-                "target_col": f"target_day14_log2fc_{cell_upper}",
-                "split_type": split_type,
-            }
@@ -104,7 +104,7 @@ def __init__(self, force_redownload=False):
             "eclip-binding-k562",
             force_redownload,
             hf_url=(
-                "https://huggingface.co/datasets/quietflamingo/"
+                "https://huggingface.co/datasets/morrislab/"
                 "eclip/resolve/main/eclip-k562.parquet"
             )
         )
@@ -125,7 +125,7 @@ def __init__(self, force_redownload=False):
             "eclip-binding-hepg2",
             force_redownload,
             hf_url=(
-                "https://huggingface.co/datasets/quietflamingo/"
+                "https://huggingface.co/datasets/morrislab/"
                 "eclip/resolve/main/eclip-hepg2.parquet"
             )
         )
@@ -0,0 +1,28 @@
+import pandas as pd
+
+from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
+
+
+class GOBiologicalProcess(BenchmarkDataset):
+    """GO Biological Process Dataset."""
+
+    def __init__(self, force_redownload: bool = False):
+        """Initialize GO Biological Process dataset.
+
+        Args:
+            force_redownload: Force raw data download even if pre-existing.
+        """
+        super().__init__(
+            dataset_name="go-bp",
+            species="human",
+            force_redownload=force_redownload,
+            hf_url=(
+                "https://huggingface.co/datasets/morrislab/"
+                "go-bp/resolve/main/go_dna_dataset_bp.parquet"
+            )
+        )
+
+    def _get_data_from_raw(self) -> pd.DataFrame:
+        raise NotImplementedError(
+            "Code documenting GO Biological Process data is still in progress."
+        )
@@ -0,0 +1,28 @@
+import pandas as pd
+
+from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
+
+
+class GOCellularComponent(BenchmarkDataset):
+    """GO Cellular Component Dataset."""
+
+    def __init__(self, force_redownload: bool = False):
+        """Initialize GO Cellular Component dataset.
+
+        Args:
+            force_redownload: Force raw data download even if pre-existing.
+        """
+        super().__init__(
+            dataset_name="go-cc",
+            species="human",
+            force_redownload=force_redownload,
+            hf_url=(
+                "https://huggingface.co/datasets/morrislab/"
+                "go-cc/resolve/main/go_dna_dataset_cc.parquet"
+            )
+        )
+
+    def _get_data_from_raw(self) -> pd.DataFrame:
+        raise NotImplementedError(
+            "Code documenting GO Cellular Component data is still in progress."
+        )
@@ -1,12 +1,6 @@
-import numpy as np
 import pandas as pd
 
 from mrna_bench.datasets.benchmark_dataset import BenchmarkDataset
-from mrna_bench.datasets.dataset_utils import ohe_to_str
-from mrna_bench.utils import download_file
-
-
-GOMF_URL = "https://zenodo.org/records/14708163/files/go_dna_dataset.npz"
 
 
 class GOMolecularFunction(BenchmarkDataset):
@@ -23,48 +17,12 @@ def __init__(self, force_redownload: bool = False):
             species="human",
             force_redownload=force_redownload,
             hf_url=(
-                "https://huggingface.co/datasets/quietflamingo/"
-                "go-mf/resolve/main/go-mf.parquet"
+                "https://huggingface.co/datasets/morrislab/"
+                "go-mf/resolve/main/go_dna_dataset_mf.parquet"
             )
         )
 
     def _get_data_from_raw(self) -> pd.DataFrame:
-        """Process raw data into Pandas dataframe.
-
-        Returns:
-            Pandas dataframe of processed sequences.
-        """
-        try:
-            import genome_kit as gk
-            hg_genes = gk.Genome("gencode.v41").genes
-        except ImportError:
-            print("GenomeKit is required for raw processing. See README.")
-            raise
-
-        print("Downloading raw data...")
-        self.raw_data_path = download_file(GOMF_URL, self.raw_data_dir)
-        data = np.load(self.raw_data_path)
-        X = data["X"]
-
-        print("Processing raw data...")
-        seq_str = ohe_to_str(X[:, :, :4])
-        lens = [len(s) for s in seq_str]
-        cds = [X[i, :lens[i], 4] for i in range(len(X))]
-        splice = [X[i, :lens[i], 5] for i in range(len(X))]
-
-        chrs = []
-        for gene in data["genes"]:
-            transcript_chr = hg_genes.first_by_name(gene).chromosome
-            transcript_chr = transcript_chr.replace("chr", "")
-            chrs.append(transcript_chr)
-
-        df = pd.DataFrame({
-            "gene": data["genes"],
-            "chromosome": chrs,
-            "sequence": seq_str,
-            "cds": cds,
-            "splice": splice,
-            "target": [y for y in data["y"]]
-        })
-
-        return df
+        raise NotImplementedError(
+            "Code documenting GO Molecular Function data is still in progress."
+        )
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ def __init__(self, force_redownload=False):`
`104`	`104`	`"eclip-binding-k562",`
`105`	`105`	`force_redownload,`
`106`	`106`	`hf_url=(`
`107`		`- "https://huggingface.co/datasets/quietflamingo/"`
	`107`	`+ "https://huggingface.co/datasets/morrislab/"`
`108`	`108`	`"eclip/resolve/main/eclip-k562.parquet"`
`109`	`109`	`)`
`110`	`110`	`)`
`@@ -125,7 +125,7 @@ def __init__(self, force_redownload=False):`
`125`	`125`	`"eclip-binding-hepg2",`
`126`	`126`	`force_redownload,`
`127`	`127`	`hf_url=(`
`128`		`- "https://huggingface.co/datasets/quietflamingo/"`
	`128`	`+ "https://huggingface.co/datasets/morrislab/"`
`129`	`129`	`"eclip/resolve/main/eclip-hepg2.parquet"`
`130`	`130`	`)`
`131`	`131`	`)`