From a2e453536043e9cc888133546f5d4c26650129a4 Mon Sep 17 00:00:00 2001 From: Jeeva Ramasamy Date: Fri, 24 Apr 2026 18:53:00 -0400 Subject: [PATCH] Add robust installationGuide for OpenFold on PACE ICE Adds install.py and install.ipynb to automate OpenFold and VizFold-Foundation setup on PACE ICE (RHEL 9, CUDA 12.4, H100). Key features: - Pre-flight checks for GPU, conda, disk space, and AlphaFold data - PACE ICE-compatible environment.yml (PyTorch 2.5.1, CUDA 12.4) - libmamba solver for faster dependency resolution - System GCC 12.3.0 + CUDA_HOME=$CONDA_PREFIX to avoid compiler and CUDA version mismatches during OpenFold build - Safe re-runs: skips git clone and symlink creation if already done - Post-install verification of PyTorch+CUDA, OpenFold, viz tools - Updated README documenting both install methods and prerequisites --- environment.yml | 57 ++-- installationGuide/README.md | 72 +++++ installationGuide/install.ipynb | 496 ++++++++++++++++++++++++++++++++ installationGuide/install.py | 379 ++++++++++++++++++++++++ viz_attention_demo_base.ipynb | 94 ++++-- 5 files changed, 1044 insertions(+), 54 deletions(-) create mode 100644 installationGuide/README.md create mode 100644 installationGuide/install.ipynb create mode 100755 installationGuide/install.py diff --git a/environment.yml b/environment.yml index e02d1b4d..a0a987f8 100644 --- a/environment.yml +++ b/environment.yml @@ -1,40 +1,49 @@ -name: openfold-env +name: openfold_env channels: + - pytorch + - nvidia - conda-forge - bioconda - - pytorch - - nvidia dependencies: - - cuda - - gcc=12.4 - python=3.10 - - setuptools=59.5.0 - - pip - - openmm - - pdbfixer - - pytorch-lightning - - biopython + - pytorch=2.5.1 + - pytorch-cuda=12.4 + # Build tools — use conda cross-compiler; system GCC is used at build time + - gxx_linux-64 + - gcc_linux-64 + - libstdcxx-ng + - sysroot_linux-64=2.17 + - make + - ninja + # CUDA development headers and libraries + - nvidia::cuda-nvcc=12.4 + - nvidia::cuda-libraries-dev=12.4 
+ - nvidia::cuda-cudart-dev=12.4 + - nvidia::cuda-driver-dev=12.4 + # Core scientific stack - numpy - pandas - - PyYAML - - requests - scipy - tqdm + - pyyaml + - requests - typing-extensions + # ML / framework dependencies - wandb + - pytorch-lightning + # Structural biology tools + - openmm + - pdbfixer + - biopython - modelcif==0.7 + # Sequence search databases and tools + - bioconda::hmmer + - bioconda::hhsuite + - bioconda::kalign2 + # Utilities - awscli - ml-collections - aria2 - - mkl - git - - bioconda::hmmer - - bioconda::hhsuite - - bioconda::kalign2 - - pytorch::pytorch=2.5 - - pytorch::pytorch-cuda=12.4 - - pip: - - deepspeed==0.14.5 - - dm-tree==0.1.6 - - git+https://github.com/NVIDIA/dllogger.git - - flash-attn==2.6.3 + - pip + - packaging \ No newline at end of file diff --git a/installationGuide/README.md b/installationGuide/README.md new file mode 100644 index 00000000..1c60f938 --- /dev/null +++ b/installationGuide/README.md @@ -0,0 +1,72 @@ +# OpenFold and VizFold-Foundation Setup + +This guide explains how to install OpenFold and VizFold-Foundation on an HPC cluster (tested on Georgia Tech PACE ICE). + +## Prerequisites + +- Conda package manager available via `module load anaconda3` (or equivalent on your cluster) +- Access to the AlphaFold data directory: + - **PACE ICE**: `/storage/ice1/shared/d-pace_community/alphafold/alphafold_2.3.2_data` + - **Purdue Anvil**: `/anvil/datasets/alphafold/db` +- At least 50 GB of free disk space +- A compute node with GPU access (required for building CUDA extensions) + +## Installation + +Two equivalent options are provided — choose whichever fits your workflow. Both include automatic pre-flight checks and a verification step at the end. + +### Option A — Jupyter Notebook (`install.ipynb`) + +1. Open `install.ipynb` in JupyterLab or a compatible IDE. +2. Select a kernel: click **Kernel** (top right) → **Python Kernel** → choose the kernel starting with **base**. 
If it doesn't appear, click the refresh button. +3. In the **first code cell**, update the three directory variables to match your environment: + ```python + os.environ['ROOT_DIR'] = '~/scratch' # where repos will be cloned + os.environ['DATA_DIR'] = '/storage/...' # path to AlphaFold data + os.environ['CONDA_INSTALL_DIR'] = '~/scratch' # where the conda env is stored + ``` +4. Click **Run All**. The notebook will: + - Run **pre-flight checks** (GPU, conda, disk space, data directory) + - Clone VizFold-Foundation and OpenFold (skips if already cloned) + - Write a PACE ICE–compatible `environment.yml` + - Create the `openfold_env` environment (using the fast libmamba solver) + - Build OpenFold's CUDA extensions using the system GCC 12.3.0 + - Install third-party dependencies + - Set up the VizFold-Foundation directory structure + - Install visualization tools (matplotlib, PyMOL) + - **Verify** the installation (PyTorch + CUDA, OpenFold import, matplotlib, PyMOL, Flash Attention, and the `resources` paths) + +### Option B — Python Script (`install.py`) + +1. Transfer the script to your cluster if needed. +2. Load conda and start an interactive GPU job: + ```bash + module load anaconda3 + srun --partition=gpu-ice --gres=gpu:1 --mem=32G --time=2:00:00 --pty bash + ``` +3. Edit the three directory variables near the top of `install.py` to match your environment: + ```python + os.environ['ROOT_DIR'] = '~/scratch' + os.environ['DATA_DIR'] = '/storage/...' + os.environ['CONDA_INSTALL_DIR'] = '~/scratch' + ``` +4. Run the script: + ```bash + python install.py + ``` + The script prints timestamped progress headers for each step and exits immediately on any error, so failures are easy to locate. + +## Notes + +- Run the installation on a **compute node with a GPU**, not the login node — building OpenFold's CUDA extensions requires GPU access. +- If your conda environment is stored in a non-default location (i.e. 
`CONDA_INSTALL_DIR` is not `~`), activate it with: + ```bash + conda activate [CONDA_INSTALL_DIR]/.conda/envs/openfold_env + ``` + Otherwise: + ```bash + conda activate openfold_env + ``` +- To use this environment in another Jupyter notebook (e.g. `viz_attention_demo_base.ipynb`), select the `openfold_env` Jupyter kernel. +- The installer uses the **libmamba solver** (the `--solver=libmamba` flag on `conda env create` and `conda install`) for faster dependency resolution. The solver is included with recent conda versions and requires no additional installation. +- Installation time varies depending on your internet connection and cluster load; expect 30–90 minutes. diff --git a/installationGuide/install.ipynb b/installationGuide/install.ipynb new file mode 100644 index 00000000..0b03427a --- /dev/null +++ b/installationGuide/install.ipynb @@ -0,0 +1,496 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "12e85d50", + "metadata": {}, + "source": [ + "# OpenFold & VizFold-Foundation Installer\n", + "\n", + "This notebook installs OpenFold and VizFold-Foundation on a PACE ICE cluster.\n", + "\n", + "**Before running:** Update the three directory variables in the first code cell to match your environment." 
+ ] + }, + { + "cell_type": "markdown", + "id": "a0b1c2d3", + "metadata": {}, + "source": [ + "## Setting Up the Directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffe08cd7", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Root directory for cloned repositories and the conda environment\n", + "os.environ['ROOT_DIR'] = '~/scratch'\n", + "\n", + "# Path to AlphaFold data (pre-downloaded on PACE ICE)\n", + "os.environ['DATA_DIR'] = '/storage/ice1/shared/d-pace_community/alphafold/alphafold_2.3.2_data'\n", + "\n", + "# Directory where the conda environment will be stored\n", + "os.environ['CONDA_INSTALL_DIR'] = '~/scratch'" + ] + }, + { + "cell_type": "markdown", + "id": "8511e3a4", + "metadata": {}, + "source": [ + "## Expand `~` in the configured paths to absolute paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c8d5f79d", + "metadata": {}, + "outputs": [], + "source": [ + "# Resolve ~ to the absolute home directory path for use in bash cells\n", + "os.environ['ROOT_DIR'] = os.path.expanduser(os.environ['ROOT_DIR'])\n", + "os.environ['DATA_DIR'] = os.path.expanduser(os.environ['DATA_DIR'])\n", + "os.environ['CONDA_INSTALL_DIR'] = os.path.expanduser(os.environ['CONDA_INSTALL_DIR'])" + ] + }, + { + "cell_type": "markdown", + "id": "e1f2a3b4", + "metadata": {}, + "source": [ + "## Pre-flight Checks\n", + "\n", + "Verify that the environment is ready before starting the installation.\n", + "A GPU, conda, and the AlphaFold data directory are hard requirements. Disk space is a warning only — if a later step fails unexpectedly, low disk space may be the cause." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f5a6b7c8", + "metadata": {}, + "outputs": [], + "source": [ + "import shutil, subprocess\n", + "\n", + "errors = []\n", + "\n", + "# Check conda is available (hard requirement)\n", + "if shutil.which('conda') is None:\n", + " errors.append(\"conda is not available. 
Load a conda module first (e.g. 'module load anaconda3').\")\n", + "else:\n", + " print('[OK] conda found')\n", + "\n", + "# Check GPU is available via nvidia-smi (hard requirement)\n", + "gpu = subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total',\n", + " '--format=csv,noheader'],\n", + " capture_output=True, text=True)\n", + "if gpu.returncode != 0:\n", + " errors.append('No NVIDIA GPU detected. Run this on a compute node with GPU access.')\n", + "else:\n", + " for i, line in enumerate(gpu.stdout.strip().splitlines()):\n", + " print(f'[OK] GPU {i}: {line.strip()}')\n", + "\n", + "# Check disk space — warn only, do not abort\n", + "root_dir = os.environ['ROOT_DIR']\n", + "os.makedirs(root_dir, exist_ok=True)\n", + "disk = shutil.disk_usage(root_dir)\n", + "free_gb = disk.free / (1024 ** 3)\n", + "if free_gb < 50:\n", + " print(f'[WARN] Only {free_gb:.1f} GB free in {root_dir}. '\n", + " 'At least 50 GB recommended — proceeding anyway.')\n", + " print(' If a later step fails, low disk space may be the cause.')\n", + "else:\n", + " print(f'[OK] {free_gb:.0f} GB free in {root_dir}')\n", + "\n", + "# Check AlphaFold data directory exists (hard requirement)\n", + "data_dir = os.environ['DATA_DIR']\n", + "if not os.path.isdir(data_dir):\n", + " errors.append(f'AlphaFold data directory not found: {data_dir}')\n", + "else:\n", + " print(f'[OK] AlphaFold data found at {data_dir}')\n", + "\n", + "if errors:\n", + " print('\\n*** Pre-flight checks FAILED ***')\n", + " for e in errors:\n", + " print(f' ✗ {e}')\n", + " raise SystemExit('Fix the issues above before continuing.')\n", + "else:\n", + " print('\\nAll pre-flight checks passed.')" + ] + }, + { + "cell_type": "markdown", + "id": "6796467d", + "metadata": {}, + "source": [ + "## Clone the Vizfold-Foundation, and OpenFold repository." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09e8650b", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "# Clone VizFold-Foundation and OpenFold into the root directory (skip if already cloned)\n", + "if [ ! -d \"$ROOT_DIR/vizfold-foundation/.git\" ]; then\n", + " git clone https://github.com/AI2Science/vizfold-foundation.git $ROOT_DIR/vizfold-foundation\n", + "else\n", + " echo \"vizfold-foundation already cloned — skipping\"\n", + "fi\n", + "\n", + "if [ ! -d \"$ROOT_DIR/openfold/.git\" ]; then\n", + " git clone https://github.com/aqlaboratory/openfold.git $ROOT_DIR/openfold\n", + "else\n", + " echo \"openfold already cloned — skipping\"\n", + "fi" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "533e4cfd", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "# Overwrite the upstream OpenFold environment.yml with a version that is\n", + "# compatible with PACE ICE (CUDA 12.4, system GCC, no conda-bundled pip packages).\n", + "cd $ROOT_DIR/openfold\n", + "cat > environment.yml << 'EOF'\n", + "name: openfold_env\n", + "channels:\n", + " - pytorch\n", + " - nvidia\n", + " - conda-forge\n", + " - bioconda\n", + "dependencies:\n", + " - python=3.10\n", + " - pytorch=2.5.1\n", + " - pytorch-cuda=12.4\n", + " # Build tools — use conda cross-compiler; system GCC is used at build time\n", + " - gxx_linux-64\n", + " - gcc_linux-64\n", + " - libstdcxx-ng\n", + " - sysroot_linux-64=2.17\n", + " - make\n", + " - ninja\n", + " # CUDA development headers and libraries\n", + " - nvidia::cuda-nvcc=12.4\n", + " - nvidia::cuda-libraries-dev=12.4\n", + " - nvidia::cuda-cudart-dev=12.4\n", + " - nvidia::cuda-driver-dev=12.4\n", + " # Core scientific stack\n", + " - numpy\n", + " - pandas\n", + " - scipy\n", + " - tqdm\n", + " - pyyaml\n", + " - requests\n", + " - typing-extensions\n", + " # ML / framework 
dependencies\n", + " - wandb\n", + " - pytorch-lightning\n", + " # Structural biology tools\n", + " - openmm\n", + " - pdbfixer\n", + " - biopython\n", + " - modelcif==0.7\n", + " # Sequence search databases and tools\n", + " - bioconda::hmmer\n", + " - bioconda::hhsuite\n", + " - bioconda::kalign2\n", + " # Utilities\n", + " - awscli\n", + " - ml-collections\n", + " - aria2\n", + " - git\n", + " - pip\n", + " - packaging\n", + "EOF\n", + "echo \"environment.yml written successfully\"" + ] + }, + { + "cell_type": "markdown", + "id": "58429302", + "metadata": {}, + "source": [ + "## Create and activate the OpenFold conda environment" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d135e88e", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "cd $ROOT_DIR/openfold\n", + "export PYTHONNOUSERSITE=1\n", + "export MAX_JOBS=4\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "\n", + "# Remove any previous (possibly partial) environment\n", + "echo \"Removing old environment if it exists...\"\n", + "conda env remove -n openfold_env -y || true\n", + "\n", + "# Create the environment (use libmamba solver for faster dependency resolution)\n", + "echo \"Creating conda environment...\"\n", + "conda env create --solver=libmamba -f environment.yml --force\n", + "\n", + "# Install pip-only packages after the conda environment is ready.\n", + "conda activate openfold_env\n", + "echo \"Installing pip dependencies...\"\n", + "pip install deepspeed==0.14.5 dm-tree==0.1.6 git+https://github.com/NVIDIA/dllogger.git\n", + "pip install flash-attn --no-build-isolation --no-cache-dir\n", + "\n", + "echo \"openfold_env created successfully\"" + ] + }, + { + "cell_type": "markdown", + "id": "2d737216", + "metadata": {}, + "source": [ + "## Set up compiler and library 
paths" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e4b7e10", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "conda activate openfold_env\n", + "cd $ROOT_DIR/openfold\n", + "\n", + "# Use the PACE ICE system GCC 12.3.0 instead of the conda-bundled compiler.\n", + "# The conda GCC toolchain causes linker issues when building CUDA extensions on RHEL 9.\n", + "export CC=/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/gcc-12.3.0-ukkkutsxfl5kpnnaxflpkq2jtliwthfz/bin/gcc\n", + "export CXX=/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/gcc-12.3.0-ukkkutsxfl5kpnnaxflpkq2jtliwthfz/bin/g++\n", + "\n", + "# Point the build at the conda CUDA 12.4 toolkit, not the system CUDA 13.0.\n", + "export CUDA_HOME=$CONDA_PREFIX\n", + "export PATH=$CONDA_PREFIX/bin:$PATH\n", + "\n", + "export CFLAGS=\"-I${CONDA_PREFIX}/include\"\n", + "export CXXFLAGS=\"-I${CONDA_PREFIX}/include\"\n", + "export LDFLAGS=\"-L${CONDA_PREFIX}/lib\"\n", + "export LIBRARY_PATH=${CONDA_PREFIX}/lib:${LIBRARY_PATH}\n", + "export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}\n", + "export PYTHONNOUSERSITE=1\n", + "export MAX_JOBS=4\n", + "\n", + "echo \"Cleaning previous build artifacts...\"\n", + "rm -rf build/ openfold.egg-info/ dist/\n", + "\n", + "echo \"Building OpenFold with system GCC...\"\n", + "pip install -e . 
--no-build-isolation\n", + "\n", + "echo \"Installing third-party dependencies...\"\n", + "bash scripts/install_third_party_dependencies.sh\n", + "\n", + "echo \"OpenFold installation complete\"" + ] + }, + { + "cell_type": "markdown", + "id": "4331ec0c", + "metadata": {}, + "source": [ + "## Set up Vizfold-Foundation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e3ec183", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "conda activate openfold_env\n", + "\n", + "cd $ROOT_DIR/vizfold-foundation/openfold\n", + "mkdir -p resources\n", + "\n", + "# Symlink to the shared AlphaFold parameters on PACE ICE\n", + "ln -sfn $DATA_DIR/params resources/params\n", + "\n", + "# Download stereo chemical properties file (required by OpenFold relaxation)\n", + "if [ ! 
-f \"resources/stereo_chemical_props.txt\" ]; then\n", + " wget -q --no-check-certificate -P resources \\\n", + " https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt\n", + "fi\n", + "\n", + "echo \"VizFold-Foundation setup complete\"" + ] + }, + { + "cell_type": "markdown", + "id": "0f660ed1", + "metadata": {}, + "source": [ + "## Install additional visualization tools" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f01cb106", + "metadata": { + "tags": [], + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "conda activate openfold_env\n", + "\n", + "# Matplotlib for plotting attention maps\n", + "conda install --solver=libmamba -y conda-forge::matplotlib\n", + "\n", + "# PyMOL for 3D molecular structure visualization\n", + "conda install --solver=libmamba -y -c conda-forge pymol-open-source\n", + "\n", + "echo \"Visualization tools installed successfully\"" + ] + }, + { + "cell_type": "markdown", + "id": "d4e5f6a7", + "metadata": {}, + "source": [ + "## Verification\n", + "\n", + "Confirm that all packages were installed correctly and the GPU is accessible from PyTorch." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a8b9c0d1", + "metadata": { + "vscode": { + "languageId": "shellscript" + } + }, + "outputs": [], + "source": [ + "%%bash\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "conda activate openfold_env\n", + "\n", + "echo \"--- Python & PyTorch ---\"\n", + "python -c \"\n", + "import torch\n", + "print(f' PyTorch {torch.__version__}')\n", + "print(f' CUDA available: {torch.cuda.is_available()}')\n", + "if torch.cuda.is_available():\n", + " print(f' GPU: {torch.cuda.get_device_name(0)}')\n", + " print(f' CUDA version: {torch.version.cuda}')\n", + "\"\n", + "\n", + "echo \"\"\n", + "echo \"--- OpenFold ---\"\n", + "python -c \"import openfold; print(' openfold imported successfully')\"\n", + "\n", + "echo \"\"\n", + "echo \"--- Visualization tools ---\"\n", + "# Force a non-interactive backend; Jupyter sets MPLBACKEND=matplotlib_inline\n", + "# which is invalid outside of an ipykernel process.\n", + "export MPLBACKEND=Agg\n", + "python -c \"\n", + "import matplotlib; print(f' matplotlib {matplotlib.__version__}')\n", + "import pymol; print(f' pymol imported successfully')\n", + "\"\n", + "\n", + "echo \"\"\n", + "echo \"--- Flash Attention ---\"\n", + "python -c \"import flash_attn; print(f' flash_attn {flash_attn.__version__}')\"\n", + "\n", + "echo \"\"\n", + "echo \"--- Key paths ---\"\n", + "echo \" Params symlink: $(readlink -f $ROOT_DIR/vizfold-foundation/openfold/resources/params)\"\n", + "echo \" Stereo props: $ROOT_DIR/vizfold-foundation/openfold/resources/stereo_chemical_props.txt\"\n", + "ls $ROOT_DIR/vizfold-foundation/openfold/resources/stereo_chemical_props.txt > /dev/null 2>&1 \\\n", + " && echo \" [OK] exists\" || echo \" [MISSING]\"\n", + "\n", + "echo \"\"\n", + "echo \"============================================\"\n", + "echo \" Installation 
verified successfully!\"\n", + "echo \"============================================\"" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbformat_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/installationGuide/install.py b/installationGuide/install.py new file mode 100755 index 00000000..13acd19d --- /dev/null +++ b/installationGuide/install.py @@ -0,0 +1,379 @@ +#!/usr/bin/env python +# coding: utf-8 + +import os +import subprocess +import sys +import shutil +import time + + +def run_bash(script: str) -> None: + """Run a bash script, streaming output live and raising on failure.""" + # Note: -u (nounset) is intentionally omitted because conda activation + # scripts (e.g. libblas_mkl_activate.sh) reference unbound variables. + result = subprocess.run(["bash", "-eo", "pipefail", "-c", script], + env=os.environ.copy()) + if result.returncode != 0: + sys.exit(result.returncode) + + +def step(name: str) -> None: + """Print a timestamped section header.""" + timestamp = time.strftime("%H:%M:%S") + banner = f"\n{'='*60}\n[{timestamp}] {name}\n{'='*60}" + print(banner) + + +def _check_storage_hint() -> None: + """Print a storage hint if disk space is low (called after any failure).""" + disk = shutil.disk_usage(os.environ['ROOT_DIR']) + free_gb = disk.free / (1024 ** 3) + if free_gb < 50: + print(f"\n[HINT] Only {free_gb:.1f} GB free in {os.environ['ROOT_DIR']}. " + "Low disk space may have contributed to this failure. 
" + "Try freeing space or pointing ROOT_DIR / CONDA_INSTALL_DIR " + "to a partition with more room.") + + +# --------------------------------------------------------------------------- +# Setting Up the Directory +# --------------------------------------------------------------------------- + +# Root directory for cloned repositories and the conda environment +os.environ['ROOT_DIR'] = '~/scratch' + +# Path to AlphaFold data (pre-downloaded on PACE ICE) +os.environ['DATA_DIR'] = '/storage/ice1/shared/d-pace_community/alphafold/alphafold_2.3.2_data' + +# Directory where the conda environment will be stored +os.environ['CONDA_INSTALL_DIR'] = '~/scratch' + +# --------------------------------------------------------------------------- +# Fix paths to be expanded from user to exact paths +# --------------------------------------------------------------------------- + +# Resolve ~ to the absolute home directory path for use in bash cells +os.environ['ROOT_DIR'] = os.path.expanduser(os.environ['ROOT_DIR']) +os.environ['DATA_DIR'] = os.path.expanduser(os.environ['DATA_DIR']) +os.environ['CONDA_INSTALL_DIR'] = os.path.expanduser(os.environ['CONDA_INSTALL_DIR']) + +# --------------------------------------------------------------------------- +# Pre-flight checks +# --------------------------------------------------------------------------- + +step("Pre-flight checks") + +errors = [] + +# Check conda is available (hard requirement) +if shutil.which("conda") is None: + errors.append("conda is not available. Load a conda module first (e.g. 'module load anaconda3').") +else: + print("[OK] conda found") + +# Check GPU is available via nvidia-smi (hard requirement) +gpu_result = subprocess.run(["nvidia-smi", "--query-gpu=name,memory.total", + "--format=csv,noheader"], + capture_output=True, text=True) +if gpu_result.returncode != 0: + errors.append("No NVIDIA GPU detected. 
Run this on a compute node with GPU access.") +else: + for i, line in enumerate(gpu_result.stdout.strip().splitlines()): + print(f"[OK] GPU {i}: {line.strip()}") + +# Check disk space — warn only, do not abort +root_dir = os.environ['ROOT_DIR'] +os.makedirs(root_dir, exist_ok=True) +disk = shutil.disk_usage(root_dir) +free_gb = disk.free / (1024 ** 3) +if free_gb < 50: + print(f"[WARN] Only {free_gb:.1f} GB free in {root_dir}. " + "At least 50 GB recommended — proceeding anyway.") +else: + print(f"[OK] {free_gb:.0f} GB free in {root_dir}") + +# Check AlphaFold data directory exists (hard requirement) +data_dir = os.environ['DATA_DIR'] +if not os.path.isdir(data_dir): + errors.append(f"AlphaFold data directory not found: {data_dir}") +else: + print(f"[OK] AlphaFold data found at {data_dir}") + +if errors: + print("\n*** Pre-flight checks FAILED ***") + for e in errors: + print(f" ✗ {e}") + sys.exit(1) + +print("\nAll pre-flight checks passed.\n") + + +# --------------------------------------------------------------------------- +# Installation steps — storage hint is printed on any failure +# --------------------------------------------------------------------------- + +try: + + # ----------------------------------------------------------------------- + # Clone the Vizfold-Foundation, and OpenFold repository. + # ----------------------------------------------------------------------- + + step("Cloning repositories") + + run_bash(""" + # Clone VizFold-Foundation and OpenFold into the root directory (skip if already cloned) + if [ ! -d "$ROOT_DIR/vizfold-foundation/.git" ]; then + git clone https://github.com/AI2Science/vizfold-foundation.git $ROOT_DIR/vizfold-foundation + else + echo "vizfold-foundation already cloned — skipping" + fi + + if [ ! 
-d "$ROOT_DIR/openfold/.git" ]; then + git clone https://github.com/aqlaboratory/openfold.git $ROOT_DIR/openfold + else + echo "openfold already cloned — skipping" + fi + """) + + # ----------------------------------------------------------------------- + # Overwrite environment.yml with a PACE ICE–compatible version + # ----------------------------------------------------------------------- + + step("Writing environment.yml") + + run_bash(""" + # Overwrite the upstream OpenFold environment.yml with a version that is + # compatible with PACE ICE (CUDA 12.4, system GCC, no conda-bundled pip packages). + cd $ROOT_DIR/openfold + cat > environment.yml << 'EOF' +name: openfold_env +channels: + - pytorch + - nvidia + - conda-forge + - bioconda +dependencies: + - python=3.10 + - pytorch=2.5.1 + - pytorch-cuda=12.4 + # Build tools — use conda cross-compiler; system GCC is used at build time + - gxx_linux-64 + - gcc_linux-64 + - libstdcxx-ng + - sysroot_linux-64=2.17 + - make + - ninja + # CUDA development headers and libraries + - nvidia::cuda-nvcc=12.4 + - nvidia::cuda-libraries-dev=12.4 + - nvidia::cuda-cudart-dev=12.4 + - nvidia::cuda-driver-dev=12.4 + # Core scientific stack + - numpy + - pandas + - scipy + - tqdm + - pyyaml + - requests + - typing-extensions + # ML / framework dependencies + - wandb + - pytorch-lightning + # Structural biology tools + - openmm + - pdbfixer + - biopython + - modelcif==0.7 + # Sequence search databases and tools + - bioconda::hmmer + - bioconda::hhsuite + - bioconda::kalign2 + # Utilities + - awscli + - ml-collections + - aria2 + - git + - pip + - packaging +EOF + echo "environment.yml written successfully" + """) + + # ----------------------------------------------------------------------- + # Create and activate the OpenFold conda environment + # ----------------------------------------------------------------------- + + step("Creating conda environment") + + run_bash(""" + cd $ROOT_DIR/openfold + export PYTHONNOUSERSITE=1 + export 
MAX_JOBS=4 + source "$(conda info --base)/etc/profile.d/conda.sh" + export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs + export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs + + # Remove any previous (possibly partial) environment + echo "Removing old environment if it exists..." + conda env remove -n openfold_env -y || true + + # Create the environment (use libmamba solver for faster dependency resolution) + echo "Creating conda environment..." + conda env create --solver=libmamba -f environment.yml --force + + # Install pip-only packages after the conda environment is ready. + conda activate openfold_env + echo "Installing pip dependencies..." + pip install deepspeed==0.14.5 dm-tree==0.1.6 git+https://github.com/NVIDIA/dllogger.git + pip install flash-attn --no-build-isolation --no-cache-dir + + echo "openfold_env created successfully" + """) + + # ----------------------------------------------------------------------- + # Set up compiler and library paths, then build OpenFold + # ----------------------------------------------------------------------- + + step("Building OpenFold") + + run_bash(""" + source "$(conda info --base)/etc/profile.d/conda.sh" + export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs + export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs + conda activate openfold_env + cd $ROOT_DIR/openfold + + # Use the PACE ICE system GCC 12.3.0 instead of the conda-bundled compiler. + # The conda GCC toolchain causes linker issues when building CUDA extensions on RHEL 9. + export CC=/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/gcc-12.3.0-ukkkutsxfl5kpnnaxflpkq2jtliwthfz/bin/gcc + export CXX=/usr/local/pace-apps/spack/packages/linux-rhel9-x86_64_v3/gcc-11.3.1/gcc-12.3.0-ukkkutsxfl5kpnnaxflpkq2jtliwthfz/bin/g++ + + # Point the build at the conda CUDA 12.4 toolkit, not the system CUDA 13.0. 
+ export CUDA_HOME=$CONDA_PREFIX + export PATH=$CONDA_PREFIX/bin:$PATH + + export CFLAGS="-I${CONDA_PREFIX}/include" + export CXXFLAGS="-I${CONDA_PREFIX}/include" + export LDFLAGS="-L${CONDA_PREFIX}/lib" + export LIBRARY_PATH=${CONDA_PREFIX}/lib:${LIBRARY_PATH} + export LD_LIBRARY_PATH=${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH} + export PYTHONNOUSERSITE=1 + export MAX_JOBS=4 + + echo "Cleaning previous build artifacts..." + rm -rf build/ openfold.egg-info/ dist/ + + echo "Building OpenFold with system GCC..." + pip install -e . --no-build-isolation + + echo "Installing third-party dependencies..." + bash scripts/install_third_party_dependencies.sh + + echo "OpenFold installation complete" + """) + + # ----------------------------------------------------------------------- + # Set up Vizfold-Foundation + # ----------------------------------------------------------------------- + + step("Setting up VizFold-Foundation") + + run_bash(""" + source "$(conda info --base)/etc/profile.d/conda.sh" + export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs + export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs + conda activate openfold_env + + cd $ROOT_DIR/vizfold-foundation/openfold + mkdir -p resources + + # Symlink to the shared AlphaFold parameters on PACE ICE + ln -sfn $DATA_DIR/params resources/params + + # Download stereo chemical properties file (required by OpenFold relaxation) + if [ ! 
-f "resources/stereo_chemical_props.txt" ]; then + wget -q --no-check-certificate -P resources \\ + https://git.scicore.unibas.ch/schwede/openstructure/-/raw/7102c63615b64735c4941278d92b554ec94415f8/modules/mol/alg/src/stereo_chemical_props.txt + fi + + echo "VizFold-Foundation setup complete" + """) + + # ----------------------------------------------------------------------- + # Install additional visualization tools + # ----------------------------------------------------------------------- + + step("Installing visualization tools") + + run_bash(""" + source "$(conda info --base)/etc/profile.d/conda.sh" + export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs + export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs + conda activate openfold_env + + # Matplotlib for plotting attention maps + conda install --solver=libmamba -y conda-forge::matplotlib + + # PyMOL for 3D molecular structure visualization + conda install --solver=libmamba -y -c conda-forge pymol-open-source + + echo "Visualization tools installed successfully" + """) + + # ----------------------------------------------------------------------- + # Verification + # ----------------------------------------------------------------------- + + step("Verifying installation") + + run_bash(""" + source "$(conda info --base)/etc/profile.d/conda.sh" + export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs + export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs + conda activate openfold_env + + echo "--- Python & PyTorch ---" + python -c " +import torch +print(f' PyTorch {torch.__version__}') +print(f' CUDA available: {torch.cuda.is_available()}') +if torch.cuda.is_available(): + print(f' GPU: {torch.cuda.get_device_name(0)}') + print(f' CUDA version: {torch.version.cuda}') +" + + echo "" + echo "--- OpenFold ---" + python -c "import openfold; print(' openfold imported successfully')" + + echo "" + echo "--- Visualization tools ---" + # Force a non-interactive backend; Jupyter sets MPLBACKEND=matplotlib_inline + # 
which is invalid outside of an ipykernel process. + export MPLBACKEND=Agg + python -c " +import matplotlib; print(f' matplotlib {matplotlib.__version__}') +import pymol; print(f' pymol imported successfully') +" + + echo "" + echo "--- Flash Attention ---" + python -c "import flash_attn; print(f' flash_attn {flash_attn.__version__}')" + + echo "" + echo "--- Key paths ---" + echo " Params symlink: $(readlink -f "$ROOT_DIR/vizfold-foundation/openfold/resources/params")" + echo " Stereo props: $ROOT_DIR/vizfold-foundation/openfold/resources/stereo_chemical_props.txt" + [ -f "$ROOT_DIR/vizfold-foundation/openfold/resources/stereo_chemical_props.txt" ] \\ + && echo " [OK] exists" || { echo " [MISSING]"; exit 1; } # abort so the success banner below is never printed for a missing file + + echo "" + echo "============================================" + echo " Installation verified successfully!" + echo "============================================" + """) + +except (SystemExit, Exception): + _check_storage_hint() + raise diff --git a/viz_attention_demo_base.ipynb b/viz_attention_demo_base.ipynb index d10f0b1b..77c611c4 100644 --- a/viz_attention_demo_base.ipynb +++ b/viz_attention_demo_base.ipynb @@ -16,17 +16,21 @@ "TRI_RESIDUE_IDX = 18\n", "\n", "# Define all relevant directories\n", - "BASE_DATA_DIR = \"/ime/hdd/rhaas/SUP-5301/database\" # path to AlphaFold database\n", + "BASE_DATA_DIR = \"/storage/ice1/shared/d-pace_community/alphafold/alphafold_2.3.2_data\"\n", + "ROOT_DIR = os.path.expanduser(\"~/scratch\")\n", "\n", "# Local paths for saving results (these probably can remain unchanged)\n", - "ATTN_MAP_DIR = f\"./outputs/attention_files_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\" # directory for saving text files with top-k attention scores\n", - "ALIGNMENT_DIR = \"./examples/monomer/alignments\" # directory containing pre-computed alignment files (and MSAs)\n", - "OUTPUT_DIR = f\"./outputs/my_outputs_align_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\" # directory to save outputs\n", - "IMAGE_OUTPUT_DIR = 
f\"./outputs/attention_images_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\"\n", - "FASTA_DIR = f\"./examples/monomer/fasta_dir_{PROT}\"\n", + "ATTN_MAP_DIR = f\"{ROOT_DIR}/vizfold-foundation/outputs/attention_files_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\"\n", + "ALIGNMENT_DIR = f\"{ROOT_DIR}/vizfold-foundation/examples/monomer/alignments\"\n", + "OUTPUT_DIR = f\"{ROOT_DIR}/vizfold-foundation/outputs/my_outputs_align_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\"\n", + "IMAGE_OUTPUT_DIR = f\"{ROOT_DIR}/vizfold-foundation/outputs/attention_images_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\"\n", + "FASTA_DIR = f\"{ROOT_DIR}/vizfold-foundation/examples/monomer/fasta_dir_{PROT}\"\n", + "\n", + "os.makedirs(FASTA_DIR, exist_ok=True)\n", + "os.makedirs(f\"{ROOT_DIR}/vizfold-foundation/outputs\", exist_ok=True)\n", "\n", "# Note: If this is a new protein, the ALIGNMENT_DIR does not need to be specified here or in the next cell\n", - "# In this case, the code will compute MSAs and alignments, which can take several hours\n" + "# In this case, the code will compute MSAs and alignments, which can take several hours" ] }, { @@ -36,28 +40,46 @@ "metadata": {}, "outputs": [], "source": [ - "# Run OpenFold inference and save top attention scores to text files \n", - "inference_cmd = f\"\"\"\n", + "%%bash\n", + "# Activate conda environment with OpenFold installed\n", + "source \"$(conda info --base)/etc/profile.d/conda.sh\"\n", + "export CONDA_ENVS_PATH=$CONDA_INSTALL_DIR/.conda/envs\n", + "export CONDA_PKGS_DIRS=$CONDA_INSTALL_DIR/.conda/pkgs\n", + "conda activate openfold_env\n", + "\n", + "export MPLBACKEND=Agg\n", + "export PYTHONNOUSERSITE=1\n", + "cd ~/scratch/vizfold-foundation\n", + "\n", + "# Define variables for the script\n", + "PROT=\"6KWC\"\n", + "TRI_RESIDUE_IDX=18\n", + "BASE_DATA_DIR=\"/storage/ice1/shared/d-pace_community/alphafold/alphafold_2.3.2_data\"\n", + "FASTA_DIR=\"$HOME/scratch/vizfold-foundation/examples/monomer/fasta_dir_${PROT}\"\n", + 
"ALIGNMENT_DIR=\"$HOME/scratch/vizfold-foundation/examples/monomer/alignments\"\n", + "OUTPUT_DIR=\"$HOME/scratch/vizfold-foundation/outputs/my_outputs_align_${PROT}_demo_tri_${TRI_RESIDUE_IDX}\"\n", + "ATTN_MAP_DIR=\"$HOME/scratch/vizfold-foundation/outputs/attention_files_${PROT}_demo_tri_${TRI_RESIDUE_IDX}\"\n", + "\n", + "# Run OpenFold inference\n", "python3 run_pretrained_openfold.py \\\n", - " {FASTA_DIR} \\\n", - " {BASE_DATA_DIR}/pdb_mmcif/mmcif_files \\\n", - " --use_precomputed_alignments {ALIGNMENT_DIR} \\\n", - " --output_dir {OUTPUT_DIR} \\\n", + " ${FASTA_DIR} \\\n", + " ${BASE_DATA_DIR}/pdb_mmcif/mmcif_files \\\n", + " --use_precomputed_alignments ${ALIGNMENT_DIR} \\\n", + " --output_dir ${OUTPUT_DIR} \\\n", " --config_preset model_1_ptm \\\n", - " --uniref90_database_path {BASE_DATA_DIR}/uniref90/uniref90.fasta \\\n", - " --mgnify_database_path {BASE_DATA_DIR}/mgnify/mgy_clusters_2022_05.fa \\\n", - " --pdb70_database_path {BASE_DATA_DIR}/pdb70/pdb70 \\\n", - " --uniclust30_database_path {BASE_DATA_DIR}/uniclust30/uniclust30_2018_08 \\\n", - " --bfd_database_path {BASE_DATA_DIR}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \\\n", + " --uniref90_database_path ${BASE_DATA_DIR}/uniref90/uniref90.fasta \\\n", + " --mgnify_database_path ${BASE_DATA_DIR}/mgnify/mgy_clusters_2022_05.fa \\\n", + " --pdb70_database_path ${BASE_DATA_DIR}/pdb70/pdb70 \\\n", + " --uniclust30_database_path ${BASE_DATA_DIR}/uniclust30/uniclust30_2018_08 \\\n", + " --bfd_database_path ${BASE_DATA_DIR}/bfd/bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt \\\n", " --save_outputs \\\n", " --model_device \"cuda:0\" \\\n", - " --attn_map_dir {ATTN_MAP_DIR} \\\n", + " --attn_map_dir ${ATTN_MAP_DIR} \\\n", " --num_recycles_save 1 \\\n", - " --triangle_residue_idx {TRI_RESIDUE_IDX} \\\n", + " --triangle_residue_idx ${TRI_RESIDUE_IDX} \\\n", " --demo_attn\n", - "\"\"\"\n", "\n", - "subprocess.run(inference_cmd, shell=True, check=True)\n" + "echo \"Inference 
complete. Outputs saved to ${OUTPUT_DIR} and attention maps saved to ${ATTN_MAP_DIR}.\"" ] }, { @@ -73,7 +95,7 @@ "PDB_FILE = os.path.join(OUTPUT_DIR, f\"predictions/{PROT}_1_model_1_ptm_relaxed.pdb\")\n", "FNAME = f\"predicted_structure_{PROT}_tri_{TRI_RESIDUE_IDX}.png\"\n", "\n", - "render_pdb_to_image(PDB_FILE, IMAGE_OUTPUT_DIR, FNAME)\n" + "render_pdb_to_image(PDB_FILE, IMAGE_OUTPUT_DIR, FNAME)" ] }, { @@ -88,12 +110,24 @@ "from visualize_attention_arc_diagram_demo_utils import generate_arc_diagrams, parse_fasta_sequence\n", "\n", "# Setup visualization output directories\n", - "output_dir_msa = os.path.join(IMAGE_OUTPUT_DIR, 'msa_row_attention_plots') # directory for saving msa attention 3D visuals\n", - "output_dir_tri = os.path.join(IMAGE_OUTPUT_DIR, 'tri_start_attention_plots') # directory for saving triangle attention 3D visuals\n", - "FASTA_PATH = f\"/u/thayes/vizfold/examples/monomer/fasta_dir_{PROT}/{PROT}.fasta\"\n", + "PROT = \"6KWC\"\n", + "TRI_RESIDUE_IDX = 18\n", "LAYER_IDX = 47 # selected layer for attention evaluation\n", "TOP_K = 50 # show top-k attention links (limit to 500)\n", "\n", + "BASE_DIR = os.path.expanduser(\"~/scratch/vizfold-foundation\")\n", + "OUTPUT_DIR = os.path.join(BASE_DIR, f\"outputs/my_outputs_align_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\")\n", + "ATTN_MAP_DIR = os.path.join(BASE_DIR, f\"outputs/attention_files_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\")\n", + "IMAGE_OUTPUT_DIR = os.path.join(BASE_DIR, f\"outputs/attention_images_{PROT}_demo_tri_{TRI_RESIDUE_IDX}\")\n", + "PDB_FILE = os.path.join(OUTPUT_DIR, f\"predictions/{PROT}_1_model_1_ptm_relaxed.pdb\")\n", + "FASTA_PATH = os.path.join(BASE_DIR, f\"examples/monomer/fasta_dir_{PROT}/{PROT}.fasta\")\n", + "\n", + "# Setup output sub-directories and create if they don't exist\n", + "output_dir_msa = os.path.join(IMAGE_OUTPUT_DIR, 'msa_row_attention_plots')\n", + "output_dir_tri = os.path.join(IMAGE_OUTPUT_DIR, 'tri_start_attention_plots')\n", + "os.makedirs(output_dir_msa, exist_ok=True)\n", + "os.makedirs(output_dir_tri, 
exist_ok=True)\n", + "\n", "# Generate 3D attention plots for MSA row attention\n", "plot_pymol_attention_heads(\n", " pdb_file=PDB_FILE,\n", @@ -141,7 +175,7 @@ " residue_indices=[TRI_RESIDUE_IDX],\n", " top_k=TOP_K,\n", " layer_idx=LAYER_IDX\n", - ")\n" + ")" ] }, { @@ -173,13 +207,13 @@ " output_dir_arc=output_dir_tri,\n", " combined_output_dir=IMAGE_OUTPUT_DIR,\n", " residue_indices=[TRI_RESIDUE_IDX]\n", - ")\n" + ")" ] } ], "metadata": { "kernelspec": { - "display_name": "openfold_env3", + "display_name": "openfold_env", "language": "python", "name": "python3" }, @@ -193,7 +227,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.17" + "version": "3.10.20" } }, "nbformat": 4,