From 0e070c07b5a179f0ef376120503e4303b6c76dd3 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 7 Apr 2026 13:37:18 -0400
Subject: [PATCH 01/17] Replace GKE/Helm Galaxy path with GCE VM-based
 deployment

---
 .../init-resources/galaxy-user-data.sh        | 138 +++++
 .../main/resources/init-resources/startup.sh  |   4 +
 http/src/main/resources/reference.conf        |  52 ++
 .../workbench/leonardo/config/Config.scala    |  18 +-
 .../leonardo/config/KubernetesAppConfig.scala |  15 +-
 .../leonardo/dao/HttpJupyterDAO.scala         |   2 +-
 .../monitor/LeoPubsubMessageSubscriber.scala  |   7 +-
 .../leonardo/util/BuildHelmChartValues.scala  | 107 ----
 .../leonardo/util/GKEInterpreter.scala        | 576 ++++++++++--------
 .../util/BuildHelmChartValuesSpec.scala       | 172 ------
 10 files changed, 544 insertions(+), 547 deletions(-)
 create mode 100644 http/src/main/resources/init-resources/galaxy-user-data.sh

diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh
new file mode 100644
index 0000000000..3b7fbb640b
--- /dev/null
+++ b/http/src/main/resources/init-resources/galaxy-user-data.sh
@@ -0,0 +1,138 @@
+# Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh
+# When updating this file, sync it manually from that repository and verify the changes.
+#cloud-config
+write_files:
+  - path: /usr/local/bin/galaxy_bootstrap.sh
+    permissions: '0755'
+    owner: root:root
+    content: |
+      #!/bin/bash
+
+      echo "[$(date)] - Starting galaxy_bootstrap script..."
+
+      # 1. Setup persistent disk if available
+      DISK_DEVICE="/dev/disk/by-id/google-galaxy-data"
+      if [ -b "$DISK_DEVICE" ]; then
+        echo "[$(date)] - Found persistent disk at $DISK_DEVICE"
+
+        # Check if disk is already formatted
+        if ! blkid "$DISK_DEVICE" > /dev/null 2>&1; then
+          echo "[$(date)] - Formatting disk $DISK_DEVICE with ext4"
+          mkfs -t ext4 "$DISK_DEVICE"
+        else
+          echo "[$(date)] - Disk $DISK_DEVICE is already formatted"
+        fi
+
+        # Create mount point and mount
+        mkdir -p /mnt/block_storage
+        mount "$DISK_DEVICE" /mnt/block_storage
+
+        # Add to fstab for persistent mounting across reboots
+        DISK_UUID=$(blkid -s UUID -o value "$DISK_DEVICE")
+        if [ -n "$DISK_UUID" ] && ! grep -q "$DISK_UUID" /etc/fstab; then
+          echo "UUID=$DISK_UUID /mnt/block_storage ext4 defaults 0 2" >> /etc/fstab
+        fi
+
+        # Set proper ownership
+        chown debian:debian /mnt/block_storage
+        echo "[$(date)] - Persistent disk mounted at /mnt/block_storage"
+      else
+        echo "[$(date)] - No persistent disk found at $DISK_DEVICE. Galaxy will use ephemeral storage."
+      fi
+
+      # 2. Setup PostgreSQL disk if available
+      POSTGRES_DISK_DEVICE="/dev/disk/by-id/google-galaxy-postgres-data"
+      if [ -b "$POSTGRES_DISK_DEVICE" ]; then
+        echo "[$(date)] - Found PostgreSQL disk at $POSTGRES_DISK_DEVICE"
+
+        # Check if disk is already formatted
+        if ! blkid "$POSTGRES_DISK_DEVICE" > /dev/null 2>&1; then
+          echo "[$(date)] - Formatting PostgreSQL disk $POSTGRES_DISK_DEVICE with ext4"
+          mkfs -t ext4 "$POSTGRES_DISK_DEVICE"
+        else
+          echo "[$(date)] - PostgreSQL disk $POSTGRES_DISK_DEVICE is already formatted"
+        fi
+
+        # Create mount point and mount
+        mkdir -p /mnt/postgres_storage
+        mount "$POSTGRES_DISK_DEVICE" /mnt/postgres_storage
+
+        # Add to fstab for persistent mounting across reboots
+        POSTGRES_DISK_UUID=$(blkid -s UUID -o value "$POSTGRES_DISK_DEVICE")
+        if [ -n "$POSTGRES_DISK_UUID" ] && ! grep -q "$POSTGRES_DISK_UUID" /etc/fstab; then
+          echo "UUID=$POSTGRES_DISK_UUID /mnt/postgres_storage ext4 defaults 0 2" >> /etc/fstab
+        fi
+
+        # Set proper ownership
+        chown debian:debian /mnt/postgres_storage
+        echo "[$(date)] - PostgreSQL disk mounted at /mnt/postgres_storage"
+      else
+        echo "[$(date)] - No PostgreSQL disk found at $POSTGRES_DISK_DEVICE. PostgreSQL will use ephemeral storage."
+      fi
+
+      # 3. Run ansible-pull
+      sudo -u debian bash -c '
+      export HOME=/home/debian
+      HOST_IP=$(curl -s ifconfig.me)
+
+      PV_SIZE=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/persistent-volume-size" -H "Metadata-Flavor: Google" 2>/dev/null)
+      if [ -z "$PV_SIZE" ]; then
+          echo "[$(date)] - persistent-volume-size metadata not found or empty, using default."
+          PV_SIZE="139Gi"
+      fi
+      echo "[$(date)] - NFS storage size for Galaxy: ${PV_SIZE}"
+
+      # Add restore_galaxy if enabled
+      RESTORE_GALAXY=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/restore_galaxy" -H "Metadata-Flavor: Google" 2>/dev/null || echo "false")
+
+      GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com")
+      echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}"
+
+      GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git")
+      GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master")
+
+      PULL_ARGS=(
+        -U "${GIT_REPO}"
+        -C "${GIT_BRANCH}"
+        -d /home/debian/ansible
+        -i /tmp/ansible-inventory/localhost
+        --accept-host-key
+        --limit 127.0.0.1
+        --extra-vars "gcp_batch_service_account_email=${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}"
+      )
+
+      if [ "$RESTORE_GALAXY" = "true" ]; then
+          PULL_ARGS+=(--extra-vars "restore_galaxy=true")
+          echo "[$(date)] - Galaxy Restore Mode: Enabled"
+      else
+          echo "[$(date)] - Galaxy Restore Mode: Disabled"
+      fi
+
+      PULL_ARGS+=(playbook.yml)
+
+      mkdir -p /tmp/ansible-inventory
+      cat > /tmp/ansible-inventory/localhost << EOF
+      [vm]
+      127.0.0.1 ansible_connection=local ansible_python_interpreter="/usr/bin/python3"
+
+      [all:vars]
+      ansible_user="debian"
+      rke2_token="defaultSecret12345"
+      rke2_additional_sans=["${HOST_IP}"]
+      rke2_debug=true
+      nfs_size="${PV_SIZE}"
+      galaxy_persistence_size="${PV_SIZE}"
+      galaxy_db_password="gxy-db-password"
+      galaxy_user="dev@galaxyproject.org"
+      EOF
+
+      echo "[$(date)] - Inventory file created at /tmp/ansible-inventory/localhost; running ansible-pull..."
+      echo "[$(date)] - Running: ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull ${PULL_ARGS[@]}"
+
+      ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull "${PULL_ARGS[@]}"
+      '
+
+      echo "[$(date)] - Bootstrap script completed."
+
+runcmd:
+  - /usr/local/bin/galaxy_bootstrap.sh
diff --git a/http/src/main/resources/init-resources/startup.sh b/http/src/main/resources/init-resources/startup.sh
index 6375f7d7a0..4701c3a892 100644
--- a/http/src/main/resources/init-resources/startup.sh
+++ b/http/src/main/resources/init-resources/startup.sh
@@ -122,6 +122,10 @@ function failScriptIfError() {
 
 function validateCert() {
   certFileDirectory=$1
+  ## Only the master node has certs; worker nodes in multi-node Dataproc clusters do not.
+  if [ ! -f "${certFileDirectory}/jupyter-server.crt" ]; then
+    return 0
+  fi
   ## This helps when we need to rotate certs.
   notAfter=`openssl x509 -enddate -noout -in ${certFileDirectory}/jupyter-server.crt` # output should be something like `notAfter=Jul  4 20:31:52 2026 GMT`
 
diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf
index 06d866b7fb..7a9c841302 100644
--- a/http/src/main/resources/reference.conf
+++ b/http/src/main/resources/reference.conf
@@ -210,6 +210,44 @@ vpc {
     northamerica-northeast2 = "10.26.0.0/20"
   }
   firewallsToAdd = [
+    # Allows Galaxy VM traffic on port 80 (nginx ingress on hostNetwork)
+    {
+      name-prefix = "leonardo-allow-http"
+      sourceRanges = {
+        us-central1 = ["0.0.0.0/0"]
+        northamerica-northeast1 = ["0.0.0.0/0"]
+        southamerica-east1 = ["0.0.0.0/0"]
+        us-east1 = ["0.0.0.0/0"]
+        us-east4 = ["0.0.0.0/0"]
+        us-west1 = ["0.0.0.0/0"]
+        us-west2 = ["0.0.0.0/0"]
+        us-west3 = ["0.0.0.0/0"]
+        us-west4 = ["0.0.0.0/0"]
+        europe-central2 = ["0.0.0.0/0"]
+        europe-north1 = ["0.0.0.0/0"]
+        europe-west1 = ["0.0.0.0/0"]
+        europe-west2 = ["0.0.0.0/0"]
+        europe-west3 = ["0.0.0.0/0"]
+        europe-west4 = ["0.0.0.0/0"]
+        europe-west6 = ["0.0.0.0/0"]
+        asia-east1 = ["0.0.0.0/0"]
+        asia-east2 = ["0.0.0.0/0"]
+        asia-northeast1 = ["0.0.0.0/0"]
+        asia-northeast2 = ["0.0.0.0/0"]
+        asia-northeast3 = ["0.0.0.0/0"]
+        asia-south1 = ["0.0.0.0/0"]
+        asia-southeast1 = ["0.0.0.0/0"]
+        asia-southeast2 = ["0.0.0.0/0"]
+        australia-southeast1 = ["0.0.0.0/0"]
+        northamerica-northeast2 = ["0.0.0.0/0"]
+      }
+      allowed = [
+        {
+          protocol = "tcp"
+          port = "80"
+        }
+      ]
+    },
     # Allows Leonardo proxy traffic on port 443
     {
       name-prefix = "leonardo-allow-https"
@@ -395,6 +433,20 @@ groups {
   }
 }
 
+galaxyVm {
+  # Debian 12 image — galaxy-k8s-boot bootstrap requires a standard Debian VM, not COS
+  sourceImage = "projects/debian-cloud/global/images/family/debian-12"
+  machineType = "n1-highmem-8"
+  bootDiskSizeGb = 50
+  postgresDiskSizeGb = 10
+  # Suffix appended to the NFS disk name to derive the postgres disk name.
+  # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix).
+  postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix}
+  gcpBatchServiceAccountEmail = ""
+  gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git"
+  gitBranch = "master"
+}
+
 gke {
   cluster {
     location = "us-central1-a",
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
index 45a3e26f38..42c7a53737 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
@@ -131,6 +131,19 @@ object Config {
     )
   }
 
+  implicit private val galaxyVmConfigReader: ValueReader[GalaxyVmConfig] = ValueReader.relative { config =>
+    GalaxyVmConfig(
+      config.as[GceCustomImage]("sourceImage"),
+      config.as[MachineTypeName]("machineType"),
+      config.as[DiskSize]("bootDiskSizeGb"),
+      config.as[DiskSize]("postgresDiskSizeGb"),
+      config.as[String]("postgresDiskNameSuffix"),
+      config.as[String]("gcpBatchServiceAccountEmail"),
+      config.as[String]("gitRepo"),
+      config.as[String]("gitBranch")
+    )
+  }
+
   implicit private val gceConfigReader: ValueReader[GceConfig] = ValueReader.relative { config =>
     GceConfig(
       config.as[GceCustomImage]("customGceImage"),
@@ -497,6 +510,7 @@ object Config {
   val googleGroupsConfig = config.as[GoogleGroupsConfig]("groups")
 
   val dataprocConfig = config.as[DataprocConfig]("dataproc")
+  val galaxyVmConfig = config.as[GalaxyVmConfig]("galaxyVm")
   val gceConfig = config.as[GceConfig]("gce")
   val imageConfig = config.as[ImageConfig]("image")
   val prometheusConfig = config.as[PrometheusConfig]("prometheus")
@@ -901,13 +915,13 @@ object Config {
       vpcConfig.networkTag,
       org.broadinstitute.dsde.workbench.leonardo.http.ConfigReader.appConfig.terraAppSetupChart,
       gkeIngressConfig,
-      gkeGalaxyAppConfig,
       gkeCromwellAppConfig,
       gkeCustomAppConfig,
       gkeAllowedAppConfig,
       appMonitorConfig,
       gkeClusterConfig,
       proxyConfig,
-      gkeGalaxyDiskConfig
+      gkeGalaxyDiskConfig,
+      galaxyVmConfig
     )
 }
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
index 2b3fc4a587..53e2562db6 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
@@ -1,7 +1,9 @@
 package org.broadinstitute.dsde.workbench.leonardo.config
 
-import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceAccountName
+import org.broadinstitute.dsde.workbench.google2.{KubernetesSerializableName, MachineTypeName}
+import KubernetesSerializableName.ServiceAccountName
 import org.broadinstitute.dsde.workbench.leonardo.AppType._
+import org.broadinstitute.dsde.workbench.leonardo.CustomImage.GceCustomImage
 import org.broadinstitute.dsde.workbench.leonardo._
 import org.broadinstitute.dsp.{ChartName, ChartVersion}
 
@@ -92,6 +94,17 @@ final case class CustomAppConfig(chartName: ChartName,
   val appType: AppType = AppType.Custom
 }
 
+final case class GalaxyVmConfig(
+  sourceImage: GceCustomImage,
+  machineType: MachineTypeName,
+  bootDiskSizeGb: DiskSize,
+  postgresDiskSizeGb: DiskSize,
+  postgresDiskNameSuffix: String,
+  gcpBatchServiceAccountEmail: String,
+  gitRepo: String,
+  gitBranch: String
+)
+
 final case class ContainerRegistryUsername(asString: String) extends AnyVal
 final case class ContainerRegistryPassword(asString: String) extends AnyVal
 final case class ContainerRegistryCredentials(username: ContainerRegistryUsername, password: ContainerRegistryPassword)
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
index 9bb06e9548..0c55f089db 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
@@ -36,7 +36,7 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie
                 headers = Headers.empty
               )
             )
-            .handleError(_ => false)
+            .handleErrorWith(e => logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false))
         case _ => F.pure(false)
       }
     } yield res
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
index 8c33c93579..97a6c984bd 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
@@ -999,10 +999,15 @@ class LeoPubsubMessageSubscriber[F[_]](
           } yield res
         } else F.unit
 
+      // Galaxy uses VM-based deployment: no GKE cluster or nodepool is created.
+      // The VM is launched inside createAndPollApp instead.
+      effectiveClusterOrNodepoolOp =
+        if (msg.appType == AppType.Galaxy) F.unit else createClusterOrNodepoolOp
+
       // build asynchronous task
       task = for {
         // parallelize disk creation and cluster/nodepool monitoring
-        _ <- List(createDiskOp, createSecondDiskOp, createClusterOrNodepoolOp).parSequence_
+        _ <- List(createDiskOp, createSecondDiskOp, effectiveClusterOrNodepoolOp).parSequence_
 
         // create and monitor app
         _ <- getGkeAlgFromRegistry()
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala
index 7fa725c8cf..3c54efd044 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala
@@ -1,10 +1,8 @@
 package org.broadinstitute.dsde.workbench.leonardo
 package util
 
-import org.broadinstitute.dsde.workbench.google2.DiskName
 import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName
 import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName}
-import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore
 import org.broadinstitute.dsde.workbench.leonardo.Autopilot
 import org.broadinstitute.dsde.workbench.leonardo.dao.CustomAppService
 import org.broadinstitute.dsde.workbench.leonardo.http.kubernetesProxyHost
@@ -15,111 +13,6 @@ import org.broadinstitute.dsp.Release
 import java.nio.charset.StandardCharsets
 
 private[leonardo] object BuildHelmChartValues {
-  def buildGalaxyChartOverrideValuesString(config: GKEInterpreterConfig,
-                                           appName: AppName,
-                                           release: Release,
-                                           cluster: KubernetesCluster,
-                                           nodepoolName: NodepoolName,
-                                           userEmail: WorkbenchEmail,
-                                           customEnvironmentVariables: Map[String, String],
-                                           ksa: ServiceAccountName,
-                                           namespaceName: NamespaceName,
-                                           nfsDisk: PersistentDisk,
-                                           postgresDiskName: DiskName,
-                                           machineType: AppMachineType,
-                                           galaxyRestore: Option[GalaxyRestore]
-  ): List[String] = {
-    val k8sProxyHost = kubernetesProxyHost(cluster, config.proxyConfig.proxyDomain).address
-    val leoProxyhost = config.proxyConfig.getProxyServerHostName
-    val ingressPath = s"/proxy/google/v1/apps/${cluster.cloudContext.asString}/${appName.value}/galaxy"
-    val workspaceName = customEnvironmentVariables.getOrElse("WORKSPACE_NAME", "")
-    val workspaceNamespace = customEnvironmentVariables.getOrElse("WORKSPACE_NAMESPACE", "")
-    // Machine type info
-    val maxLimitMemory = machineType.memorySizeInGb
-    val maxLimitCpu = machineType.numOfCpus
-    val maxRequestMemory = maxLimitMemory - 22
-    val maxRequestCpu = maxLimitCpu - 6
-
-    // Custom EV configs
-    val configs = customEnvironmentVariables.toList.zipWithIndex.flatMap { case ((k, v), i) =>
-      List(
-        raw"""configs.$k=$v""",
-        raw"""extraEnv[$i].name=$k""",
-        raw"""extraEnv[$i].valueFrom.configMapKeyRef.name=${release.asString}-galaxykubeman-configs""",
-        raw"""extraEnv[$i].valueFrom.configMapKeyRef.key=$k"""
-      )
-    }
-
-    val galaxyRestoreSettings = galaxyRestore.fold(List.empty[String])(g =>
-      List(
-        raw"""restore.persistence.nfs.galaxy.pvcID=${g.galaxyPvcId.asString}""",
-        raw"""galaxy.persistence.existingClaim=${release.asString}-galaxy-galaxy-pvc"""
-      )
-    )
-    // Using the string interpolator raw""" since the chart keys include quotes to escape Helm
-    // value override special characters such as '.'
-    // https://helm.sh/docs/intro/using_helm/#the-format-and-limitations-of---set
-    List(
-      // Storage class configs
-      raw"""nfs.storageClass.name=nfs-${release.asString}""",
-      raw"""galaxy.persistence.storageClass=nfs-${release.asString}""",
-      // Node selector config: this ensures the app is run on the user's nodepool
-      raw"""galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""",
-      raw"""nfs.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""",
-      raw"""galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: ${nodepoolName.value}""",
-      raw"""galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""",
-      // Ingress configs
-      raw"""galaxy.ingress.path=${ingressPath}""",
-      raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://${k8sProxyHost}""",
-      raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=${leoProxyhost}""",
-      raw"""galaxy.ingress.hosts[0].host=${k8sProxyHost}""",
-      raw"""galaxy.ingress.hosts[0].paths[0].path=${ingressPath}""",
-      raw"""galaxy.ingress.tls[0].hosts[0]=${k8sProxyHost}""",
-      raw"""galaxy.ingress.tls[0].secretName=tls-secret""",
-      // CVMFS configs
-      raw"""cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-${release.asString}""",
-      raw"""cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache""",
-      // Galaxy configs
-      raw"""galaxy.configs.galaxy\.yml.galaxy.single_user=${userEmail.value}""",
-      raw"""galaxy.configs.galaxy\.yml.galaxy.admin_users=${userEmail.value}""",
-      raw"""galaxy.terra.launch.workspace=${workspaceName}""",
-      raw"""galaxy.terra.launch.namespace=${workspaceNamespace}""",
-      raw"""galaxy.terra.launch.apiURL=${config.galaxyAppConfig.orchUrl.value}""",
-      raw"""galaxy.terra.launch.drsURL=${config.galaxyAppConfig.drsUrl.value}""",
-      // Tusd ingress configs
-      raw"""galaxy.tusd.ingress.hosts[0].host=${k8sProxyHost}""",
-      raw"""galaxy.tusd.ingress.hosts[0].paths[0].path=${ingressPath}/api/upload/resumable_upload""",
-      raw"""galaxy.tusd.ingress.tls[0].hosts[0]=${k8sProxyHost}""",
-      raw"""galaxy.tusd.ingress.tls[0].secretName=tls-secret""",
-      // Set RabbitMQ storage class
-      raw"""galaxy.rabbitmq.persistence.storageClassName=nfs-${release.asString}""",
-      // Set Machine Type specs
-      raw"""galaxy.jobs.maxLimits.memory=${maxLimitMemory}""",
-      raw"""galaxy.jobs.maxLimits.cpu=${maxLimitCpu}""",
-      raw"""galaxy.jobs.maxRequests.memory=${maxRequestMemory}""",
-      raw"""galaxy.jobs.maxRequests.cpu=${maxRequestCpu}""",
-      raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=${maxRequestMemory}""",
-      raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=${maxRequestCpu}""",
-      // RBAC configs
-      raw"""galaxy.serviceAccount.create=false""",
-      raw"""galaxy.serviceAccount.name=${ksa.value}""",
-      raw"""rbac.serviceAccount=${ksa.value}""",
-      // Persistence configs
-      raw"""persistence.nfs.name=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}""",
-      raw"""persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=${nfsDisk.name.value}""",
-      raw"""persistence.nfs.size=${nfsDisk.size.gb.toString}Gi""",
-      raw"""persistence.postgres.name=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}""",
-      raw"""galaxy.postgresql.galaxyDatabasePassword=${config.galaxyAppConfig.postgresPassword.value}""",
-      raw"""persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=${postgresDiskName.value}""",
-      raw"""persistence.postgres.size=${config.galaxyDiskConfig.postgresDiskSizeGB.gb.toString}Gi""",
-      raw"""nfs.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}-pvc""",
-      raw"""nfs.persistence.size=${nfsDisk.size.gb.toString}Gi""",
-      raw"""galaxy.postgresql.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}-pvc""",
-      // Note Galaxy pvc claim is the nfs disk size minus 50G
-      raw"""galaxy.persistence.size=${(nfsDisk.size.gb - 50).toString}Gi"""
-    ) ++ configs ++ galaxyRestoreSettings
-  }
-
   def buildCromwellAppChartOverrideValuesString(config: GKEInterpreterConfig,
                                                 appName: AppName,
                                                 cluster: KubernetesCluster,
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index f3d609f228..b42d783d04 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -6,7 +6,17 @@ import cats.effect.Async
 import cats.mtl.Ask
 import cats.syntax.all._
 import com.google.auth.oauth2.GoogleCredentials
-import com.google.cloud.compute.v1.Disk
+import com.google.cloud.compute.v1.{
+  AccessConfig,
+  AttachedDisk,
+  AttachedDiskInitializeParams,
+  Instance,
+  Items,
+  Metadata,
+  NetworkInterface,
+  ServiceAccount,
+  Tags
+}
 import com.google.container.v1._
 import fs2.io.file.Files
 import org.broadinstitute.dsde.workbench.DoneCheckable
@@ -27,26 +37,27 @@ import org.broadinstitute.dsde.workbench.google2.{
   streamFUntilDone,
   streamUntilDoneOrTimeout,
   tracedRetryF,
-  DiskName,
   GoogleComputeService,
   GoogleDiskService,
   GoogleResourceService,
   KubernetesClusterNotFoundException,
-  PvName,
+  NetworkName,
+  RegionName,
+  SubnetworkName,
   ZoneName
 }
+import org.broadinstitute.dsde.workbench.util2.InstanceName
 import org.broadinstitute.dsde.workbench.leonardo.dao.{AppDAO, AppDescriptorDAO}
 import org.broadinstitute.dsde.workbench.leonardo.db._
 import org.broadinstitute.dsde.workbench.leonardo.http._
 import org.broadinstitute.dsde.workbench.leonardo.http.service.AppNotFoundException
+import org.broadinstitute.dsde.workbench.leonardo.dao.google.{buildMachineTypeUri, buildSubnetworkUri}
 import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{
   buildAllowedAppChartOverrideValuesString,
   buildCromwellAppChartOverrideValuesString,
-  buildCustomChartOverrideValuesString,
-  buildGalaxyChartOverrideValuesString
+  buildCustomChartOverrideValuesString
 }
 import org.broadinstitute.dsde.workbench.leonardo.model.LeoException
-import org.broadinstitute.dsde.workbench.leonardo.monitor.PubsubHandleMessageError.PubsubKubernetesError
 import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._
 import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject}
 import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail}
@@ -387,21 +398,52 @@ class GKEInterpreter[F[_]](
                             )
       )
       app = dbApp.app
-      namespaceName = app.appResources.namespace
       dbCluster = dbApp.cluster
-      gkeClusterId = dbCluster.getClusterId
       googleProject = params.googleProject
 
-      // TODO: This DB query might not be needed if it makes sense to add diskId in App model (will revisit in next PR)
       diskOpt <- appQuery.getDiskId(app.id).transaction
       diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId))
 
-      // Create namespace and secrets
+      _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.")
+
+      nfsDisk <- F.fromOption(
+        dbApp.app.appResources.disk,
+        AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}")
+      )
+
+      // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path.
+      _ <- app.appType match {
+        case AppType.Galaxy =>
+          installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >>
+            persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void
+
+        case _ =>
+          createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx)
+      }
+
       _ <- logger.info(ctx.loggingCtx)(
-        s"Begin App(${app.appName.value}) Creation."
+        s"Finished app creation for app ${app.appName.value}"
       )
 
-      // Create KSA
+      readyTime <- F.realTimeInstant
+      _ <- appUsageQuery.recordStart(params.appId, readyTime)
+      _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction
+    } yield ()
+
+  // GKE/Helm path for non-Galaxy app types (Cromwell, Allowed, Custom).
+  private def createAndPollAppViaHelm(
+    params: CreateAppParams,
+    dbApp: GetAppResult,
+    app: App,
+    dbCluster: KubernetesCluster,
+    nfsDisk: PersistentDisk,
+    diskId: DiskId,
+    googleProject: GoogleProject,
+    ctx: AppContext
+  )(implicit ev: Ask[F, AppContext]): F[Unit] = {
+    val namespaceName = app.appResources.namespace
+    val gkeClusterId = dbCluster.getClusterId
+    for {
       ksaName <- F.fromOption(
         app.appResources.kubernetesServiceAccountName,
         AppCreationException(
@@ -424,11 +466,6 @@ class GKEInterpreter[F[_]](
             )
       }
 
-      nfsDisk <- F.fromOption(
-        dbApp.app.appResources.disk,
-        AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}")
-      )
-
       helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName)
 
       _ <- helmClient
@@ -442,12 +479,8 @@ class GKEInterpreter[F[_]](
           true
         )
         .run(helmAuthContext)
-      // update KSA in DB
       _ <- appQuery.updateKubernetesServiceAccount(app.id, ksaName).transaction
 
-      // Associate GSA to newly created KSA
-      // This string is constructed based on Google requirements to associate a GSA to a KSA
-      // (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#creating_a_relationship_between_ksas_and_gsas)
       ksaToGsa = s"${googleProject.value}.svc.id.goog[${namespaceName.value}/${ksaName.value}]"
       call = F.fromFuture(
         F.delay(
@@ -458,49 +491,15 @@ class GKEInterpreter[F[_]](
           )
         )
       )
-      retryConfig = RetryPredicates.retryConfigWithPredicates(
-        when409
-      )
+      retryConfig = RetryPredicates.retryConfigWithPredicates(when409)
       _ <- tracedRetryF(retryConfig)(
         call,
         s"googleIamDAO.addIamPolicyBindingOnServiceAccount for GSA ${gsa.value} & KSA ${ksaName.value}"
       ).compile.lastOrError
 
-      // TODO: validate app release is the same as restore release
-      appRestore: Option[AppRestore] <- persistentDiskQuery.getAppDiskRestore(diskId).transaction
-      galaxyRestore: Option[AppRestore.GalaxyRestore] = appRestore.flatMap {
-        case a: AppRestore.GalaxyRestore => Some(a)
-        case _: AppRestore.Other         => None
-      }
-
       nodepool = if (app.autopilot.isDefined) None else Some(dbApp.nodepool.nodepoolName)
-      // helm install and wait
+
       _ <- app.appType match {
-        case AppType.Galaxy =>
-          for {
-            machineType <- F.fromOption(
-              params.appMachineType,
-              new LeoException(
-                s"can't find machine config for ${googleProject.value}/${app.appName.value}. This should never happen",
-                traceId = Some(ctx.traceId)
-              )
-            )
-            _ <- installGalaxy(
-              helmAuthContext,
-              app.appName,
-              app.release,
-              app.chart,
-              dbCluster,
-              dbApp.nodepool.nodepoolName, // https://broadworkbench.atlassian.net/browse/IA-4987
-              namespaceName,
-              app.auditInfo.creator,
-              app.customEnvironmentVariables,
-              ksaName,
-              nfsDisk,
-              machineType,
-              galaxyRestore
-            )
-          } yield ()
         case AppType.Cromwell =>
           installCromwellApp(
             helmAuthContext,
@@ -551,60 +550,15 @@ class GKEInterpreter[F[_]](
           F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP"))
       }
 
-      _ <- logger.info(ctx.loggingCtx)(
-        s"Finished app creation for app ${app.appName.value} in cluster ${gkeClusterId.toString}"
-      )
-
       _ <- app.appType match {
-        case AppType.Galaxy =>
-          if (galaxyRestore.isDefined) persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void
-          else
-            for {
-              pvcs <- kubeService.listPersistentVolumeClaims(gkeClusterId,
-                                                             KubernetesNamespace(app.appResources.namespace)
-              )
-
-              _ <- pvcs
-                // We added an extra -galaxy here: https://github.com/galaxyproject/galaxykubeman-helm/blob/f7f27be74c213deda3ae53122[…]959c96480bb21f/galaxykubeman/templates/config-setup-galaxy.yaml
-                .find(pvc => pvc.getMetadata.getName == s"${app.release.asString}-galaxy-galaxy-pvc")
-                .fold(
-                  F.raiseError[Unit](
-                    PubsubKubernetesError(AppError("Fail to retrieve pvc ids",
-                                                   ctx.now,
-                                                   ErrorAction.CreateApp,
-                                                   ErrorSource.App,
-                                                   None,
-                                                   Some(ctx.traceId)
-                                          ),
-                                          Some(app.id),
-                                          false,
-                                          None,
-                                          None,
-                                          None
-                    )
-                  )
-                ) { galaxyPvc =>
-                  val galaxyDiskRestore = AppRestore.GalaxyRestore(
-                    PvcId(galaxyPvc.getMetadata.getUid),
-                    app.id
-                  )
-                  persistentDiskQuery
-                    .updateGalaxyDiskRestore(diskId, galaxyDiskRestore)
-                    .transaction
-                    .void
-                }
-            } yield ()
         case AppType.Cromwell => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction
         case AppType.Allowed  => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction
         case AppType.Custom   => F.unit
         case _ =>
           F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP"))
       }
-
-      readyTime <- F.realTimeInstant
-      _ <- appUsageQuery.recordStart(params.appId, readyTime)
-      _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction
     } yield ()
+  }
 
   override def deleteAndPollCluster(params: DeleteClusterParams)(implicit ev: Ask[F, AppContext]): F[Unit] =
     for {
@@ -725,77 +679,86 @@ class GKEInterpreter[F[_]](
       // Resolve the cluster in Google
       googleClusterOpt <- gkeService.getCluster(gkeClusterId)
 
-      _ <- googleClusterOpt
-        .traverse { googleCluster =>
-          val uninstallCharts = for {
-            helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName)
-
-            _ <- logger.info(ctx.loggingCtx)(
-              s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}"
+      _ <- app.appType match {
+        case AppType.Galaxy =>
+          // Galaxy runs on a VM: delete the GCE instance. Disks are retained for persistence.
+          for {
+            gp <- F.fromOption(
+              LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext),
+              new RuntimeException("Galaxy app cloud context should be a google project")
             )
-
-            // helm uninstall the app chart and wait
-            _ <- helmClient
-              .uninstall(app.release, config.galaxyAppConfig.uninstallKeepHistory)
-              .run(helmAuthContext)
-
-            last <- streamFUntilDone(
-              kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)),
-              config.monitorConfig.deleteApp.maxAttempts,
-              config.monitorConfig.deleteApp.interval
-            ).compile.lastOrError
-
-            _ <-
-              if (!podDoneCheckable.isDone(last)) {
-                val msg =
-                  s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last
-                      .filterNot(isPodDone)
-                      .map(_.name.value)
-                      .mkString(", ")}"
-                logger.error(ctx.loggingCtx)(msg) >>
-                  F.raiseError[Unit](AppDeletionException(msg))
-              } else F.unit
-
-            // helm uninstall the setup chart
-            _ <- helmClient
-              .uninstall(
-                getTerraAppSetupChartReleaseName(app.release),
-                config.galaxyAppConfig.uninstallKeepHistory
-              )
-              .run(helmAuthContext)
+            instanceName = InstanceName(s"galaxy-${app.appName.value}")
+            zone = dbApp.app.appResources.disk.map(_.zone).getOrElse(ZoneName(config.clusterConfig.location.value))
+            _ <- computeService
+              .deleteInstance(gp, zone, instanceName)
+              .void
+              .handleErrorWith { e =>
+                logger.warn(ctx.loggingCtx)(
+                  s"Failed to delete Galaxy VM ${instanceName.value}: ${e.getMessage}. Continuing with app deletion."
+                )
+              }
           } yield ()
 
-          uninstallCharts.handleErrorWith { e =>
-            logger.info(ctx.loggingCtx)(
-              s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}"
-            )
-          }
-        }
-
-      // delete the namespace only after the helm uninstall completes
-      _ <- kubeService.deleteNamespace(dbApp.cluster.getClusterId,
-                                       KubernetesNamespace(dbApp.app.appResources.namespace)
-      )
+        case _ =>
+          // GKE/Helm path for all other app types
+          googleClusterOpt
+            .traverse { googleCluster =>
+              val uninstallCharts = for {
+                helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName)
+
+                _ <- logger.info(ctx.loggingCtx)(
+                  s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}"
+                )
 
-      fa = kubeService
-        .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace))
-        .map(!_) // mapping to inverse because booleanDoneCheckable defines `Done` when it becomes `true`...In this case, the namespace will exists for a while, and eventually becomes non-existent
+                _ <- helmClient
+                  .uninstall(app.release, true)
+                  .run(helmAuthContext)
+
+                last <- streamFUntilDone(
+                  kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)),
+                  config.monitorConfig.deleteApp.maxAttempts,
+                  config.monitorConfig.deleteApp.interval
+                ).compile.lastOrError
+
+                _ <-
+                  if (!podDoneCheckable.isDone(last)) {
+                    val msg =
+                      s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last
+                          .filterNot(isPodDone)
+                          .map(_.name.value)
+                          .mkString(", ")}"
+                    logger.error(ctx.loggingCtx)(msg) >>
+                      F.raiseError[Unit](AppDeletionException(msg))
+                  } else F.unit
+
+                _ <- helmClient
+                  .uninstall(getTerraAppSetupChartReleaseName(app.release), true)
+                  .run(helmAuthContext)
+              } yield ()
+
+              uninstallCharts.handleErrorWith { e =>
+                logger.info(ctx.loggingCtx)(
+                  s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}"
+                )
+              }
+            }
+            .void >>
+            kubeService
+              .deleteNamespace(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) >>
+            streamUntilDoneOrTimeout(
+              kubeService
+                .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace))
+                .map(!_),
+              60,
+              5 seconds,
+              "delete namespace timed out"
+            )
+      }
 
-      _ <- streamUntilDoneOrTimeout(fa, 60, 5 seconds, "delete namespace timed out")
       _ <- logger.info(ctx.loggingCtx)(
-        s"Delete app operation has finished for app ${app.appName.value} in cluster ${gkeClusterId.toString}"
+        s"Delete app operation has finished for app ${app.appName.value}"
       )
 
-      appRestore: Option[AppRestore.GalaxyRestore] = dbApp.app.appResources.disk.flatMap(_.appRestore).flatMap {
-        case a: AppRestore.GalaxyRestore => Some(a)
-        case _: AppRestore.Other         => None
-      }
-      _ <- appRestore.traverse { restore =>
-        for {
-          _ <- kubeService.deletePv(dbCluster.getClusterId, PvName(s"pvc-${restore.galaxyPvcId.asString}"))
-        } yield ()
-      }
-
       _ <-
         if (!params.errorAfterDelete) {
           F.unit
@@ -1016,29 +979,6 @@ class GKEInterpreter[F[_]](
         }
     } yield ()
 
-  private[leonardo] def getGalaxyPostgresDisk(diskName: DiskName,
-                                              namespaceName: NamespaceName,
-                                              project: GoogleProject,
-                                              zone: ZoneName
-  )(implicit traceId: Ask[F, AppContext]): F[Option[Disk]] =
-    for {
-      postgresDiskOpt <- googleDiskService
-        .getDisk(
-          project,
-          zone,
-          getGalaxyPostgresDiskName(diskName, config.galaxyDiskConfig.postgresDiskNameSuffix)
-        )
-      res <- postgresDiskOpt match {
-        case Some(disk) => F.pure(Some(disk))
-        case None =>
-          googleDiskService.getDisk(
-            project,
-            zone,
-            getOldStyleGalaxyPostgresDiskName(namespaceName, config.galaxyDiskConfig.postgresDiskNameSuffix)
-          )
-      }
-    } yield res
-
   private[util] def installNginx(dbCluster: KubernetesCluster, googleCluster: Cluster)(implicit
     ev: Ask[F, AppContext]
   ): F[IP] =
@@ -1085,94 +1025,204 @@ class GKEInterpreter[F[_]](
       )
     } yield loadBalancerIp
 
-  private[util] def installGalaxy(helmAuthContext: AuthContext,
-                                  appName: AppName,
-                                  release: Release,
-                                  chart: Chart,
-                                  dbCluster: KubernetesCluster,
-                                  nodepoolName: NodepoolName,
-                                  namespaceName: NamespaceName,
-                                  userEmail: WorkbenchEmail,
-                                  customEnvironmentVariables: Map[String, String],
-                                  kubernetesServiceAccount: ServiceAccountName,
-                                  nfsDisk: PersistentDisk,
-                                  machineType: AppMachineType,
-                                  galaxyRestore: Option[AppRestore.GalaxyRestore]
-  )(implicit
-    ev: Ask[F, AppContext]
-  ): F[Unit] =
+  private[util] def installGalaxyVm(
+    dbCluster: KubernetesCluster,
+    app: App,
+    nfsDisk: PersistentDisk,
+    googleProject: GoogleProject
+  )(implicit ev: Ask[F, AppContext]): F[Unit] =
     for {
       ctx <- ev.ask
 
       _ <- logger.info(ctx.loggingCtx)(
-        s"Installing helm chart $chart for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}"
-      )
-      googleProject <- F.fromOption(
-        LeoLenses.cloudContextToGoogleProject.get(nfsDisk.cloudContext),
-        new RuntimeException("this should never happen. Galaxy disk's cloud context should be a google project")
+        s"Installing Galaxy VM for app ${app.appName.value} in project ${googleProject.value}"
       )
-      postgresDiskNameOpt <- for {
-        disk <- getGalaxyPostgresDisk(nfsDisk.name, namespaceName, googleProject, nfsDisk.zone)
-      } yield disk.map(x => DiskName(x.getName))
 
-      postgresDiskName <- F.fromOption(
-        postgresDiskNameOpt,
-        AppCreationException(s"No postgres disk found in google for app ${appName.value} ", traceId = Some(ctx.traceId))
+      zoneParam = nfsDisk.zone
+      regionParam = RegionName(zoneParam.value.dropRight(2))
+
+      // Set up VPC and firewall
+      (network, subnetwork) <- vpcAlg.setUpProjectNetworkAndFirewalls(
+        SetUpProjectNetworkParams(googleProject, regionParam)
       )
 
-      chartValues = buildGalaxyChartOverrideValuesString(
-        config,
-        appName,
-        release,
-        dbCluster,
-        nodepoolName,
-        userEmail,
-        customEnvironmentVariables,
-        kubernetesServiceAccount,
-        namespaceName,
-        nfsDisk,
-        postgresDiskName,
-        machineType,
-        galaxyRestore
+      // Load cloud-config content bundled from galaxy-k8s-boot bin/user_data.sh.
+      // Intentionally not fetched at runtime to avoid unexpected production changes.
+      // To update, sync manually from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh
+      userDataContent = scala.io.Source
+        .fromResource("init-resources/galaxy-user-data.sh")
+        .getLines()
+        .toList
+        .mkString("\n")
+
+      // Derive postgres disk name using the same naming convention as the subscriber
+      postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, config.galaxyDiskConfig.postgresDiskNameSuffix)
+
+      // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk)
+      pvSizeGi = math.max(1, nfsDisk.size.gb - 11)
+      pvSize   = s"${pvSizeGi}Gi"
+
+      // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default
+      gcpBatchSa = app.customEnvironmentVariables.getOrElse(
+        "gcp_batch_service_account_email",
+        config.galaxyVmConfig.gcpBatchServiceAccountEmail
       )
 
+      // restore_galaxy flag
+      restoreGalaxy = app.customEnvironmentVariables.getOrElse("restore_galaxy", "false")
+
+      // Disks
+      bootDisk = AttachedDisk
+        .newBuilder()
+        .setBoot(true)
+        .setAutoDelete(true)
+        .setInitializeParams(
+          AttachedDiskInitializeParams
+            .newBuilder()
+            .setSourceImage(config.galaxyVmConfig.sourceImage.asString)
+            .setDiskSizeGb(config.galaxyVmConfig.bootDiskSizeGb.gb)
+            .putAllLabels(Map("leonardo" -> "true").asJava)
+            .build()
+        )
+        .build()
+
+      // Galaxy data disk — device name must match what the bootstrap script expects
+      dataDisk = AttachedDisk
+        .newBuilder()
+        .setBoot(false)
+        .setDeviceName("galaxy-data")
+        .setAutoDelete(false)
+        .setInitializeParams(
+          AttachedDiskInitializeParams
+            .newBuilder()
+            .setDiskName(nfsDisk.name.value)
+            .setDiskSizeGb(nfsDisk.size.gb)
+            .setDiskType(nfsDisk.diskType.googleString(googleProject, zoneParam))
+            .putAllLabels(Map("leonardo" -> "true").asJava)
+            .build()
+        )
+        .build()
+
+      // PostgreSQL disk — device name must match what the bootstrap script expects
+      postgresDisk = AttachedDisk
+        .newBuilder()
+        .setBoot(false)
+        .setDeviceName("galaxy-postgres-data")
+        .setAutoDelete(false)
+        .setInitializeParams(
+          AttachedDiskInitializeParams
+            .newBuilder()
+            .setDiskName(postgresDiskName.value)
+            .setDiskSizeGb(config.galaxyVmConfig.postgresDiskSizeGb.gb)
+            .putAllLabels(Map("leonardo" -> "true").asJava)
+            .build()
+        )
+        .build()
+
+      // Network interface with external IP
+      networkInterface = NetworkInterface
+        .newBuilder()
+        .setSubnetwork(
+          buildSubnetworkUri(googleProject, regionParam, subnetwork)
+        )
+        .addAccessConfigs(AccessConfig.newBuilder().setName("Leonardo Galaxy VM external IP").build())
+        .build()
+
+      instanceName = InstanceName(s"galaxy-${app.appName.value}")
+
+      instance = Instance
+        .newBuilder()
+        .setName(instanceName.value)
+        .setDescription("Leonardo Galaxy VM")
+        .setTags(Tags.newBuilder().addItems(config.vpcNetworkTag.value).build())
+        .setMachineType(buildMachineTypeUri(zoneParam, config.galaxyVmConfig.machineType))
+        .addNetworkInterfaces(networkInterface)
+        .addAllDisks(List(bootDisk, dataDisk, postgresDisk).asJava)
+        .addServiceAccounts(
+          ServiceAccount
+            .newBuilder()
+            .setEmail(app.googleServiceAccount.value)
+            .addAllScopes(
+              List(
+                "https://www.googleapis.com/auth/cloud-platform",
+                "https://www.googleapis.com/auth/logging.write"
+              ).asJava
+            )
+            .build()
+        )
+        .setMetadata(
+          Metadata
+            .newBuilder()
+            .addItems(Items.newBuilder().setKey("user-data").setValue(userDataContent).build())
+            .addItems(Items.newBuilder().setKey("google-logging-enabled").setValue("true").build())
+            .addItems(Items.newBuilder().setKey("gcp_batch_service_account_email").setValue(gcpBatchSa).build())
+            .addItems(Items.newBuilder().setKey("persistent-volume-size").setValue(pvSize).build())
+            .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restoreGalaxy).build())
+            .addItems(Items.newBuilder().setKey("git-repo").setValue(config.galaxyVmConfig.gitRepo).build())
+            .addItems(Items.newBuilder().setKey("git-branch").setValue(config.galaxyVmConfig.gitBranch).build())
+            .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build())
+            .addItems(Items.newBuilder().setKey("gcp-network").setValue(network.value).build())
+            .addItems(Items.newBuilder().setKey("gcp-subnet").setValue(subnetwork.value).build())
+            .build()
+        )
+        .putAllLabels(Map("leonardo" -> "true").asJava)
+        .build()
+
+      _ <- computeService.createInstance(googleProject, zoneParam, instance)
+
       _ <- logger.info(ctx.loggingCtx)(
-        s"Chart override values are: ${chartValues.map(s =>
-            if (s.contains("galaxyDatabasePassword")) "persistence.postgres.galaxyDatabasePassword=<redacted>"
-            else s
-          )}"
+        s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP"
       )
 
-      // Invoke helm
-      helmInstall = helmClient
-        .installChart(
-          release,
-          chart.name,
-          chart.version,
-          org.broadinstitute.dsp.Values(chartValues.mkString(",")),
-          false
+      // Poll until the instance has an external IP, then store it as the cluster's load balancer IP
+      // so that KubernetesDnsCache can resolve the proxy host.
+      externalIpOpt <- streamFUntilDone(
+        computeService.getInstance(googleProject, zoneParam, instanceName).map { instanceOpt =>
+          instanceOpt.flatMap { inst =>
+            import scala.jdk.CollectionConverters._
+            for {
+              iface  <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption)
+              cfg    <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption)
+              natIp  <- Option(cfg.getNatIP).filter(_.nonEmpty)
+            } yield IP(natIp)
+          }
+        },
+        config.monitorConfig.createApp.maxAttempts,
+        config.monitorConfig.createApp.interval
+      ).compile.lastOrError
+
+      externalIp <- F.fromOption(
+        externalIpOpt,
+        AppCreationException(
+          s"Galaxy VM ${instanceName.value} did not obtain an external IP after ${config.monitorConfig.createApp.interruptAfter}",
+          traceId = Some(ctx.traceId)
         )
-        .run(helmAuthContext)
+      )
 
-      // Currently we always retry.
-      // The main failure mode here is helm install, which does not have easily interpretable error codes
-      retryConfig = RetryPredicates.retryAllConfig
-      _ <- tracedRetryF(retryConfig)(
-        helmInstall,
-        s"helm install for app ${appName.value} in project ${dbCluster.cloudContext.asString}"
-      ).compile.lastOrError
+      _ <- logger.info(ctx.loggingCtx)(
+        s"Galaxy VM ${instanceName.value} has external IP ${externalIp.asString}; storing in cluster async fields"
+      )
 
-      googleProject <- F.fromOption(
-        LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext),
-        new RuntimeException("trying to create a non google runtime in GKEInterpreter. This should never happen")
+      // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache
+      _ <- kubernetesClusterQuery
+        .updateAsyncFields(
+          dbCluster.id,
+          KubernetesClusterAsyncFields(
+            externalIp,
+            IP(""),
+            NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
+          )
+        )
+        .transaction
+      _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction
+
+      _ <- logger.info(ctx.loggingCtx)(
+        s"Polling Galaxy readiness for app ${app.appName.value} at ${externalIp.asString}:80"
       )
-      // Poll galaxy until it starts up
-      // TODO potentially add other status checks for pod readiness, beyond just HTTP polling the galaxy-web service
-      // Wait a bit before starting polling for the app status check as the certificates might not be quite ready yet
-      // This seems to only impact galaxy, See https://broadworkbench.atlassian.net/browse/IA-4551
-      _ <- F.sleep(60 seconds)
+
+      // Wait for Galaxy's nginx ingress to respond on port 80
       isDone <- streamFUntilDone(
-        appDao.isProxyAvailable(googleProject, appName, ServiceName("galaxy"), ctx.traceId),
+        appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId),
         config.monitorConfig.createApp.maxAttempts,
         config.monitorConfig.createApp.interval
       ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError
@@ -1180,9 +1230,9 @@ class GKEInterpreter[F[_]](
       _ <-
         if (!isDone) {
           val msg =
-            s"Galaxy installation has failed or timed out for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}"
+            s"Galaxy VM installation has failed or timed out for app ${app.appName.value} in project ${googleProject.value}"
           logger.error(ctx.loggingCtx)(msg) >>
-            F.raiseError[Unit](AppCreationException(msg))
+            F.raiseError[Unit](AppCreationException(msg, traceId = Some(ctx.traceId)))
         } else F.unit
 
     } yield ()
@@ -1833,14 +1883,14 @@ final case class GKEInterpreterConfig(leoUrlBase: URL,
                                       vpcNetworkTag: NetworkTag,
                                       terraAppSetupChartConfig: TerraAppSetupChartConfig,
                                       ingressConfig: KubernetesIngressConfig,
-                                      galaxyAppConfig: GalaxyAppConfig,
                                       cromwellAppConfig: CromwellAppConfig,
                                       customAppConfig: CustomAppConfig,
                                       allowedAppConfig: AllowedAppConfig,
                                       monitorConfig: AppMonitorConfig,
                                       clusterConfig: KubernetesClusterConfig,
                                       proxyConfig: ProxyConfig,
-                                      galaxyDiskConfig: GalaxyDiskConfig
+                                      galaxyDiskConfig: GalaxyDiskConfig,
+                                      galaxyVmConfig: GalaxyVmConfig
 )
 
 final case class TerraAppSetupChartConfig(
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala
index 6ea504a907..a408cd4559 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala
@@ -4,7 +4,6 @@ package util
 import org.broadinstitute.dsde.workbench.google2.DiskName
 import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName
 import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName}
-import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore
 import org.broadinstitute.dsde.workbench.leonardo.CommonTestData.{makePersistentDisk, userEmail, userEmail2}
 import org.broadinstitute.dsde.workbench.leonardo.KubernetesTestData.{makeCustomAppService, makeKubeCluster}
 import org.broadinstitute.dsde.workbench.leonardo.config.Config
@@ -16,177 +15,6 @@ import org.scalatest.flatspec.AnyFlatSpecLike
 
 class BuildHelmChartValuesSpec extends AnyFlatSpecLike with LeonardoTestSuite {
 
-  it should "build Galaxy override values string" in {
-    val savedCluster1 = makeKubeCluster(1)
-    val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy))
-    val res = buildGalaxyChartOverrideValuesString(
-      Config.gkeInterpConfig,
-      AppName("app1"),
-      Release("app1-galaxy-rls"),
-      savedCluster1,
-      NodepoolName("pool1"),
-      userEmail,
-      Map("WORKSPACE_NAME" -> "test-workspace",
-          "WORKSPACE_BUCKET" -> "gs://test-bucket",
-          "WORKSPACE_NAMESPACE" -> "dsp-leo-test1"
-      ),
-      ServiceAccountName("app1-galaxy-ksa"),
-      NamespaceName("ns"),
-      savedDisk1,
-      DiskName("disk1-gxy-postres-disk"),
-      AppMachineType(23, 7),
-      None
-    )
-
-    res.mkString(
-      ","
-    ) shouldBe
-      """nfs.storageClass.name=nfs-app1-galaxy-rls,""" +
-      """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" +
-      """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" +
-      """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" +
-      """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" +
-      """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" +
-      """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.tls[0].secretName=tls-secret,""" +
-      """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" +
-      """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" +
-      """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" +
-      """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" +
-      """galaxy.terra.launch.workspace=test-workspace,""" +
-      """galaxy.terra.launch.namespace=dsp-leo-test1,""" +
-      """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" +
-      """galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" +
-      """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" +
-      """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" +
-      """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" +
-      """galaxy.jobs.maxLimits.memory=23,""" +
-      """galaxy.jobs.maxLimits.cpu=7,""" +
-      """galaxy.jobs.maxRequests.memory=1,""" +
-      """galaxy.jobs.maxRequests.cpu=1,""" +
-      """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" +
-      """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" +
-      """galaxy.serviceAccount.create=false,""" +
-      """galaxy.serviceAccount.name=app1-galaxy-ksa,""" +
-      """rbac.serviceAccount=app1-galaxy-ksa,persistence.nfs.name=ns-nfs-disk,""" +
-      """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,""" +
-      """persistence.nfs.size=250Gi,""" +
-      """persistence.postgres.name=ns-postgres-disk,""" +
-      """galaxy.postgresql.galaxyDatabasePassword=replace-me,""" +
-      """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres-disk,""" +
-      """persistence.postgres.size=10Gi,""" +
-      """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" +
-      """nfs.persistence.size=250Gi,""" +
-      """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" +
-      """galaxy.persistence.size=200Gi,""" +
-      """configs.WORKSPACE_NAME=test-workspace,""" +
-      """extraEnv[0].name=WORKSPACE_NAME,extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" +
-      """configs.WORKSPACE_BUCKET=gs://test-bucket,""" +
-      """extraEnv[1].name=WORKSPACE_BUCKET,""" +
-      """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" +
-      """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" +
-      """extraEnv[2].name=WORKSPACE_NAMESPACE,""" +
-      """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE"""
-  }
-
-  it should "build Galaxy override values string with restore info" in {
-    val savedCluster1 = makeKubeCluster(1)
-    val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy))
-    val result =
-      buildGalaxyChartOverrideValuesString(
-        Config.gkeInterpConfig,
-        AppName("app1"),
-        Release("app1-galaxy-rls"),
-        savedCluster1,
-        NodepoolName("pool1"),
-        userEmail,
-        Map("WORKSPACE_NAME" -> "test-workspace",
-            "WORKSPACE_BUCKET" -> "gs://test-bucket",
-            "WORKSPACE_NAMESPACE" -> "dsp-leo-test1"
-        ),
-        ServiceAccountName("app1-galaxy-ksa"),
-        NamespaceName("ns"),
-        savedDisk1,
-        DiskName("disk1-gxy-postres"),
-        AppMachineType(23, 7),
-        Some(
-          GalaxyRestore(PvcId("galaxy-pvc-id"), AppId(123))
-        )
-      )
-    result.mkString(
-      ","
-    ) shouldBe
-      """nfs.storageClass.name=nfs-app1-galaxy-rls,""" +
-      """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" +
-      """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" +
-      """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" +
-      """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" +
-      """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" +
-      """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" +
-      """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.ingress.tls[0].secretName=tls-secret,""" +
-      """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" +
-      """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" +
-      """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" +
-      """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" +
-      """galaxy.terra.launch.workspace=test-workspace,""" +
-      """galaxy.terra.launch.namespace=dsp-leo-test1,""" +
-      """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" +
-      """galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" +
-      """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" +
-      """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" +
-      """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" +
-      """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" +
-      """galaxy.jobs.maxLimits.memory=23,""" +
-      """galaxy.jobs.maxLimits.cpu=7,""" +
-      """galaxy.jobs.maxRequests.memory=1,""" +
-      """galaxy.jobs.maxRequests.cpu=1,""" +
-      """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" +
-      """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" +
-      """galaxy.serviceAccount.create=false,""" +
-      """galaxy.serviceAccount.name=app1-galaxy-ksa,""" +
-      """rbac.serviceAccount=app1-galaxy-ksa,""" +
-      """persistence.nfs.name=ns-nfs-disk,""" +
-      """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,persistence.nfs.size=250Gi,""" +
-      """persistence.postgres.name=ns-postgres-disk,""" +
-      """galaxy.postgresql.galaxyDatabasePassword=replace-me,""" +
-      """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres,""" +
-      """persistence.postgres.size=10Gi,""" +
-      """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" +
-      """nfs.persistence.size=250Gi,""" +
-      """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" +
-      """galaxy.persistence.size=200Gi,""" +
-      """configs.WORKSPACE_NAME=test-workspace,""" +
-      """extraEnv[0].name=WORKSPACE_NAME,""" +
-      """extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" +
-      """configs.WORKSPACE_BUCKET=gs://test-bucket,""" +
-      """extraEnv[1].name=WORKSPACE_BUCKET,""" +
-      """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" +
-      """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" +
-      """extraEnv[2].name=WORKSPACE_NAMESPACE,""" +
-      """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" +
-      """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE,""" +
-      """restore.persistence.nfs.galaxy.pvcID=galaxy-pvc-id,""" +
-      """galaxy.persistence.existingClaim=app1-galaxy-rls-galaxy-galaxy-pvc""".stripMargin
-  }
-
   it should "build Cromwell override values string" in {
     val savedCluster1 = makeKubeCluster(1)
     val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")))

From 8b9dffd1c3eb8828e3f5eab48b1def580bb3f7d0 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 7 Apr 2026 13:51:53 -0400
Subject: [PATCH 02/17] fix format

---
 .../leonardo/dao/HttpJupyterDAO.scala         |  4 +-
 .../leonardo/util/GKEInterpreter.scala        | 88 +++++++++----------
 2 files changed, 47 insertions(+), 45 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
index 0c55f089db..4d09a08109 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala
@@ -36,7 +36,9 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie
                 headers = Headers.empty
               )
             )
-            .handleErrorWith(e => logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false))
+            .handleErrorWith(e =>
+              logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false)
+            )
         case _ => F.pure(false)
       }
     } yield res
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index b42d783d04..782f4adabe 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -701,48 +701,46 @@ class GKEInterpreter[F[_]](
 
         case _ =>
           // GKE/Helm path for all other app types
-          googleClusterOpt
-            .traverse { googleCluster =>
-              val uninstallCharts = for {
-                helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName)
+          googleClusterOpt.traverse { googleCluster =>
+            val uninstallCharts = for {
+              helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName)
 
-                _ <- logger.info(ctx.loggingCtx)(
-                  s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}"
-                )
+              _ <- logger.info(ctx.loggingCtx)(
+                s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}"
+              )
 
-                _ <- helmClient
-                  .uninstall(app.release, true)
-                  .run(helmAuthContext)
-
-                last <- streamFUntilDone(
-                  kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)),
-                  config.monitorConfig.deleteApp.maxAttempts,
-                  config.monitorConfig.deleteApp.interval
-                ).compile.lastOrError
-
-                _ <-
-                  if (!podDoneCheckable.isDone(last)) {
-                    val msg =
-                      s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last
-                          .filterNot(isPodDone)
-                          .map(_.name.value)
-                          .mkString(", ")}"
-                    logger.error(ctx.loggingCtx)(msg) >>
-                      F.raiseError[Unit](AppDeletionException(msg))
-                  } else F.unit
-
-                _ <- helmClient
-                  .uninstall(getTerraAppSetupChartReleaseName(app.release), true)
-                  .run(helmAuthContext)
-              } yield ()
-
-              uninstallCharts.handleErrorWith { e =>
-                logger.info(ctx.loggingCtx)(
-                  s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}"
-                )
-              }
+              _ <- helmClient
+                .uninstall(app.release, true)
+                .run(helmAuthContext)
+
+              last <- streamFUntilDone(
+                kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)),
+                config.monitorConfig.deleteApp.maxAttempts,
+                config.monitorConfig.deleteApp.interval
+              ).compile.lastOrError
+
+              _ <-
+                if (!podDoneCheckable.isDone(last)) {
+                  val msg =
+                    s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last
+                        .filterNot(isPodDone)
+                        .map(_.name.value)
+                        .mkString(", ")}"
+                  logger.error(ctx.loggingCtx)(msg) >>
+                    F.raiseError[Unit](AppDeletionException(msg))
+                } else F.unit
+
+              _ <- helmClient
+                .uninstall(getTerraAppSetupChartReleaseName(app.release), true)
+                .run(helmAuthContext)
+            } yield ()
+
+            uninstallCharts.handleErrorWith { e =>
+              logger.info(ctx.loggingCtx)(
+                s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}"
+              )
             }
-            .void >>
+          }.void >>
             kubeService
               .deleteNamespace(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) >>
             streamUntilDoneOrTimeout(
@@ -1056,11 +1054,13 @@ class GKEInterpreter[F[_]](
         .mkString("\n")
 
       // Derive postgres disk name using the same naming convention as the subscriber
-      postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, config.galaxyDiskConfig.postgresDiskNameSuffix)
+      postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name,
+                                                              config.galaxyDiskConfig.postgresDiskNameSuffix
+      )
 
       // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk)
       pvSizeGi = math.max(1, nfsDisk.size.gb - 11)
-      pvSize   = s"${pvSizeGi}Gi"
+      pvSize = s"${pvSizeGi}Gi"
 
       // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default
       gcpBatchSa = app.customEnvironmentVariables.getOrElse(
@@ -1181,9 +1181,9 @@ class GKEInterpreter[F[_]](
           instanceOpt.flatMap { inst =>
             import scala.jdk.CollectionConverters._
             for {
-              iface  <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption)
-              cfg    <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption)
-              natIp  <- Option(cfg.getNatIP).filter(_.nonEmpty)
+              iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption)
+              cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption)
+              natIp <- Option(cfg.getNatIP).filter(_.nonEmpty)
             } yield IP(natIp)
           }
         },

From ee1c16a98befa34f6d983caa5231c90f1fc54164 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 7 Apr 2026 14:41:45 -0400
Subject: [PATCH 03/17] Grant pet SA GCP Batch IAM roles at Galaxy VM creation
 time

At Galaxy VM creation, grant the pet SA roles/batch.jobsEditor on the
user's project so it can submit and monitor GCP Batch jobs. When the
Batch SA lives in the same project, also grant serviceAccountUser on it;
cross-project Batch SAs must have that binding configured externally.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../leonardo/util/GKEInterpreter.scala        | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 782f4adabe..335e88fb24 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -60,6 +60,7 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{
 import org.broadinstitute.dsde.workbench.leonardo.model.LeoException
 import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._
 import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject}
+import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes
 import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail}
 import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
 import org.broadinstitute.dsp._
@@ -1170,6 +1171,45 @@ class GKEInterpreter[F[_]](
 
       _ <- computeService.createInstance(googleProject, zoneParam, instance)
 
+      // Grant the pet SA permission to submit and monitor GCP Batch jobs in this project.
+      // Galaxy uses the VM's attached SA (pet SA) to call the Batch API.
+      _ <- {
+        val call = F.fromFuture(
+          F.delay(
+            googleIamDAO
+              .addRoles(googleProject, app.googleServiceAccount, IamMemberTypes.ServiceAccount, Set("roles/batch.jobsEditor"))
+              .void
+          )
+        )
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409)
+        tracedRetryF(retryConfig)(
+          call,
+          s"googleIamDAO.addRoles(batch.jobsEditor) for pet SA ${app.googleServiceAccount.value} in project ${googleProject.value}"
+        ).compile.lastOrError
+      }
+
+      // Grant the pet SA serviceAccountUser on the Batch SA so it can specify it as the job runner identity.
+      // Only attempted when the Batch SA lives in the same project as the user (i.e. not a shared platform SA).
+      // For cross-project Batch SAs, this binding must be set up externally (e.g. via Terraform).
+      gcpBatchSaProject = GoogleProject(gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", ""))
+      _ <-
+        if (gcpBatchSaProject == googleProject)
+          F.fromFuture(
+            F.delay(
+              googleIamDAO.addIamPolicyBindingOnServiceAccount(
+                googleProject,
+                WorkbenchEmail(gcpBatchSa),
+                app.googleServiceAccount,
+                Set("roles/iam.serviceAccountUser")
+              )
+            )
+          )
+        else
+          logger.info(ctx.loggingCtx)(
+            s"Batch SA $gcpBatchSa is in a different project ($gcpBatchSaProject) than ${googleProject.value}; " +
+              s"skipping serviceAccountUser binding — must be configured externally"
+          )
+
       _ <- logger.info(ctx.loggingCtx)(
         s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP"
       )

From d48a60f6afee52371935452ce9567f310362680f Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 7 Apr 2026 16:09:40 -0400
Subject: [PATCH 04/17] add SA permissions and fix unit tests

---
 .../leonardo/util/GKEInterpreter.scala        | 10 ++-
 .../LeoPubsubMessageSubscriberSpec.scala      | 87 +++++++++++--------
 .../leonardo/util/VPCInterpreterSpec.scala    | 10 ++-
 3 files changed, 68 insertions(+), 39 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 335e88fb24..b0f52276c1 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1177,7 +1177,11 @@ class GKEInterpreter[F[_]](
         val call = F.fromFuture(
           F.delay(
             googleIamDAO
-              .addRoles(googleProject, app.googleServiceAccount, IamMemberTypes.ServiceAccount, Set("roles/batch.jobsEditor"))
+              .addRoles(googleProject,
+                        app.googleServiceAccount,
+                        IamMemberTypes.ServiceAccount,
+                        Set("roles/batch.jobsEditor")
+              )
               .void
           )
         )
@@ -1191,7 +1195,9 @@ class GKEInterpreter[F[_]](
       // Grant the pet SA serviceAccountUser on the Batch SA so it can specify it as the job runner identity.
       // Only attempted when the Batch SA lives in the same project as the user (i.e. not a shared platform SA).
       // For cross-project Batch SAs, this binding must be set up externally (e.g. via Terraform).
-      gcpBatchSaProject = GoogleProject(gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", ""))
+      gcpBatchSaProject = GoogleProject(
+        gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", "")
+      )
       _ <-
         if (gcpBatchSaProject == googleProject)
           F.fromFuture(
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
index d932498dc3..d7f351488d 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
@@ -10,7 +10,7 @@ import cats.mtl.Ask
 import cats.syntax.all._
 import com.github.benmanes.caffeine.cache.Caffeine
 import com.google.api.gax.longrunning.OperationFuture
-import com.google.cloud.compute.v1.{Disk, Operation}
+import com.google.cloud.compute.v1.{AccessConfig, Disk, Instance, NetworkInterface, Operation}
 import com.google.protobuf.Timestamp
 import fs2.Stream
 import org.broadinstitute.dsde.workbench.google.GoogleStorageDAO
@@ -21,13 +21,16 @@ import org.broadinstitute.dsde.workbench.google2.mock.{MockKubernetesService =>
 import org.broadinstitute.dsde.workbench.google2.{
   DiskName,
   GKEModels,
+  GoogleComputeService,
   GoogleDiskService,
   GoogleStorageService,
   KubernetesModels,
   MachineTypeName,
-  RegionName,
+  NetworkName,
+  SubnetworkName,
   ZoneName
 }
+import org.broadinstitute.dsde.workbench.util2.InstanceName
 import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore
 import org.broadinstitute.dsde.workbench.leonardo.AsyncTaskProcessor.Task
 import org.broadinstitute.dsde.workbench.leonardo.CommonTestData._
@@ -97,6 +100,24 @@ class LeoPubsubMessageSubscriberSpec
     ): Future[Unit] = Future.successful(())
   }
   val iamDAO = new MockGoogleIamDAO
+
+  // Returns a GCE instance with external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests.
+  val galaxyComputeService: GoogleComputeService[IO] = new FakeGoogleComputeService {
+    override def getInstance(project: GoogleProject, zone: ZoneName, instanceName: InstanceName)(implicit
+      ev: Ask[IO, TraceId]
+    ): IO[Option[Instance]] = {
+      val inst = Instance
+        .newBuilder()
+        .addNetworkInterfaces(
+          NetworkInterface
+            .newBuilder()
+            .addAccessConfigs(AccessConfig.newBuilder().setNatIP("1.2.3.4").build())
+            .build()
+        )
+        .build()
+      IO.pure(Some(inst))
+    }
+  }
   val resourceService = new FakeGoogleResourceService {
     override def getProjectNumber(project: GoogleProject)(implicit ev: Ask[IO, TraceId]): IO[Option[Long]] =
       IO(Some(1L))
@@ -903,32 +924,29 @@ class LeoPubsubMessageSubscriberSpec
       getDisk = getDiskOpt.get
       appRestore <- persistentDiskQuery.getAppDiskRestore(savedApp1.appResources.disk.get.id).transaction
       galaxyRestore = appRestore.map(_.asInstanceOf[GalaxyRestore])
-      ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap
-        .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1"))
     } yield {
       getCluster.status shouldBe KubernetesClusterStatus.Running
       getCluster.nodepools.size shouldBe 2
-      getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running
+      // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
+      getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified
       getApp.app.errors shouldBe List.empty
       getApp.app.status shouldBe AppStatus.Running
       getApp.app.appResources.kubernetesServiceAccountName shouldBe Some(
         ServiceAccountName("gxy-ksa")
       )
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
-      getApp.nodepool.status shouldBe NodepoolStatus.Running
+      // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
+      getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
+      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
         KubernetesClusterAsyncFields(IP("1.2.3.4"),
-                                     IP("0.0.0.0"),
-                                     NetworkFields(Config.vpcConfig.networkName,
-                                                   Config.vpcConfig.subnetworkName,
-                                                   ipRange
-                                     )
+                                     IP(""),
+                                     NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
       )
       getDisk.status shouldBe DiskStatus.Ready
-      galaxyRestore shouldBe Some(
-        GalaxyRestore(PvcId(s"nfs-pvc-id1"), getApp.app.id)
-      )
+      // Galaxy VM path does not use PVCs — no GalaxyRestore is recorded
+      galaxyRestore shouldBe None
     }
 
     implicit val gkeAlg: GKEAlgebra[IO] = makeGKEInterp(nodepoolLock, List(savedApp1.release))
@@ -1062,25 +1080,22 @@ class LeoPubsubMessageSubscriberSpec
         .transaction
       getApp1 = getAppOpt1.get
       getApp2 = getAppOpt2.get
-      ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap
-        .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1"))
     } yield {
       getApp1.cluster.status shouldBe KubernetesClusterStatus.Running
       getApp2.cluster.status shouldBe KubernetesClusterStatus.Running
-      getApp1.nodepool.status shouldBe NodepoolStatus.Running
-      getApp2.nodepool.status shouldBe NodepoolStatus.Running
+      // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
+      getApp1.nodepool.status shouldBe NodepoolStatus.Unspecified
+      getApp2.nodepool.status shouldBe NodepoolStatus.Unspecified
       getApp1.app.errors shouldBe List()
       getApp1.app.status shouldBe AppStatus.Running
       getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some(
         ServiceAccountName("gxy-ksa")
       )
+      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
       getApp1.cluster.asyncFields shouldBe Some(
         KubernetesClusterAsyncFields(IP("1.2.3.4"),
-                                     IP("0.0.0.0"),
-                                     NetworkFields(Config.vpcConfig.networkName,
-                                                   Config.vpcConfig.subnetworkName,
-                                                   ipRange
-                                     )
+                                     IP(""),
+                                     NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
       )
       getApp2.app.errors shouldBe List()
@@ -1298,7 +1313,9 @@ class LeoPubsubMessageSubscriberSpec
   it should "handle an error in delete app" in isolatedDbTest {
     val savedCluster1 = makeKubeCluster(1).save()
     val savedNodepool1 = makeNodepool(1, savedCluster1.id).save()
-    val savedApp1 = makeApp(1, savedNodepool1.id).save()
+    // Use Cromwell (GKE/Helm path) so the deleteNamespace error triggers AppStatus.Error.
+    // Galaxy now uses the VM path which swallows deleteInstance errors gracefully.
+    val savedApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell).save()
     val mockAckConsumer = mock[AckHandler]
 
     val assertions = for {
@@ -1676,7 +1693,9 @@ class LeoPubsubMessageSubscriberSpec
           savedApp1.appName,
           None,
           Map.empty,
-          AppType.Galaxy,
+          // Use Cromwell so the GKE cluster creation path is taken and mockGKEService.createCluster can throw.
+          // Galaxy skips cluster creation (VM path), so the mock would have no effect.
+          AppType.Cromwell,
           savedApp1.appResources.namespace,
           None,
           Some(tr),
@@ -1821,26 +1840,24 @@ class LeoPubsubMessageSubscriberSpec
       getApp = getAppOpt.get
       getDiskOpt <- persistentDiskQuery.getById(savedApp1.appResources.disk.get.id).transaction
       getDisk = getDiskOpt.get
-      ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap
-        .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1"))
     } yield {
       getCluster.status shouldBe KubernetesClusterStatus.Running
       getCluster.nodepools.size shouldBe 2
-      getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running
+      // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
+      getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified
       getApp.app.errors shouldBe List()
       getApp.app.status shouldBe AppStatus.Running
       getApp.app.appResources.kubernetesServiceAccountName shouldBe Some(
         ServiceAccountName("gxy-ksa")
       )
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
-      getApp.nodepool.status shouldBe NodepoolStatus.Running
+      // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
+      getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
+      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
         KubernetesClusterAsyncFields(IP("1.2.3.4"),
-                                     IP("0.0.0.0"),
-                                     NetworkFields(Config.vpcConfig.networkName,
-                                                   Config.vpcConfig.subnetworkName,
-                                                   ipRange
-                                     )
+                                     IP(""),
+                                     NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
       )
       getDisk.status shouldBe DiskStatus.Ready
@@ -2024,7 +2041,7 @@ class LeoPubsubMessageSubscriberSpec
       MockAppDescriptorDAO,
       lock,
       resourceService,
-      FakeGoogleComputeService
+      galaxyComputeService
     )
 
   def makeLeoSubscriber(
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala
index c6ad02ccc5..5b6a8b2190 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala
@@ -87,7 +87,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite {
         SetUpProjectFirewallsParams(project, vpcConfig.networkName, RegionName("us-central1"), Map.empty)
       )
       .unsafeRunSync()
-    computeService.firewallMap.size shouldBe 4
+    computeService.firewallMap.size shouldBe 5
     vpcConfig.firewallsToAdd.foreach { fwConfig =>
       val fw = computeService.firewallMap.get(FirewallRuleName(s"${fwConfig.namePrefix}-us-central1"))
       fw shouldBe defined
@@ -155,6 +155,12 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite {
     )
     val test = new VPCInterpreter(Config.vpcInterpreterConfig, stubResourceService(Map.empty), computeService)
 
+    val expectedHttpFirewallRules = FirewallRuleConfig(
+      "leonardo-allow-http",
+      None,
+      allSupportedRegions.map(r => r -> List(IpRange("0.0.0.0/0"))).toMap,
+      List(Allowed("tcp", Some("80")))
+    )
     test
       .firewallRulesToAdd(
         Map(
@@ -162,7 +168,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite {
           "leonardo-allow-https-firewall-name" -> "leonardo-ssl"
         )
       )
-      .toSet shouldBe Set(expectedSshFirewallRules, expectedIapFirewallRules)
+      .toSet shouldBe Set(expectedHttpFirewallRules, expectedSshFirewallRules, expectedIapFirewallRules)
   }
 
   private def stubResourceService(labels: Map[String, String]): FakeGoogleResourceService =

From 8bf243289e3a44e77877a1ca925cabf52d206981 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Wed, 8 Apr 2026 10:30:46 -0400
Subject: [PATCH 05/17] fix the last two unit tests

---
 .../leonardo/util/GKEInterpreter.scala        | 54 +++++++++++--------
 .../LeoPubsubMessageSubscriberSpec.scala      |  7 +--
 2 files changed, 36 insertions(+), 25 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index b0f52276c1..75dadd0df1 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -401,34 +401,44 @@ class GKEInterpreter[F[_]](
       app = dbApp.app
       dbCluster = dbApp.cluster
       googleProject = params.googleProject
+      // Idempotency: if the app is already Running, skip creation to avoid double-recording usage
+      _ <-
+        if (app.status == AppStatus.Running)
+          logger.info(ctx.loggingCtx)(
+            s"App ${app.appName.value} is already Running, skipping creation (idempotent)"
+          )
+        else
+          for {
+            diskOpt <- appQuery.getDiskId(app.id).transaction
+            diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId))
 
-      diskOpt <- appQuery.getDiskId(app.id).transaction
-      diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId))
-
-      _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.")
+            _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.")
 
-      nfsDisk <- F.fromOption(
-        dbApp.app.appResources.disk,
-        AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}")
-      )
+            nfsDisk <- F.fromOption(
+              dbApp.app.appResources.disk,
+              AppCreationException(
+                s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}"
+              )
+            )
 
-      // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path.
-      _ <- app.appType match {
-        case AppType.Galaxy =>
-          installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >>
-            persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void
+            // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path.
+            _ <- app.appType match {
+              case AppType.Galaxy =>
+                installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >>
+                  persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void
 
-        case _ =>
-          createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx)
-      }
+              case _ =>
+                createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx)
+            }
 
-      _ <- logger.info(ctx.loggingCtx)(
-        s"Finished app creation for app ${app.appName.value}"
-      )
+            _ <- logger.info(ctx.loggingCtx)(
+              s"Finished app creation for app ${app.appName.value}"
+            )
 
-      readyTime <- F.realTimeInstant
-      _ <- appUsageQuery.recordStart(params.appId, readyTime)
-      _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction
+            readyTime <- F.realTimeInstant
+            _ <- appUsageQuery.recordStart(params.appId, readyTime)
+            _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction
+          } yield ()
     } yield ()
 
   // GKE/Helm path for non-Galaxy app types (Cromwell, Allowed, Custom).
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
index d7f351488d..dd542cbd3a 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
@@ -1449,7 +1449,7 @@ class LeoPubsubMessageSubscriberSpec
     val savedNodepool1 = makeNodepool(1, savedCluster1.id).save()
 
     val disk = makePersistentDisk(None).save().unsafeRunSync()(cats.effect.unsafe.IORuntime.global)
-    val makeApp1 = makeApp(1, savedNodepool1.id)
+    val makeApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell)
     val savedApp1 = makeApp1
       .copy(appResources =
         makeApp1.appResources.copy(
@@ -1540,7 +1540,7 @@ class LeoPubsubMessageSubscriberSpec
           savedApp1.appName,
           Some(disk.id),
           Map.empty,
-          AppType.Galaxy,
+          AppType.Cromwell,
           savedApp1.appResources.namespace,
           None,
           Some(tr),
@@ -1889,7 +1889,8 @@ class LeoPubsubMessageSubscriberSpec
           false,
           Some(GcsBucketName("fc-bucket"))
         )
-        asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 10), queue)
+        // maxConcurrentTasks=1 ensures tasks run sequentially so the idempotency check fires for the 2nd task
+        asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 1), queue)
         // send message twice
         _ <- leoSubscriber.handleCreateAppMessage(msg)
         _ <- leoSubscriber.handleCreateAppMessage(msg)

From 8196bea7cdf1b666a9d816ada9997caba0d3ec17 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Fri, 10 Apr 2026 11:17:28 -0400
Subject: [PATCH 06/17] handle Batch SA creation and fix restore Galaxy

---
 http/src/main/resources/reference.conf        |   1 -
 .../workbench/leonardo/config/Config.scala    |   1 -
 .../leonardo/config/KubernetesAppConfig.scala |   1 -
 .../monitor/LeoPubsubMessageSubscriber.scala  |  11 +-
 .../workbench/leonardo/util/GKEAlgebra.scala  |   3 +-
 .../leonardo/util/GKEInterpreter.scala        | 107 +++++++++++++-----
 6 files changed, 89 insertions(+), 35 deletions(-)

diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf
index 7a9c841302..64b9b732fe 100644
--- a/http/src/main/resources/reference.conf
+++ b/http/src/main/resources/reference.conf
@@ -442,7 +442,6 @@ galaxyVm {
   # Suffix appended to the NFS disk name to derive the postgres disk name.
   # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix).
   postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix}
-  gcpBatchServiceAccountEmail = ""
   gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git"
   gitBranch = "master"
 }
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
index 42c7a53737..08eb73020d 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala
@@ -138,7 +138,6 @@ object Config {
       config.as[DiskSize]("bootDiskSizeGb"),
       config.as[DiskSize]("postgresDiskSizeGb"),
       config.as[String]("postgresDiskNameSuffix"),
-      config.as[String]("gcpBatchServiceAccountEmail"),
       config.as[String]("gitRepo"),
       config.as[String]("gitBranch")
     )
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
index 53e2562db6..bd6712279f 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala
@@ -100,7 +100,6 @@ final case class GalaxyVmConfig(
   bootDiskSizeGb: DiskSize,
   postgresDiskSizeGb: DiskSize,
   postgresDiskNameSuffix: String,
-  gcpBatchServiceAccountEmail: String,
   gitRepo: String,
   gitBranch: String
 )
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
index 97a6c984bd..fb71a5603d 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala
@@ -980,9 +980,14 @@ class LeoPubsubMessageSubscriber[F[_]](
         )
         .void
 
-      // create second Galaxy disk asynchronously
+      // Restore mode: disk already exists (no new disk to create) for a Galaxy app.
+      // In this case we skip creating new disks and pass restore=true so the VM
+      // runs the Ansible restore playbook instead of a fresh install.
+      restore = msg.appType == AppType.Galaxy && msg.createDisk.isEmpty
+
+      // create second Galaxy disk asynchronously (only for fresh installs)
       createSecondDiskOp =
-        if (msg.appType == AppType.Galaxy && disk.isDefined) {
+        if (msg.appType == AppType.Galaxy && disk.isDefined && !restore) {
           val d = disk.get // it's safe to do `.get` here because we've verified
           for {
             res <- createGalaxyPostgresDiskOnlyInGoogle(msg.project, ZoneName("us-central1-a"), msg.appName, d.name)
@@ -1012,7 +1017,7 @@ class LeoPubsubMessageSubscriber[F[_]](
         // create and monitor app
         _ <- getGkeAlgFromRegistry()
           .createAndPollApp(
-            CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount)
+            CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount, restore)
           )
           .onError { case e =>
             cleanUpAfterCreateAppError(msg.appId, msg.appName, msg.project, msg.createDisk, e)
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala
index 22108113e1..9a81c53064 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala
@@ -125,7 +125,8 @@ final case class CreateAppParams(appId: AppId,
                                  googleProject: GoogleProject,
                                  appName: AppName,
                                  appMachineType: Option[AppMachineType],
-                                 bucketNameToMount: Option[GcsBucketName]
+                                 bucketNameToMount: Option[GcsBucketName],
+                                 restore: Boolean = false
 )
 
 final case class DeleteClusterParams(clusterId: KubernetesClusterLeoId, googleProject: GoogleProject)
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 75dadd0df1..4bb7017ccd 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -8,8 +8,10 @@ import cats.syntax.all._
 import com.google.auth.oauth2.GoogleCredentials
 import com.google.cloud.compute.v1.{
   AccessConfig,
+  Allowed,
   AttachedDisk,
   AttachedDiskInitializeParams,
+  Firewall,
   Instance,
   Items,
   Metadata,
@@ -37,6 +39,7 @@ import org.broadinstitute.dsde.workbench.google2.{
   streamFUntilDone,
   streamUntilDoneOrTimeout,
   tracedRetryF,
+  FirewallRuleName,
   GoogleComputeService,
   GoogleDiskService,
   GoogleResourceService,
@@ -59,7 +62,12 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{
 }
 import org.broadinstitute.dsde.workbench.leonardo.model.LeoException
 import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._
-import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject}
+import org.broadinstitute.dsde.workbench.model.google.{
+  GcsBucketName,
+  GoogleProject,
+  ServiceAccountDisplayName,
+  ServiceAccountName
+}
 import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes
 import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail}
 import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
@@ -424,7 +432,7 @@ class GKEInterpreter[F[_]](
             // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path.
             _ <- app.appType match {
               case AppType.Galaxy =>
-                installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >>
+                installGalaxyVm(dbCluster, app, nfsDisk, googleProject, params.restore) >>
                   persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void
 
               case _ =>
@@ -1038,7 +1046,8 @@ class GKEInterpreter[F[_]](
     dbCluster: KubernetesCluster,
     app: App,
     nfsDisk: PersistentDisk,
-    googleProject: GoogleProject
+    googleProject: GoogleProject,
+    restore: Boolean
   )(implicit ev: Ask[F, AppContext]): F[Unit] =
     for {
       ctx <- ev.ask
@@ -1073,16 +1082,23 @@ class GKEInterpreter[F[_]](
       pvSizeGi = math.max(1, nfsDisk.size.gb - 11)
       pvSize = s"${pvSizeGi}Gi"
 
-      // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default
-      gcpBatchSa = app.customEnvironmentVariables.getOrElse(
-        "gcp_batch_service_account_email",
-        config.galaxyVmConfig.gcpBatchServiceAccountEmail
-      )
-
-      // restore_galaxy flag
-      restoreGalaxy = app.customEnvironmentVariables.getOrElse("restore_galaxy", "false")
+      // Get or create the galaxy-batch-runner SA in the user's project.
+      gcpBatchSa <- F
+        .fromFuture(
+          F.delay(
+            googleIamDAO.getOrCreateServiceAccount(
+              googleProject,
+              ServiceAccountName("galaxy-batch-runner"),
+              ServiceAccountDisplayName("Galaxy Batch Runner"),
+              executionContext
+            )
+          )
+        )
+        .map(sa => sa.email.value)
 
-      // Disks
+      // Disks — data and postgres disks are always pre-existing by the time this method runs
+      // (created by createDiskOp / createSecondDiskOp, or retained from a previous app).
+      // Use setSource to attach existing disks; only the boot disk is created fresh.
       bootDisk = AttachedDisk
         .newBuilder()
         .setBoot(true)
@@ -1103,14 +1119,8 @@ class GKEInterpreter[F[_]](
         .setBoot(false)
         .setDeviceName("galaxy-data")
         .setAutoDelete(false)
-        .setInitializeParams(
-          AttachedDiskInitializeParams
-            .newBuilder()
-            .setDiskName(nfsDisk.name.value)
-            .setDiskSizeGb(nfsDisk.size.gb)
-            .setDiskType(nfsDisk.diskType.googleString(googleProject, zoneParam))
-            .putAllLabels(Map("leonardo" -> "true").asJava)
-            .build()
+        .setSource(
+          s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${nfsDisk.name.value}"
         )
         .build()
 
@@ -1120,13 +1130,8 @@ class GKEInterpreter[F[_]](
         .setBoot(false)
         .setDeviceName("galaxy-postgres-data")
         .setAutoDelete(false)
-        .setInitializeParams(
-          AttachedDiskInitializeParams
-            .newBuilder()
-            .setDiskName(postgresDiskName.value)
-            .setDiskSizeGb(config.galaxyVmConfig.postgresDiskSizeGb.gb)
-            .putAllLabels(Map("leonardo" -> "true").asJava)
-            .build()
+        .setSource(
+          s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${postgresDiskName.value}"
         )
         .build()
 
@@ -1168,7 +1173,7 @@ class GKEInterpreter[F[_]](
             .addItems(Items.newBuilder().setKey("google-logging-enabled").setValue("true").build())
             .addItems(Items.newBuilder().setKey("gcp_batch_service_account_email").setValue(gcpBatchSa).build())
             .addItems(Items.newBuilder().setKey("persistent-volume-size").setValue(pvSize).build())
-            .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restoreGalaxy).build())
+            .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restore.toString).build())
             .addItems(Items.newBuilder().setKey("git-repo").setValue(config.galaxyVmConfig.gitRepo).build())
             .addItems(Items.newBuilder().setKey("git-branch").setValue(config.galaxyVmConfig.gitBranch).build())
             .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build())
@@ -1226,6 +1231,52 @@ class GKEInterpreter[F[_]](
               s"skipping serviceAccountUser binding — must be configured externally"
           )
 
+      // Grant the Batch SA the project-level roles it needs to run jobs and attach a service account to Batch VMs.
+      // See https://github.com/galaxyproject/galaxy-k8s-boot?tab=readme-ov-file#prerequisites
+      _ <- {
+        val call = F.fromFuture(
+          F.delay(
+            googleIamDAO
+              .addRoles(googleProject,
+                        WorkbenchEmail(gcpBatchSa),
+                        IamMemberTypes.ServiceAccount,
+                        Set("roles/batch.jobsEditor", "roles/iam.serviceAccountUser")
+              )
+              .void
+          )
+        )
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409)
+        tracedRetryF(retryConfig)(
+          call,
+          s"googleIamDAO.addRoles(batch.jobsEditor, iam.serviceAccountUser) for Batch SA $gcpBatchSa in project ${googleProject.value}"
+        ).compile.lastOrError
+      }
+
+      // Create an NFS firewall rule so GCP Batch VMs can reach the Galaxy VM's NFS server.
+      // Idempotent: skipped if the rule already exists.
+      nfsFwName = FirewallRuleName("leonardo-galaxy-allow-nfs-for-batch")
+      nfsFwExists <- computeService.getFirewallRule(googleProject, nfsFwName)
+      _ <-
+        if (nfsFwExists.isEmpty) {
+          val nfsFirewall = Firewall
+            .newBuilder()
+            .setName(nfsFwName.value)
+            .setNetwork(s"projects/${googleProject.value}/global/networks/${network.value}")
+            .addSourceRanges("10.0.0.0/8")
+            .addTargetTags(config.vpcNetworkTag.value)
+            .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("2049").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("2049").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("111").build())
+            .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("111").build())
+            .build()
+          computeService
+            .addFirewallRule(googleProject, nfsFirewall)
+            .flatMap(op => F.blocking(op.get()).void)
+        } else
+          logger.info(ctx.loggingCtx)(
+            s"NFS firewall rule ${nfsFwName.value} already exists, skipping creation"
+          )
+
       _ <- logger.info(ctx.loggingCtx)(
         s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP"
       )

From 95fa6b84b049f85c25200441bef6d3fe04287bf9 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Fri, 10 Apr 2026 11:42:46 -0400
Subject: [PATCH 07/17] fix compiler issues

---
 .../workbench/leonardo/util/GKEInterpreter.scala     | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 4bb7017ccd..719b57386b 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -62,12 +62,7 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{
 }
 import org.broadinstitute.dsde.workbench.leonardo.model.LeoException
 import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._
-import org.broadinstitute.dsde.workbench.model.google.{
-  GcsBucketName,
-  GoogleProject,
-  ServiceAccountDisplayName,
-  ServiceAccountName
-}
+import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject, ServiceAccountDisplayName}
 import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes
 import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail}
 import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
@@ -1088,9 +1083,8 @@ class GKEInterpreter[F[_]](
           F.delay(
             googleIamDAO.getOrCreateServiceAccount(
               googleProject,
-              ServiceAccountName("galaxy-batch-runner"),
-              ServiceAccountDisplayName("Galaxy Batch Runner"),
-              executionContext
+              org.broadinstitute.dsde.workbench.model.google.ServiceAccountName("galaxy-batch-runner"),
+              ServiceAccountDisplayName("Galaxy Batch Runner")
             )
           )
         )

From 86a49665ee73f4f9e904a08de9ed94b4885811d1 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Mon, 13 Apr 2026 08:55:22 -0400
Subject: [PATCH 08/17] Fix Leo proxy for Galaxy VM: use HTTP on port 80 via VM
 internal IP

---
 .../workbench/leonardo/dao/HttpAppDAO.scala   |  2 +-
 .../workbench/leonardo/dao/ProxyDAO.scala     |  5 +-
 .../leonardo/dns/KubernetesDnsCache.scala     |  8 +-
 .../leonardo/http/service/ProxyService.scala  | 86 +++++++++++++------
 .../leonardo/util/GKEInterpreter.scala        | 31 ++++---
 .../LeoPubsubMessageSubscriberSpec.scala      | 15 ++--
 6 files changed, 97 insertions(+), 50 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
index 84a04b45da..e846792f11 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
@@ -23,7 +23,7 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client:
                        traceId: TraceId
   ): F[Boolean] =
     Proxy.getAppTargetHost[F](kubernetesDnsCache, CloudContext.Gcp(googleProject), appName) flatMap {
-      case HostReady(targetHost, _, _) =>
+      case HostReady(targetHost, _, _, _) =>
         val serviceUrl = serviceName match {
           case ServiceName("welder-service") =>
             s"https://${targetHost.address}/proxy/google/v1/apps/${googleProject.value}/${appName.value}/${serviceName.value}/status/"
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala
index 6f6251a07d..ee1b07ddb2 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala
@@ -11,7 +11,10 @@ object HostStatus {
   final case object HostNotFound extends HostStatus
   final case object HostNotReady extends HostStatus
   final case object HostPaused extends HostStatus
-  final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider) extends HostStatus {
+  // useHttp = true means the proxy connects to the backend via plain HTTP (port 80) instead of
+  // HTTPS (proxyConfig.proxyPort). Used for Galaxy VM apps whose nginx serves HTTP only.
+  final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider, useHttp: Boolean = false)
+      extends HostStatus {
     def toUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/proxy/${path}")
 
     def toNotebooksUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/notebooks/${path}")
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
index 6f7ff4783c..4d286c7f4b 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
@@ -7,6 +7,7 @@ import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus
 import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus.{HostNotFound, HostNotReady, HostReady}
 import org.broadinstitute.dsde.workbench.leonardo.db.{DbReference, KubernetesServiceDbQueries}
 import org.broadinstitute.dsde.workbench.leonardo.http.{kubernetesProxyHost, GetAppResult}
+import org.broadinstitute.dsde.workbench.leonardo.AppType.Galaxy
 import org.broadinstitute.dsde.workbench.leonardo.{AppName, CloudContext, CloudProvider}
 import org.broadinstitute.dsde.workbench.model.IP
 import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics
@@ -48,8 +49,13 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics](
       case None => F.pure[HostStatus](HostNotReady)
       case Some(ip) =>
         val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain)
+        // Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP,
+        // and we map the fake hostname to the VM's internal IP (stored in loadBalancerIp).
+        val isGalaxyVm = appResult.app.appType == Galaxy
         hostToIpMapping
           .getAndUpdate(_ + (h.address -> ip))
-          .as[HostStatus](HostReady(h, "", CloudProvider.Gcp)) // TODO: update this once we start support AKS
+          .as[HostStatus](
+            HostReady(h, "", CloudProvider.Gcp, useHttp = isGalaxyVm)
+          ) // TODO: update this once we start support AKS
     }
 }
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
index 5d989946bd..0ba9da08e0 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
@@ -256,7 +256,7 @@ class ProxyService(
 
       hostStatus <- getRuntimeTargetHost(cloudContext, runtimeName)
       _ <- hostStatus match {
-        case HostReady(_, _, _) =>
+        case HostReady(_, _, _, _) =>
           dateAccessUpdaterQueue.offer(
             UpdateDateAccessedMessage(UpdateTarget.Runtime(runtimeName), cloudContext, ctx.now)
           )
@@ -350,7 +350,7 @@ class ProxyService(
         } else IO.unit
       hostStatus <- getAppTargetHost(cloudContext, appName)
       _ <- hostStatus match {
-        case HostReady(_, _, _) =>
+        case HostReady(_, _, _, _) =>
           dateAccessUpdaterQueue.offer(UpdateDateAccessedMessage(UpdateTarget.App(appName), cloudContext, ctx.now))
         case _ => IO.unit
       }
@@ -376,16 +376,16 @@ class ProxyService(
     for {
       ctx <- ev.ask[AppContext]
       res <- hostContext.status match {
-        case HostReady(targetHost, _, _) =>
+        case HostReady(targetHost, _, _, useHttp) =>
           // If this is a WebSocket request (e.g. wss://leo:8080/...) then akka-http injects a
           // virtual UpgradeToWebSocket header which contains facilities to handle the WebSocket data.
           // The presence of this header distinguishes WebSocket from http requests.
           val res = for {
             response <- request.attribute(AttributeKeys.webSocketUpgrade) match {
               case Some(upgrade) =>
-                IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade)))
+                IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade, useHttp)))
               case None =>
-                IO.fromFuture(IO(handleHttpRequest(targetHost, request)))
+                IO.fromFuture(IO(handleHttpRequest(targetHost, request, useHttp)))
             }
             r <-
               if (response.status.isFailure())
@@ -418,9 +418,10 @@ class ProxyService(
       }
     } yield res
 
-  private def handleHttpRequest(targetHost: Host, request: HttpRequest): Future[HttpResponse] = {
-    logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}")
-
+  private def handleHttpRequest(targetHost: Host,
+                                request: HttpRequest,
+                                useHttp: Boolean = false
+  ): Future[HttpResponse] = {
     // A note on akka-http philosophy:
     // The Akka HTTP server is implemented on top of Streams and makes heavy use of it. Requests come
     // in as a Source[HttpRequest] and responses are returned as a Sink[HttpResponse]. The transformation
@@ -429,12 +430,24 @@ class ProxyService(
 
     // Initializes a Flow representing a prospective connection to the given endpoint. The connection
     // is not made until a Source and Sink are plugged into the Flow (i.e. it is materialized).
-    val flow = Http()
-      .connectionTo(targetHost.address)
-      .toPort(proxyConfig.proxyPort)
-      .withCustomHttpsConnectionContext(httpsConnectionContext)
-      .withClientConnectionSettings(clientConnectionSettings)
-      .https()
+    // Galaxy VM apps use plain HTTP on port 80; all other backends use HTTPS on proxyConfig.proxyPort.
+    val flow =
+      if (useHttp) {
+        logger.debug(s"Opening http connection to ${targetHost.address}:80")
+        Http()
+          .connectionTo(targetHost.address)
+          .toPort(80)
+          .withClientConnectionSettings(clientConnectionSettings)
+          .http()
+      } else {
+        logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}")
+        Http()
+          .connectionTo(targetHost.address)
+          .toPort(proxyConfig.proxyPort)
+          .withCustomHttpsConnectionContext(httpsConnectionContext)
+          .withClientConnectionSettings(clientConnectionSettings)
+          .https()
+      }
 
     // Now build a Source[Request] out of the original HttpRequest. We need to make some modifications
     // to the original request in order for the proxy to work:
@@ -492,7 +505,8 @@ class ProxyService(
 
   private def handleWebSocketRequest(targetHost: Host,
                                      request: HttpRequest,
-                                     upgrade: WebSocketUpgrade
+                                     upgrade: WebSocketUpgrade,
+                                     useHttp: Boolean = false
   ): Future[HttpResponse] = {
     logger.info(s"Opening websocket connection to ${targetHost.address}")
 
@@ -509,19 +523,35 @@ class ProxyService(
     // Make a single WebSocketRequest to the notebook server, passing in our Flow. This returns a Future[WebSocketUpgradeResponse].
     // Keep our publisher/subscriber (e.g. sink/source) for use later. These are returned because we specified Keep.both above.
     // Note that we are rewriting the paths for any requests that are routed to /proxy/*/*/jupyter/
-    val (responseFuture, (publisher, subscriber)) = Http().singleWebSocketRequest(
-      WebSocketRequest(
-        request.uri.copy(path = rewriteJupyterPath(request.uri.path),
-                         authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort),
-                         scheme = "wss"
-        ),
-        extraHeaders = filterHeaders(request.headers),
-        upgrade.requestedProtocols.headOption
-      ),
-      flow,
-      httpsConnectionContext,
-      settings = clientConnectionSettings
-    )
+    // Galaxy VM apps use ws:// on port 80; all other backends use wss:// on proxyConfig.proxyPort.
+    val (responseFuture, (publisher, subscriber)) =
+      if (useHttp)
+        Http().singleWebSocketRequest(
+          WebSocketRequest(
+            request.uri.copy(path = rewriteJupyterPath(request.uri.path),
+                             authority = request.uri.authority.copy(host = targetHost, port = 80),
+                             scheme = "ws"
+            ),
+            extraHeaders = filterHeaders(request.headers),
+            upgrade.requestedProtocols.headOption
+          ),
+          flow,
+          settings = clientConnectionSettings
+        )
+      else
+        Http().singleWebSocketRequest(
+          WebSocketRequest(
+            request.uri.copy(path = rewriteJupyterPath(request.uri.path),
+                             authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort),
+                             scheme = "wss"
+            ),
+            extraHeaders = filterHeaders(request.headers),
+            upgrade.requestedProtocols.headOption
+          ),
+          flow,
+          httpsConnectionContext,
+          settings = clientConnectionSettings
+        )
 
     // If we got a valid WebSocketUpgradeResponse, call handleMessages with our publisher/subscriber, which are
     // already materialized from the HttpRequest.
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 719b57386b..0e9992a673 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1275,41 +1275,45 @@ class GKEInterpreter[F[_]](
         s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP"
       )
 
-      // Poll until the instance has an external IP, then store it as the cluster's load balancer IP
-      // so that KubernetesDnsCache can resolve the proxy host.
-      externalIpOpt <- streamFUntilDone(
+      // Poll until the instance has both internal and external IPs assigned.
+      // We store the internal IP as the proxy backend (KubernetesDnsCache loadBalancerIp) so that
+      // the Leo proxy connects to the VM over the internal VPC network using plain HTTP.
+      // The external IP is only used for the readiness health check (TCP to port 80).
+      ipPairOpt <- streamFUntilDone(
         computeService.getInstance(googleProject, zoneParam, instanceName).map { instanceOpt =>
           instanceOpt.flatMap { inst =>
             import scala.jdk.CollectionConverters._
             for {
               iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption)
+              internalIp = IP(iface.getNetworkIP)
               cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption)
               natIp <- Option(cfg.getNatIP).filter(_.nonEmpty)
-            } yield IP(natIp)
+            } yield (internalIp, IP(natIp))
           }
         },
         config.monitorConfig.createApp.maxAttempts,
         config.monitorConfig.createApp.interval
       ).compile.lastOrError
 
-      externalIp <- F.fromOption(
-        externalIpOpt,
+      (internalIp, externalIp) <- F.fromOption(
+        ipPairOpt,
         AppCreationException(
-          s"Galaxy VM ${instanceName.value} did not obtain an external IP after ${config.monitorConfig.createApp.interruptAfter}",
+          s"Galaxy VM ${instanceName.value} did not obtain an IP after ${config.monitorConfig.createApp.interruptAfter}",
           traceId = Some(ctx.traceId)
         )
       )
 
       _ <- logger.info(ctx.loggingCtx)(
-        s"Galaxy VM ${instanceName.value} has external IP ${externalIp.asString}; storing in cluster async fields"
+        s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing internal IP in cluster async fields"
       )
 
-      // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache
+      // Store the VM's internal IP as the cluster load balancer IP consumed by KubernetesDnsCache.
+      // The proxy will connect to this IP via HTTP on port 80 (Galaxy VM serves HTTP, not HTTPS).
       _ <- kubernetesClusterQuery
         .updateAsyncFields(
           dbCluster.id,
           KubernetesClusterAsyncFields(
-            externalIp,
+            internalIp,
             IP(""),
             NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
           )
@@ -1318,10 +1322,13 @@ class GKEInterpreter[F[_]](
       _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction
 
       _ <- logger.info(ctx.loggingCtx)(
-        s"Polling Galaxy readiness for app ${app.appName.value} at ${externalIp.asString}:80"
+        s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${internalIp.asString}:80)"
       )
 
-      // Wait for Galaxy's nginx ingress to respond on port 80
+      // Wait for Galaxy's nginx to respond.
+      // Uses isProxyAvailable which routes through the Leo proxy. The proxy now connects to the
+      // VM's internal IP on port 80 (plain HTTP) because KubernetesDnsCache sets useHttp=true for
+      // Galaxy apps and stores the internal IP as loadBalancerIp.
       isDone <- streamFUntilDone(
         appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId),
         config.monitorConfig.createApp.maxAttempts,
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
index dd542cbd3a..3b368a36a8 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
@@ -101,7 +101,7 @@ class LeoPubsubMessageSubscriberSpec
   }
   val iamDAO = new MockGoogleIamDAO
 
-  // Returns a GCE instance with external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests.
+  // Returns a GCE instance with internal IP "10.0.0.1" and external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests.
   val galaxyComputeService: GoogleComputeService[IO] = new FakeGoogleComputeService {
     override def getInstance(project: GoogleProject, zone: ZoneName, instanceName: InstanceName)(implicit
       ev: Ask[IO, TraceId]
@@ -111,6 +111,7 @@ class LeoPubsubMessageSubscriberSpec
         .addNetworkInterfaces(
           NetworkInterface
             .newBuilder()
+            .setNetworkIP("10.0.0.1")
             .addAccessConfigs(AccessConfig.newBuilder().setNatIP("1.2.3.4").build())
             .build()
         )
@@ -937,9 +938,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
       // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
       getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
-      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
+      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("1.2.3.4"),
+        KubernetesClusterAsyncFields(IP("10.0.0.1"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
@@ -1091,9 +1092,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some(
         ServiceAccountName("gxy-ksa")
       )
-      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
+      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
       getApp1.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("1.2.3.4"),
+        KubernetesClusterAsyncFields(IP("10.0.0.1"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
@@ -1853,9 +1854,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
       // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
       getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
-      // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated
+      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("1.2.3.4"),
+        KubernetesClusterAsyncFields(IP("10.0.0.1"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )

From bea6e9d556a4afa0778fca21568f9228266fafbf Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Mon, 13 Apr 2026 09:39:49 -0400
Subject: [PATCH 09/17] Fix Galaxy VM readiness check: use direct HTTP to VM
 internal IP

---
 .../dsde/workbench/leonardo/dao/HttpAppDAO.scala | 16 ++++++++++++++++
 .../workbench/leonardo/util/GKEInterpreter.scala |  8 ++++----
 .../dsde/workbench/leonardo/dao/MockAppDAO.scala |  5 ++++-
 3 files changed, 24 insertions(+), 5 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
index e846792f11..7fc78868ef 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala
@@ -44,6 +44,20 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client:
           )
       case _ => Async[F].pure(false) // Update once we support Relay for apps
     }
+
+  def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] =
+    client
+      .status(
+        Request[F](
+          method = Method.GET,
+          uri = Uri.unsafeFromString(s"http://${ip.asString}:${port}/"),
+          headers = Headers(Header.Raw(CIString("X-Request-ID"), traceId.asString))
+        )
+      )
+      .map(status => status.code < 500)
+      .handleErrorWith(t =>
+        logger.error(Map("traceId" -> traceId.asString), t)("Fail to check if VM is reachable").as(false)
+      )
 }
 
 trait AppDAO[F[_]] {
@@ -52,4 +66,6 @@ trait AppDAO[F[_]] {
                        serviceName: ServiceName,
                        traceId: TraceId
   ): F[Boolean]
+
+  def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean]
 }
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 0e9992a673..40fc8c437e 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1326,11 +1326,11 @@ class GKEInterpreter[F[_]](
       )
 
       // Wait for Galaxy's nginx to respond.
-      // Uses isProxyAvailable which routes through the Leo proxy. The proxy now connects to the
-      // VM's internal IP on port 80 (plain HTTP) because KubernetesDnsCache sets useHttp=true for
-      // Galaxy apps and stores the internal IP as loadBalancerIp.
+      // Uses a direct HTTP check to the VM's internal IP on port 80, bypassing the Leo proxy
+      // hostname chain (which would require the proxy wildcard DNS to be reachable from within
+      // the Leo pod — unreliable in BEE environments due to hairpin NAT).
       isDone <- streamFUntilDone(
-        appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId),
+        appDao.isVmReachable(internalIp, 80, ctx.traceId),
         config.monitorConfig.createApp.maxAttempts,
         config.monitorConfig.createApp.interval
       ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
index 8ee7c2c2cb..647f3cc77f 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala
@@ -3,7 +3,7 @@ package org.broadinstitute.dsde.workbench.leonardo.dao
 import cats.effect.IO
 import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceName
 import org.broadinstitute.dsde.workbench.leonardo.AppName
-import org.broadinstitute.dsde.workbench.model.TraceId
+import org.broadinstitute.dsde.workbench.model.{IP, TraceId}
 import org.broadinstitute.dsde.workbench.model.google.GoogleProject
 
 class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] {
@@ -13,5 +13,8 @@ class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] {
                                 traceId: TraceId
   ): IO[Boolean] =
     IO.pure(isUp)
+
+  override def isVmReachable(ip: IP, port: Int, traceId: TraceId): IO[Boolean] =
+    IO.pure(isUp)
 }
 object MockAppDAO extends MockAppDAO(isUp = true)

From c28fcca00d9aa1f51120f0228a0edb1e819c2302 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Mon, 13 Apr 2026 12:46:58 -0400
Subject: [PATCH 10/17] Fix Galaxy VM proxy: use external IP instead of
 internal IP

---
 .../leonardo/dns/KubernetesDnsCache.scala        |  3 ++-
 .../workbench/leonardo/util/GKEInterpreter.scala | 16 ++++++++++------
 .../monitor/LeoPubsubMessageSubscriberSpec.scala | 12 ++++++------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
index 4d286c7f4b..a0bc7196fb 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala
@@ -50,7 +50,8 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics](
       case Some(ip) =>
         val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain)
         // Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP,
-        // and we map the fake hostname to the VM's internal IP (stored in loadBalancerIp).
+        // and we map the fake hostname to the VM's external IP (stored in loadBalancerIp).
+        // External IP is used because Leo's pod is in a different VPC from the user's workspace project.
         val isGalaxyVm = appResult.app.appType == Galaxy
         hostToIpMapping
           .getAndUpdate(_ + (h.address -> ip))
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 40fc8c437e..322931d48b 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1304,16 +1304,20 @@ class GKEInterpreter[F[_]](
       )
 
       _ <- logger.info(ctx.loggingCtx)(
-        s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing internal IP in cluster async fields"
+        s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing external IP in cluster async fields for proxy access"
       )
 
-      // Store the VM's internal IP as the cluster load balancer IP consumed by KubernetesDnsCache.
+      // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache.
       // The proxy will connect to this IP via HTTP on port 80 (Galaxy VM serves HTTP, not HTTPS).
+      // We use the external IP because Leo's GKE cluster is in Leo's GCP project while the Galaxy VM
+      // is in the user's workspace project — the two VPCs are not peered, so the internal IP is
+      // not routable from Leo's pod. The leonardo-allow-http firewall rule (0.0.0.0/0 → port 80,
+      // targeting VMs with the "leonardo" tag) allows Leo to reach the VM on its external IP.
       _ <- kubernetesClusterQuery
         .updateAsyncFields(
           dbCluster.id,
           KubernetesClusterAsyncFields(
-            internalIp,
+            externalIp,
             IP(""),
             NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
           )
@@ -1322,15 +1326,15 @@ class GKEInterpreter[F[_]](
       _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction
 
       _ <- logger.info(ctx.loggingCtx)(
-        s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${internalIp.asString}:80)"
+        s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${externalIp.asString}:80)"
       )
 
       // Wait for Galaxy's nginx to respond.
-      // Uses a direct HTTP check to the VM's internal IP on port 80, bypassing the Leo proxy
+      // Uses a direct HTTP check to the VM's external IP on port 80, bypassing the Leo proxy
       // hostname chain (which would require the proxy wildcard DNS to be reachable from within
       // the Leo pod — unreliable in BEE environments due to hairpin NAT).
       isDone <- streamFUntilDone(
-        appDao.isVmReachable(internalIp, 80, ctx.traceId),
+        appDao.isVmReachable(externalIp, 80, ctx.traceId),
         config.monitorConfig.createApp.maxAttempts,
         config.monitorConfig.createApp.interval
       ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError
diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
index 3b368a36a8..3da67b1856 100644
--- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
+++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala
@@ -938,9 +938,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
       // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
       getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
-      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
+      // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("10.0.0.1"),
+        KubernetesClusterAsyncFields(IP("1.2.3.4"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
@@ -1092,9 +1092,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some(
         ServiceAccountName("gxy-ksa")
       )
-      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
+      // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated
       getApp1.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("10.0.0.1"),
+        KubernetesClusterAsyncFields(IP("1.2.3.4"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )
@@ -1854,9 +1854,9 @@ class LeoPubsubMessageSubscriberSpec
       getApp.cluster.status shouldBe KubernetesClusterStatus.Running
       // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified
       getApp.nodepool.status shouldBe NodepoolStatus.Unspecified
-      // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated
+      // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated
       getApp.cluster.asyncFields shouldBe Some(
-        KubernetesClusterAsyncFields(IP("10.0.0.1"),
+        KubernetesClusterAsyncFields(IP("1.2.3.4"),
                                      IP(""),
                                      NetworkFields(NetworkName(""), SubnetworkName(""), IpRange(""))
         )

From d8683d8b402f1352a2269a20467100ffcc031a07 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Mon, 13 Apr 2026 16:20:20 -0400
Subject: [PATCH 11/17] Fix Galaxy VM bootstrap: use galaxy-k8s-boot image

---
 http/src/main/resources/init-resources/galaxy-user-data.sh  | 2 +-
 http/src/main/resources/reference.conf                      | 6 ++++--
 .../dsde/workbench/leonardo/util/GKEInterpreter.scala       | 4 +++-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh
index 3b7fbb640b..fb3306e6c9 100644
--- a/http/src/main/resources/init-resources/galaxy-user-data.sh
+++ b/http/src/main/resources/init-resources/galaxy-user-data.sh
@@ -1,6 +1,6 @@
+#cloud-config
 # Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh
 # When updating this file, sync it manually from that repository and verify the changes.
-#cloud-config
 write_files:
   - path: /usr/local/bin/galaxy_bootstrap.sh
     permissions: '0755'
diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf
index 64b9b732fe..d10e1fbdb2 100644
--- a/http/src/main/resources/reference.conf
+++ b/http/src/main/resources/reference.conf
@@ -434,8 +434,10 @@ groups {
 }
 
 galaxyVm {
-  # Debian 12 image — galaxy-k8s-boot bootstrap requires a standard Debian VM, not COS
-  sourceImage = "projects/debian-cloud/global/images/family/debian-12"
+  # Pre-built galaxy-k8s-boot image with all dependencies (Ansible, RKE2, etc.) pre-installed.
+  # Has cloud-init, which processes the "user-data" metadata key on first boot.
+  # Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team)
+  sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25"
   machineType = "n1-highmem-8"
   bootDiskSizeGb = 50
   postgresDiskSizeGb = 10
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 322931d48b..73b8751fa5 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1060,7 +1060,9 @@ class GKEInterpreter[F[_]](
       )
 
       // Load cloud-config content bundled from galaxy-k8s-boot bin/user_data.sh.
-      // Intentionally not fetched at runtime to avoid unexpected production changes.
+      // Passed as the "user-data" metadata key, processed by cloud-init on first boot only.
+      // The galaxy-k8s-boot custom image has cloud-init pre-installed; "#cloud-config" must be
+      // the first line for cloud-init to recognise the file format.
       // To update, sync manually from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh
       userDataContent = scala.io.Source
         .fromResource("init-resources/galaxy-user-data.sh")

From 66135f30d70a5e9296671c9027cc1ab138677e79 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 14 Apr 2026 10:15:25 -0400
Subject: [PATCH 12/17] increase boot disk size to Galaxy VM disk image reqs

---
 http/src/main/resources/reference.conf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf
index d10e1fbdb2..d4ea61e98c 100644
--- a/http/src/main/resources/reference.conf
+++ b/http/src/main/resources/reference.conf
@@ -439,7 +439,7 @@ galaxyVm {
   # Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team)
   sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25"
   machineType = "n1-highmem-8"
-  bootDiskSizeGb = 50
+  bootDiskSizeGb = 100
   postgresDiskSizeGb = 10
   # Suffix appended to the NFS disk name to derive the postgres disk name.
   # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix).

From 98ec831f553233d85a086b4e8afec9805eb65e6f Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Tue, 14 Apr 2026 16:34:14 -0400
Subject: [PATCH 13/17] mark cluster as deleted and wait for iam to propagate

---
 .../dsde/workbench/leonardo/util/GKEInterpreter.scala     | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 73b8751fa5..8e110f2cb1 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -711,6 +711,10 @@ class GKEInterpreter[F[_]](
                   s"Failed to delete Galaxy VM ${instanceName.value}: ${e.getMessage}. Continuing with app deletion."
                 )
               }
+            // Mark the cluster DB record as deleted so future app creation in this project is not blocked.
+            // For Galaxy, the "cluster" is a pure DB abstraction (no real GKE cluster); it must be
+            // cleaned up here because no separate cluster-deletion pubsub message is sent.
+            _ <- kubernetesClusterQuery.markAsDeleted(dbCluster.id, ctx.now).transaction
           } yield ()
 
         case _ =>
@@ -1196,7 +1200,7 @@ class GKEInterpreter[F[_]](
               .void
           )
         )
-        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409)
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist)
         tracedRetryF(retryConfig)(
           call,
           s"googleIamDAO.addRoles(batch.jobsEditor) for pet SA ${app.googleServiceAccount.value} in project ${googleProject.value}"
@@ -1241,7 +1245,7 @@ class GKEInterpreter[F[_]](
               .void
           )
         )
-        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409)
+        val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist)
         tracedRetryF(retryConfig)(
           call,
           s"googleIamDAO.addRoles(batch.jobsEditor, iam.serviceAccountUser) for Batch SA $gcpBatchSa in project ${googleProject.value}"

From b3502847e00a2a615780395579b9da59ecfd9ab3 Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Wed, 15 Apr 2026 09:59:20 -0400
Subject: [PATCH 14/17] fix Gb to Gib conversion

---
 .../dsde/workbench/leonardo/util/GKEInterpreter.scala    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index 8e110f2cb1..c823e255ae 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1079,8 +1079,13 @@ class GKEInterpreter[F[_]](
                                                               config.galaxyDiskConfig.postgresDiskNameSuffix
       )
 
-      // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk)
-      pvSizeGi = math.max(1, nfsDisk.size.gb - 11)
+      // Persistent-volume-size passed to ansible-pull.
+      // nfsDisk.size.gb is in decimal GB; convert to binary GiB before subtracting filesystem overhead.
+      // Example: a 500 GB disk = (500 * 10^9) / 2^30 ≈ 465 GiB, so we request 465 - 11 = 454 GiB.
+      // Using raw gb - 11 (treating GB as GiB) overestimates by ~23 GiB on a 500 GB disk and
+      // causes the NFS provisioner to fail with "insufficient available space".
+      diskSizeGiB = (nfsDisk.size.gb.toLong * 1000L * 1000L * 1000L) / (1024L * 1024L * 1024L)
+      pvSizeGi = math.max(1, diskSizeGiB - 11)
       pvSize = s"${pvSizeGi}Gi"
 
       // Get or create the galaxy-batch-runner SA in the user's project.

From 25c66051748539e0fc0d71e2cfc120727fb26cbb Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Wed, 15 Apr 2026 12:15:45 -0400
Subject: [PATCH 15/17] strip leo proxy prefix

---
 .../leonardo/http/service/ProxyService.scala      | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
index 0ba9da08e0..841e75f7d0 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
@@ -355,7 +355,20 @@ class ProxyService(
         case _ => IO.unit
       }
       hostContext = HostContext(hostStatus, s"${cloudContext.asString}/${appName.value}/${serviceName.value}")
-      r <- proxyInternal(hostContext, request)
+      // Galaxy VM apps serve at /, but Leo forwards the full proxy path
+      // (e.g. /proxy/google/v1/apps/{project}/{app}/galaxy). Strip the Leo
+      // prefix so Galaxy's nginx sees requests rooted at /.
+      adjustedRequest = hostStatus match {
+        case HostReady(_, _, _, useHttp) if useHttp =>
+          val prefix = s"/proxy/google/v1/apps/${cloudContext.asString}/${appName.value}/${serviceName.value}"
+          val stripped = request.uri.path.toString.stripPrefix(prefix) match {
+            case "" | "/" => "/"
+            case p        => p
+          }
+          request.withUri(request.uri.withPath(Uri.Path(stripped)))
+        case _ => request
+      }
+      r <- proxyInternal(hostContext, adjustedRequest)
       appType <- appQuery.getAppType(appName).transaction
       result = if (r.status.isSuccess()) "success" else "failure"
       _ <- metrics.incrementCounter(

From 2739d88abdd466dc3d547cabfd6dfbf1f748973d Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Wed, 15 Apr 2026 14:28:17 -0400
Subject: [PATCH 16/17] pass the galaxy_url_prefix

---
 .../resources/init-resources/galaxy-user-data.sh   | 12 ++++++++++++
 .../workbench/leonardo/util/GKEInterpreter.scala   | 14 ++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh
index fb3306e6c9..c21013f438 100644
--- a/http/src/main/resources/init-resources/galaxy-user-data.sh
+++ b/http/src/main/resources/init-resources/galaxy-user-data.sh
@@ -88,6 +88,13 @@ write_files:
       GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com")
       echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}"
 
+      # Leo proxy path prefix for this Galaxy app (e.g. /proxy/google/v1/apps/{project}/{appName}/galaxy).
+      # Passed to ansible as galaxy_url_prefix so Galaxy generates correct absolute links (JS/CSS/API)
+      # that include the full proxy path. Without this Galaxy emits links rooted at / which the
+      # browser resolves against Leo's host and gets 404s → blank page.
+      GALAXY_URL_PREFIX=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/galaxy-url-prefix" -H "Metadata-Flavor: Google" 2>/dev/null || echo "")
+      echo "[$(date)] - Galaxy URL prefix: ${GALAXY_URL_PREFIX}"
+
       GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git")
       GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master")
 
@@ -108,6 +115,11 @@ write_files:
           echo "[$(date)] - Galaxy Restore Mode: Disabled"
       fi
 
+      if [ -n "$GALAXY_URL_PREFIX" ]; then
+          PULL_ARGS+=(--extra-vars "galaxy_url_prefix=${GALAXY_URL_PREFIX}")
+          echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}"
+      fi
+
       PULL_ARGS+=(playbook.yml)
 
       mkdir -p /tmp/ansible-inventory
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
index c823e255ae..1901685b67 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala
@@ -1184,6 +1184,20 @@ class GKEInterpreter[F[_]](
             .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build())
             .addItems(Items.newBuilder().setKey("gcp-network").setValue(network.value).build())
             .addItems(Items.newBuilder().setKey("gcp-subnet").setValue(subnetwork.value).build())
+            // Galaxy needs to know its public URL prefix so it generates correct absolute links
+            // (JS, CSS, API calls) that include the full Leo proxy path.
+            // galaxy-k8s-boot's ansible playbook must accept galaxy_url_prefix and set it in
+            // Galaxy's helm values (galaxy.yml). Without this, Galaxy generates links rooted at /
+            // which the browser resolves against Leo's host and gets 404s → blank page.
+            .addItems(
+              Items
+                .newBuilder()
+                .setKey("galaxy-url-prefix")
+                .setValue(
+                  s"/proxy/google/v1/apps/${googleProject.value}/${app.appName.value}/galaxy"
+                )
+                .build()
+            )
             .build()
         )
         .putAllLabels(Map("leonardo" -> "true").asJava)

From a98b3534a90a437f6760f26e542ee67a9ead97ca Mon Sep 17 00:00:00 2001
From: Liz Baldo <lizbaldo@Liz-Baldo-H5267Y2XDX.local>
Date: Fri, 8 May 2026 13:21:29 -0400
Subject: [PATCH 17/17] use new proxy path env variable

---
 .../init-resources/galaxy-user-data.sh        |  2 +-
 .../leonardo/http/service/ProxyService.scala  | 19 +++++--------------
 2 files changed, 6 insertions(+), 15 deletions(-)

diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh
index c21013f438..1aa42cb6c8 100644
--- a/http/src/main/resources/init-resources/galaxy-user-data.sh
+++ b/http/src/main/resources/init-resources/galaxy-user-data.sh
@@ -116,7 +116,7 @@ write_files:
       fi
 
       if [ -n "$GALAXY_URL_PREFIX" ]; then
-          PULL_ARGS+=(--extra-vars "galaxy_url_prefix=${GALAXY_URL_PREFIX}")
+          PULL_ARGS+=(--extra-vars "galaxy_prefix=${GALAXY_URL_PREFIX}")
           echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}"
       fi
 
diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
index 841e75f7d0..5f407ad424 100644
--- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
+++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala
@@ -355,20 +355,11 @@ class ProxyService(
         case _ => IO.unit
       }
       hostContext = HostContext(hostStatus, s"${cloudContext.asString}/${appName.value}/${serviceName.value}")
-      // Galaxy VM apps serve at /, but Leo forwards the full proxy path
-      // (e.g. /proxy/google/v1/apps/{project}/{app}/galaxy). Strip the Leo
-      // prefix so Galaxy's nginx sees requests rooted at /.
-      adjustedRequest = hostStatus match {
-        case HostReady(_, _, _, useHttp) if useHttp =>
-          val prefix = s"/proxy/google/v1/apps/${cloudContext.asString}/${appName.value}/${serviceName.value}"
-          val stripped = request.uri.path.toString.stripPrefix(prefix) match {
-            case "" | "/" => "/"
-            case p        => p
-          }
-          request.withUri(request.uri.withPath(Uri.Path(stripped)))
-        case _ => request
-      }
-      r <- proxyInternal(hostContext, adjustedRequest)
+      // Galaxy VM apps: forward the full Leo proxy path to the VM unchanged.
+      // galaxy-k8s-boot configures nginx location blocks and galaxy_url_prefix
+      // using the ingress.path value (= the Leo proxy prefix), so the VM expects
+      // to receive the full path including the prefix.
+      r <- proxyInternal(hostContext, request)
       appType <- appQuery.getAppType(appName).transaction
       result = if (r.status.isSuccess()) "success" else "failure"
       _ <- metrics.incrementCounter(