From 0e070c07b5a179f0ef376120503e4303b6c76dd3 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 7 Apr 2026 13:37:18 -0400 Subject: [PATCH 01/17] Replace GKE/Helm Galaxy path with GCE VM-based deployment --- .../init-resources/galaxy-user-data.sh | 138 +++++ .../main/resources/init-resources/startup.sh | 4 + http/src/main/resources/reference.conf | 52 ++ .../workbench/leonardo/config/Config.scala | 18 +- .../leonardo/config/KubernetesAppConfig.scala | 15 +- .../leonardo/dao/HttpJupyterDAO.scala | 2 +- .../monitor/LeoPubsubMessageSubscriber.scala | 7 +- .../leonardo/util/BuildHelmChartValues.scala | 107 ---- .../leonardo/util/GKEInterpreter.scala | 576 ++++++++++-------- .../util/BuildHelmChartValuesSpec.scala | 172 ------ 10 files changed, 544 insertions(+), 547 deletions(-) create mode 100644 http/src/main/resources/init-resources/galaxy-user-data.sh diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh new file mode 100644 index 0000000000..3b7fbb640b --- /dev/null +++ b/http/src/main/resources/init-resources/galaxy-user-data.sh @@ -0,0 +1,138 @@ +# Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh +# When updating this file, sync it manually from that repository and verify the changes. +#cloud-config +write_files: + - path: /usr/local/bin/galaxy_bootstrap.sh + permissions: '0755' + owner: root:root + content: | + #!/bin/bash + + echo "[$(date)] - Starting galaxy_bootstrap script..." + + # 1. Setup persistent disk if available + DISK_DEVICE="/dev/disk/by-id/google-galaxy-data" + if [ -b "$DISK_DEVICE" ]; then + echo "[$(date)] - Found persistent disk at $DISK_DEVICE" + + # Check if disk is already formatted + if ! blkid "$DISK_DEVICE" > /dev/null 2>&1; then + echo "[$(date)] - Formatting disk $DISK_DEVICE with ext4" + mkfs -t ext4 "$DISK_DEVICE" + else + echo "[$(date)] - Disk $DISK_DEVICE is already formatted" + fi + + # Create mount point and mount + mkdir -p /mnt/block_storage + mount "$DISK_DEVICE" /mnt/block_storage + + # Add to fstab for persistent mounting across reboots + DISK_UUID=$(blkid -s UUID -o value "$DISK_DEVICE") + if [ -n "$DISK_UUID" ] && ! grep -q "$DISK_UUID" /etc/fstab; then + echo "UUID=$DISK_UUID /mnt/block_storage ext4 defaults 0 2" >> /etc/fstab + fi + + # Set proper ownership + chown debian:debian /mnt/block_storage + echo "[$(date)] - Persistent disk mounted at /mnt/block_storage" + else + echo "[$(date)] - No persistent disk found at $DISK_DEVICE. Galaxy will use ephemeral storage." + fi + + # 2. Setup PostgreSQL disk if available + POSTGRES_DISK_DEVICE="/dev/disk/by-id/google-galaxy-postgres-data" + if [ -b "$POSTGRES_DISK_DEVICE" ]; then + echo "[$(date)] - Found PostgreSQL disk at $POSTGRES_DISK_DEVICE" + + # Check if disk is already formatted + if ! blkid "$POSTGRES_DISK_DEVICE" > /dev/null 2>&1; then + echo "[$(date)] - Formatting PostgreSQL disk $POSTGRES_DISK_DEVICE with ext4" + mkfs -t ext4 "$POSTGRES_DISK_DEVICE" + else + echo "[$(date)] - PostgreSQL disk $POSTGRES_DISK_DEVICE is already formatted" + fi + + # Create mount point and mount + mkdir -p /mnt/postgres_storage + mount "$POSTGRES_DISK_DEVICE" /mnt/postgres_storage + + # Add to fstab for persistent mounting across reboots + POSTGRES_DISK_UUID=$(blkid -s UUID -o value "$POSTGRES_DISK_DEVICE") + if [ -n "$POSTGRES_DISK_UUID" ] && ! grep -q "$POSTGRES_DISK_UUID" /etc/fstab; then + echo "UUID=$POSTGRES_DISK_UUID /mnt/postgres_storage ext4 defaults 0 2" >> /etc/fstab + fi + + # Set proper ownership + chown debian:debian /mnt/postgres_storage + echo "[$(date)] - PostgreSQL disk mounted at /mnt/postgres_storage" + else + echo "[$(date)] - No PostgreSQL disk found at $POSTGRES_DISK_DEVICE. PostgreSQL will use ephemeral storage." + fi + + # 3. Run ansible-pull + sudo -u debian bash -c ' + export HOME=/home/debian + HOST_IP=$(curl -s ifconfig.me) + + PV_SIZE=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/persistent-volume-size" -H "Metadata-Flavor: Google" 2>/dev/null) + if [ -z "$PV_SIZE" ]; then + echo "[$(date)] - persistent-volume-size metadata not found or empty, using default." + PV_SIZE="139Gi" + fi + echo "[$(date)] - NFS storage size for Galaxy: ${PV_SIZE}" + + # Add restore_galaxy if enabled + RESTORE_GALAXY=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/restore_galaxy" -H "Metadata-Flavor: Google" 2>/dev/null || echo "false") + + GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com") + echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}" + + GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git") + GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master") + + PULL_ARGS=( + -U "${GIT_REPO}" + -C "${GIT_BRANCH}" + -d /home/debian/ansible + -i /tmp/ansible-inventory/localhost + --accept-host-key + --limit 127.0.0.1 + --extra-vars "gcp_batch_service_account_email=${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}" + ) + + if [ "$RESTORE_GALAXY" = "true" ]; then + PULL_ARGS+=(--extra-vars "restore_galaxy=true") + echo "[$(date)] - Galaxy Restore Mode: Enabled" + else + echo "[$(date)] - Galaxy Restore Mode: Disabled" + fi + + PULL_ARGS+=(playbook.yml) + + mkdir -p /tmp/ansible-inventory + cat > /tmp/ansible-inventory/localhost << EOF + [vm] + 127.0.0.1 ansible_connection=local ansible_python_interpreter="/usr/bin/python3" + + [all:vars] + ansible_user="debian" + rke2_token="defaultSecret12345" + rke2_additional_sans=["${HOST_IP}"] + rke2_debug=true + nfs_size="${PV_SIZE}" + galaxy_persistence_size="${PV_SIZE}" + galaxy_db_password="gxy-db-password" + galaxy_user="dev@galaxyproject.org" + EOF + + echo "[$(date)] - Inventory file created at /tmp/ansible-inventory/localhost; running ansible-pull..." + echo "[$(date)] - Running: ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull ${PULL_ARGS[@]}" + + ANSIBLE_CALLBACKS_ENABLED=profile_tasks ANSIBLE_HOST_PATTERN_MISMATCH=ignore ansible-pull "${PULL_ARGS[@]}" + ' + + echo "[$(date)] - Bootstrap script completed." + +runcmd: + - /usr/local/bin/galaxy_bootstrap.sh diff --git a/http/src/main/resources/init-resources/startup.sh b/http/src/main/resources/init-resources/startup.sh index 6375f7d7a0..4701c3a892 100644 --- a/http/src/main/resources/init-resources/startup.sh +++ b/http/src/main/resources/init-resources/startup.sh @@ -122,6 +122,10 @@ function failScriptIfError() { function validateCert() { certFileDirectory=$1 + ## Only the master node has certs; worker nodes in multi-node Dataproc clusters do not. + if [ ! -f "${certFileDirectory}/jupyter-server.crt" ]; then + return 0 + fi ## This helps when we need to rotate certs. notAfter=`openssl x509 -enddate -noout -in ${certFileDirectory}/jupyter-server.crt` # output should be something like `notAfter=Jul 4 20:31:52 2026 GMT` diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf index 06d866b7fb..7a9c841302 100644 --- a/http/src/main/resources/reference.conf +++ b/http/src/main/resources/reference.conf @@ -210,6 +210,44 @@ vpc { northamerica-northeast2 = "10.26.0.0/20" } firewallsToAdd = [ + # Allows Galaxy VM traffic on port 80 (nginx ingress on hostNetwork) + { + name-prefix = "leonardo-allow-http" + sourceRanges = { + us-central1 = ["0.0.0.0/0"] + northamerica-northeast1 = ["0.0.0.0/0"] + southamerica-east1 = ["0.0.0.0/0"] + us-east1 = ["0.0.0.0/0"] + us-east4 = ["0.0.0.0/0"] + us-west1 = ["0.0.0.0/0"] + us-west2 = ["0.0.0.0/0"] + us-west3 = ["0.0.0.0/0"] + us-west4 = ["0.0.0.0/0"] + europe-central2 = ["0.0.0.0/0"] + europe-north1 = ["0.0.0.0/0"] + europe-west1 = ["0.0.0.0/0"] + europe-west2 = ["0.0.0.0/0"] + europe-west3 = ["0.0.0.0/0"] + europe-west4 = ["0.0.0.0/0"] + europe-west6 = ["0.0.0.0/0"] + asia-east1 = ["0.0.0.0/0"] + asia-east2 = ["0.0.0.0/0"] + asia-northeast1 = ["0.0.0.0/0"] + asia-northeast2 = ["0.0.0.0/0"] + asia-northeast3 = ["0.0.0.0/0"] + asia-south1 = ["0.0.0.0/0"] + asia-southeast1 = ["0.0.0.0/0"] + asia-southeast2 = ["0.0.0.0/0"] + australia-southeast1 = ["0.0.0.0/0"] + northamerica-northeast2 = ["0.0.0.0/0"] + } + allowed = [ + { + protocol = "tcp" + port = "80" + } + ] + }, # Allows Leonardo proxy traffic on port 443 { name-prefix = "leonardo-allow-https" @@ -395,6 +433,20 @@ groups { } } +galaxyVm { + # Debian 12 image — galaxy-k8s-boot bootstrap requires a standard Debian VM, not COS + sourceImage = "projects/debian-cloud/global/images/family/debian-12" + machineType = "n1-highmem-8" + bootDiskSizeGb = 50 + postgresDiskSizeGb = 10 + # Suffix appended to the NFS disk name to derive the postgres disk name. + # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix). + postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix} + gcpBatchServiceAccountEmail = "" + gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git" + gitBranch = "master" +} + gke { cluster { location = "us-central1-a", diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala index 45a3e26f38..42c7a53737 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala @@ -131,6 +131,19 @@ object Config { ) } + implicit private val galaxyVmConfigReader: ValueReader[GalaxyVmConfig] = ValueReader.relative { config => + GalaxyVmConfig( + config.as[GceCustomImage]("sourceImage"), + config.as[MachineTypeName]("machineType"), + config.as[DiskSize]("bootDiskSizeGb"), + config.as[DiskSize]("postgresDiskSizeGb"), + config.as[String]("postgresDiskNameSuffix"), + config.as[String]("gcpBatchServiceAccountEmail"), + config.as[String]("gitRepo"), + config.as[String]("gitBranch") + ) + } + implicit private val gceConfigReader: ValueReader[GceConfig] = ValueReader.relative { config => GceConfig( config.as[GceCustomImage]("customGceImage"), @@ -497,6 +510,7 @@ object Config { val googleGroupsConfig = config.as[GoogleGroupsConfig]("groups") val dataprocConfig = config.as[DataprocConfig]("dataproc") + val galaxyVmConfig = config.as[GalaxyVmConfig]("galaxyVm") val gceConfig = config.as[GceConfig]("gce") val imageConfig = config.as[ImageConfig]("image") val prometheusConfig = config.as[PrometheusConfig]("prometheus") @@ -901,13 +915,13 @@ object Config { vpcConfig.networkTag, org.broadinstitute.dsde.workbench.leonardo.http.ConfigReader.appConfig.terraAppSetupChart, gkeIngressConfig, - gkeGalaxyAppConfig, gkeCromwellAppConfig, gkeCustomAppConfig, gkeAllowedAppConfig, appMonitorConfig, gkeClusterConfig, proxyConfig, - gkeGalaxyDiskConfig + gkeGalaxyDiskConfig, + galaxyVmConfig ) } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala index 2b3fc4a587..53e2562db6 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala @@ -1,7 +1,9 @@ package org.broadinstitute.dsde.workbench.leonardo.config -import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceAccountName +import org.broadinstitute.dsde.workbench.google2.{KubernetesSerializableName, MachineTypeName} +import KubernetesSerializableName.ServiceAccountName import org.broadinstitute.dsde.workbench.leonardo.AppType._ +import org.broadinstitute.dsde.workbench.leonardo.CustomImage.GceCustomImage import org.broadinstitute.dsde.workbench.leonardo._ import org.broadinstitute.dsp.{ChartName, ChartVersion} @@ -92,6 +94,17 @@ final case class CustomAppConfig(chartName: ChartName, val appType: AppType = AppType.Custom } +final case class GalaxyVmConfig( + sourceImage: GceCustomImage, + machineType: MachineTypeName, + bootDiskSizeGb: DiskSize, + postgresDiskSizeGb: DiskSize, + postgresDiskNameSuffix: String, + gcpBatchServiceAccountEmail: String, + gitRepo: String, + gitBranch: String +) + final case class ContainerRegistryUsername(asString: String) extends AnyVal final case class ContainerRegistryPassword(asString: String) extends AnyVal final case class ContainerRegistryCredentials(username: ContainerRegistryUsername, password: ContainerRegistryPassword) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala index 9bb06e9548..0c55f089db 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala @@ -36,7 +36,7 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie headers = Headers.empty ) ) - .handleError(_ => false) + .handleErrorWith(e => logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false)) case _ => F.pure(false) } } yield res diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala index 8c33c93579..97a6c984bd 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala @@ -999,10 +999,15 @@ class LeoPubsubMessageSubscriber[F[_]]( } yield res } else F.unit + // Galaxy uses VM-based deployment: no GKE cluster or nodepool is created. + // The VM is launched inside createAndPollApp instead. + effectiveClusterOrNodepoolOp = + if (msg.appType == AppType.Galaxy) F.unit else createClusterOrNodepoolOp + // build asynchronous task task = for { // parallelize disk creation and cluster/nodepool monitoring - _ <- List(createDiskOp, createSecondDiskOp, createClusterOrNodepoolOp).parSequence_ + _ <- List(createDiskOp, createSecondDiskOp, effectiveClusterOrNodepoolOp).parSequence_ // create and monitor app _ <- getGkeAlgFromRegistry() diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala index 7fa725c8cf..3c54efd044 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValues.scala @@ -1,10 +1,8 @@ package org.broadinstitute.dsde.workbench.leonardo package util -import org.broadinstitute.dsde.workbench.google2.DiskName import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName} -import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.Autopilot import org.broadinstitute.dsde.workbench.leonardo.dao.CustomAppService import org.broadinstitute.dsde.workbench.leonardo.http.kubernetesProxyHost @@ -15,111 +13,6 @@ import org.broadinstitute.dsp.Release import java.nio.charset.StandardCharsets private[leonardo] object BuildHelmChartValues { - def buildGalaxyChartOverrideValuesString(config: GKEInterpreterConfig, - appName: AppName, - release: Release, - cluster: KubernetesCluster, - nodepoolName: NodepoolName, - userEmail: WorkbenchEmail, - customEnvironmentVariables: Map[String, String], - ksa: ServiceAccountName, - namespaceName: NamespaceName, - nfsDisk: PersistentDisk, - postgresDiskName: DiskName, - machineType: AppMachineType, - galaxyRestore: Option[GalaxyRestore] - ): List[String] = { - val k8sProxyHost = kubernetesProxyHost(cluster, config.proxyConfig.proxyDomain).address - val leoProxyhost = config.proxyConfig.getProxyServerHostName - val ingressPath = s"/proxy/google/v1/apps/${cluster.cloudContext.asString}/${appName.value}/galaxy" - val workspaceName = customEnvironmentVariables.getOrElse("WORKSPACE_NAME", "") - val workspaceNamespace = customEnvironmentVariables.getOrElse("WORKSPACE_NAMESPACE", "") - // Machine type info - val maxLimitMemory = machineType.memorySizeInGb - val maxLimitCpu = machineType.numOfCpus - val maxRequestMemory = maxLimitMemory - 22 - val maxRequestCpu = maxLimitCpu - 6 - - // Custom EV configs - val configs = customEnvironmentVariables.toList.zipWithIndex.flatMap { case ((k, v), i) => - List( - raw"""configs.$k=$v""", - raw"""extraEnv[$i].name=$k""", - raw"""extraEnv[$i].valueFrom.configMapKeyRef.name=${release.asString}-galaxykubeman-configs""", - raw"""extraEnv[$i].valueFrom.configMapKeyRef.key=$k""" - ) - } - - val galaxyRestoreSettings = galaxyRestore.fold(List.empty[String])(g => - List( - raw"""restore.persistence.nfs.galaxy.pvcID=${g.galaxyPvcId.asString}""", - raw"""galaxy.persistence.existingClaim=${release.asString}-galaxy-galaxy-pvc""" - ) - ) - // Using the string interpolator raw""" since the chart keys include quotes to escape Helm - // value override special characters such as '.' - // https://helm.sh/docs/intro/using_helm/#the-format-and-limitations-of---set - List( - // Storage class configs - raw"""nfs.storageClass.name=nfs-${release.asString}""", - raw"""galaxy.persistence.storageClass=nfs-${release.asString}""", - // Node selector config: this ensures the app is run on the user's nodepool - raw"""galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - raw"""nfs.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - raw"""galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: ${nodepoolName.value}""", - raw"""galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=${nodepoolName.value}""", - // Ingress configs - raw"""galaxy.ingress.path=${ingressPath}""", - raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://${k8sProxyHost}""", - raw"""galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=${leoProxyhost}""", - raw"""galaxy.ingress.hosts[0].host=${k8sProxyHost}""", - raw"""galaxy.ingress.hosts[0].paths[0].path=${ingressPath}""", - raw"""galaxy.ingress.tls[0].hosts[0]=${k8sProxyHost}""", - raw"""galaxy.ingress.tls[0].secretName=tls-secret""", - // CVMFS configs - raw"""cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-${release.asString}""", - raw"""cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache""", - // Galaxy configs - raw"""galaxy.configs.galaxy\.yml.galaxy.single_user=${userEmail.value}""", - raw"""galaxy.configs.galaxy\.yml.galaxy.admin_users=${userEmail.value}""", - raw"""galaxy.terra.launch.workspace=${workspaceName}""", - raw"""galaxy.terra.launch.namespace=${workspaceNamespace}""", - raw"""galaxy.terra.launch.apiURL=${config.galaxyAppConfig.orchUrl.value}""", - raw"""galaxy.terra.launch.drsURL=${config.galaxyAppConfig.drsUrl.value}""", - // Tusd ingress configs - raw"""galaxy.tusd.ingress.hosts[0].host=${k8sProxyHost}""", - raw"""galaxy.tusd.ingress.hosts[0].paths[0].path=${ingressPath}/api/upload/resumable_upload""", - raw"""galaxy.tusd.ingress.tls[0].hosts[0]=${k8sProxyHost}""", - raw"""galaxy.tusd.ingress.tls[0].secretName=tls-secret""", - // Set RabbitMQ storage class - raw"""galaxy.rabbitmq.persistence.storageClassName=nfs-${release.asString}""", - // Set Machine Type specs - raw"""galaxy.jobs.maxLimits.memory=${maxLimitMemory}""", - raw"""galaxy.jobs.maxLimits.cpu=${maxLimitCpu}""", - raw"""galaxy.jobs.maxRequests.memory=${maxRequestMemory}""", - raw"""galaxy.jobs.maxRequests.cpu=${maxRequestCpu}""", - raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=${maxRequestMemory}""", - raw"""galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=${maxRequestCpu}""", - // RBAC configs - raw"""galaxy.serviceAccount.create=false""", - raw"""galaxy.serviceAccount.name=${ksa.value}""", - raw"""rbac.serviceAccount=${ksa.value}""", - // Persistence configs - raw"""persistence.nfs.name=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}""", - raw"""persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=${nfsDisk.name.value}""", - raw"""persistence.nfs.size=${nfsDisk.size.gb.toString}Gi""", - raw"""persistence.postgres.name=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}""", - raw"""galaxy.postgresql.galaxyDatabasePassword=${config.galaxyAppConfig.postgresPassword.value}""", - raw"""persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=${postgresDiskName.value}""", - raw"""persistence.postgres.size=${config.galaxyDiskConfig.postgresDiskSizeGB.gb.toString}Gi""", - raw"""nfs.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.nfsPersistenceName}-pvc""", - raw"""nfs.persistence.size=${nfsDisk.size.gb.toString}Gi""", - raw"""galaxy.postgresql.persistence.existingClaim=${namespaceName.value}-${config.galaxyDiskConfig.postgresPersistenceName}-pvc""", - // Note Galaxy pvc claim is the nfs disk size minus 50G - raw"""galaxy.persistence.size=${(nfsDisk.size.gb - 50).toString}Gi""" - ) ++ configs ++ galaxyRestoreSettings - } - def buildCromwellAppChartOverrideValuesString(config: GKEInterpreterConfig, appName: AppName, cluster: KubernetesCluster, diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index f3d609f228..b42d783d04 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -6,7 +6,17 @@ import cats.effect.Async import cats.mtl.Ask import cats.syntax.all._ import com.google.auth.oauth2.GoogleCredentials -import com.google.cloud.compute.v1.Disk +import com.google.cloud.compute.v1.{ + AccessConfig, + AttachedDisk, + AttachedDiskInitializeParams, + Instance, + Items, + Metadata, + NetworkInterface, + ServiceAccount, + Tags +} import com.google.container.v1._ import fs2.io.file.Files import org.broadinstitute.dsde.workbench.DoneCheckable @@ -27,26 +37,27 @@ import org.broadinstitute.dsde.workbench.google2.{ streamFUntilDone, streamUntilDoneOrTimeout, tracedRetryF, - DiskName, GoogleComputeService, GoogleDiskService, GoogleResourceService, KubernetesClusterNotFoundException, - PvName, + NetworkName, + RegionName, + SubnetworkName, ZoneName } +import org.broadinstitute.dsde.workbench.util2.InstanceName import org.broadinstitute.dsde.workbench.leonardo.dao.{AppDAO, AppDescriptorDAO} import org.broadinstitute.dsde.workbench.leonardo.db._ import org.broadinstitute.dsde.workbench.leonardo.http._ import org.broadinstitute.dsde.workbench.leonardo.http.service.AppNotFoundException +import org.broadinstitute.dsde.workbench.leonardo.dao.google.{buildMachineTypeUri, buildSubnetworkUri} import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{ buildAllowedAppChartOverrideValuesString, buildCromwellAppChartOverrideValuesString, - buildCustomChartOverrideValuesString, - buildGalaxyChartOverrideValuesString + buildCustomChartOverrideValuesString } import org.broadinstitute.dsde.workbench.leonardo.model.LeoException -import org.broadinstitute.dsde.workbench.leonardo.monitor.PubsubHandleMessageError.PubsubKubernetesError import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._ import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject} import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail} @@ -387,21 +398,52 @@ class GKEInterpreter[F[_]]( ) ) app = dbApp.app - namespaceName = app.appResources.namespace dbCluster = dbApp.cluster - gkeClusterId = dbCluster.getClusterId googleProject = params.googleProject - // TODO: This DB query might not be needed if it makes sense to add diskId in App model (will revisit in next PR) diskOpt <- appQuery.getDiskId(app.id).transaction diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId)) - // Create namespace and secrets + _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.") + + nfsDisk <- F.fromOption( + dbApp.app.appResources.disk, + AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}") + ) + + // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path. + _ <- app.appType match { + case AppType.Galaxy => + installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >> + persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void + + case _ => + createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx) + } + _ <- logger.info(ctx.loggingCtx)( - s"Begin App(${app.appName.value}) Creation." + s"Finished app creation for app ${app.appName.value}" ) - // Create KSA + readyTime <- F.realTimeInstant + _ <- appUsageQuery.recordStart(params.appId, readyTime) + _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction + } yield () + + // GKE/Helm path for non-Galaxy app types (Cromwell, Allowed, Custom). + private def createAndPollAppViaHelm( + params: CreateAppParams, + dbApp: GetAppResult, + app: App, + dbCluster: KubernetesCluster, + nfsDisk: PersistentDisk, + diskId: DiskId, + googleProject: GoogleProject, + ctx: AppContext + )(implicit ev: Ask[F, AppContext]): F[Unit] = { + val namespaceName = app.appResources.namespace + val gkeClusterId = dbCluster.getClusterId + for { ksaName <- F.fromOption( app.appResources.kubernetesServiceAccountName, AppCreationException( @@ -424,11 +466,6 @@ class GKEInterpreter[F[_]]( ) } - nfsDisk <- F.fromOption( - dbApp.app.appResources.disk, - AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}") - ) - helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) _ <- helmClient @@ -442,12 +479,8 @@ class GKEInterpreter[F[_]]( true ) .run(helmAuthContext) - // update KSA in DB _ <- appQuery.updateKubernetesServiceAccount(app.id, ksaName).transaction - // Associate GSA to newly created KSA - // This string is constructed based on Google requirements to associate a GSA to a KSA - // (https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity#creating_a_relationship_between_ksas_and_gsas) ksaToGsa = s"${googleProject.value}.svc.id.goog[${namespaceName.value}/${ksaName.value}]" call = F.fromFuture( F.delay( @@ -458,49 +491,15 @@ class GKEInterpreter[F[_]]( ) ) ) - retryConfig = RetryPredicates.retryConfigWithPredicates( - when409 - ) + retryConfig = RetryPredicates.retryConfigWithPredicates(when409) _ <- tracedRetryF(retryConfig)( call, s"googleIamDAO.addIamPolicyBindingOnServiceAccount for GSA ${gsa.value} & KSA ${ksaName.value}" ).compile.lastOrError - // TODO: validate app release is the same as restore release - appRestore: Option[AppRestore] <- persistentDiskQuery.getAppDiskRestore(diskId).transaction - galaxyRestore: Option[AppRestore.GalaxyRestore] = appRestore.flatMap { - case a: AppRestore.GalaxyRestore => Some(a) - case _: AppRestore.Other => None - } - nodepool = if (app.autopilot.isDefined) None else Some(dbApp.nodepool.nodepoolName) - // helm install and wait + _ <- app.appType match { - case AppType.Galaxy => - for { - machineType <- F.fromOption( - params.appMachineType, - new LeoException( - s"can't find machine config for ${googleProject.value}/${app.appName.value}. This should never happen", - traceId = Some(ctx.traceId) - ) - ) - _ <- installGalaxy( - helmAuthContext, - app.appName, - app.release, - app.chart, - dbCluster, - dbApp.nodepool.nodepoolName, // https://broadworkbench.atlassian.net/browse/IA-4987 - namespaceName, - app.auditInfo.creator, - app.customEnvironmentVariables, - ksaName, - nfsDisk, - machineType, - galaxyRestore - ) - } yield () case AppType.Cromwell => installCromwellApp( helmAuthContext, @@ -551,60 +550,15 @@ class GKEInterpreter[F[_]]( F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP")) } - _ <- logger.info(ctx.loggingCtx)( - s"Finished app creation for app ${app.appName.value} in cluster ${gkeClusterId.toString}" - ) - _ <- app.appType match { - case AppType.Galaxy => - if (galaxyRestore.isDefined) persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void - else - for { - pvcs <- kubeService.listPersistentVolumeClaims(gkeClusterId, - KubernetesNamespace(app.appResources.namespace) - ) - - _ <- pvcs - // We added an extra -galaxy here: https://github.com/galaxyproject/galaxykubeman-helm/blob/f7f27be74c213deda3ae53122[…]959c96480bb21f/galaxykubeman/templates/config-setup-galaxy.yaml - .find(pvc => pvc.getMetadata.getName == s"${app.release.asString}-galaxy-galaxy-pvc") - .fold( - F.raiseError[Unit]( - PubsubKubernetesError(AppError("Fail to retrieve pvc ids", - ctx.now, - ErrorAction.CreateApp, - ErrorSource.App, - None, - Some(ctx.traceId) - ), - Some(app.id), - false, - None, - None, - None - ) - ) - ) { galaxyPvc => - val galaxyDiskRestore = AppRestore.GalaxyRestore( - PvcId(galaxyPvc.getMetadata.getUid), - app.id - ) - persistentDiskQuery - .updateGalaxyDiskRestore(diskId, galaxyDiskRestore) - .transaction - .void - } - } yield () case AppType.Cromwell => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction case AppType.Allowed => persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction case AppType.Custom => F.unit case _ => F.raiseError(AppCreationException(s"App type ${app.appType} not supported on GCP")) } - - readyTime <- F.realTimeInstant - _ <- appUsageQuery.recordStart(params.appId, readyTime) - _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction } yield () + } override def deleteAndPollCluster(params: DeleteClusterParams)(implicit ev: Ask[F, AppContext]): F[Unit] = for { @@ -725,77 +679,86 @@ class GKEInterpreter[F[_]]( // Resolve the cluster in Google googleClusterOpt <- gkeService.getCluster(gkeClusterId) - _ <- googleClusterOpt - .traverse { googleCluster => - val uninstallCharts = for { - helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) - - _ <- logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" + _ <- app.appType match { + case AppType.Galaxy => + // Galaxy runs on a VM: delete the GCE instance. Disks are retained for persistence. + for { + gp <- F.fromOption( + LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext), + new RuntimeException("Galaxy app cloud context should be a google project") ) - - // helm uninstall the app chart and wait - _ <- helmClient - .uninstall(app.release, config.galaxyAppConfig.uninstallKeepHistory) - .run(helmAuthContext) - - last <- streamFUntilDone( - kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), - config.monitorConfig.deleteApp.maxAttempts, - config.monitorConfig.deleteApp.interval - ).compile.lastOrError - - _ <- - if (!podDoneCheckable.isDone(last)) { - val msg = - s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last - .filterNot(isPodDone) - .map(_.name.value) - .mkString(", ")}" - logger.error(ctx.loggingCtx)(msg) >> - F.raiseError[Unit](AppDeletionException(msg)) - } else F.unit - - // helm uninstall the setup chart - _ <- helmClient - .uninstall( - getTerraAppSetupChartReleaseName(app.release), - config.galaxyAppConfig.uninstallKeepHistory - ) - .run(helmAuthContext) + instanceName = InstanceName(s"galaxy-${app.appName.value}") + zone = dbApp.app.appResources.disk.map(_.zone).getOrElse(ZoneName(config.clusterConfig.location.value)) + _ <- computeService + .deleteInstance(gp, zone, instanceName) + .void + .handleErrorWith { e => + logger.warn(ctx.loggingCtx)( + s"Failed to delete Galaxy VM ${instanceName.value}: ${e.getMessage}. Continuing with app deletion." + ) + } } yield () - uninstallCharts.handleErrorWith { e => - logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" - ) - } - } - - // delete the namespace only after the helm uninstall completes - _ <- kubeService.deleteNamespace(dbApp.cluster.getClusterId, - KubernetesNamespace(dbApp.app.appResources.namespace) - ) + case _ => + // GKE/Helm path for all other app types + googleClusterOpt + .traverse { googleCluster => + val uninstallCharts = for { + helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) + + _ <- logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" + ) - fa = kubeService - .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) - .map(!_) // mapping to inverse because booleanDoneCheckable defines `Done` when it becomes `true`...In this case, the namespace will exists for a while, and eventually becomes non-existent + _ <- helmClient + .uninstall(app.release, true) + .run(helmAuthContext) + + last <- streamFUntilDone( + kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), + config.monitorConfig.deleteApp.maxAttempts, + config.monitorConfig.deleteApp.interval + ).compile.lastOrError + + _ <- + if (!podDoneCheckable.isDone(last)) { + val msg = + s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last + .filterNot(isPodDone) + .map(_.name.value) + .mkString(", ")}" + logger.error(ctx.loggingCtx)(msg) >> + F.raiseError[Unit](AppDeletionException(msg)) + } else F.unit + + _ <- helmClient + .uninstall(getTerraAppSetupChartReleaseName(app.release), true) + .run(helmAuthContext) + } yield () + + uninstallCharts.handleErrorWith { e => + logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" + ) + } + } + .void >> + kubeService + .deleteNamespace(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) >> + streamUntilDoneOrTimeout( + kubeService + .namespaceExists(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) + .map(!_), + 60, + 5 seconds, + "delete namespace timed out" + ) + } - _ <- streamUntilDoneOrTimeout(fa, 60, 5 seconds, "delete namespace timed out") _ <- logger.info(ctx.loggingCtx)( - s"Delete app operation has finished for app ${app.appName.value} in cluster ${gkeClusterId.toString}" + s"Delete app operation has finished for app ${app.appName.value}" ) - appRestore: Option[AppRestore.GalaxyRestore] = dbApp.app.appResources.disk.flatMap(_.appRestore).flatMap { - case a: AppRestore.GalaxyRestore => Some(a) - case _: AppRestore.Other => None - } - _ <- appRestore.traverse { restore => - for { - _ <- kubeService.deletePv(dbCluster.getClusterId, PvName(s"pvc-${restore.galaxyPvcId.asString}")) - } yield () - } - _ <- if (!params.errorAfterDelete) { F.unit @@ -1016,29 +979,6 @@ class GKEInterpreter[F[_]]( } } yield () - private[leonardo] def getGalaxyPostgresDisk(diskName: DiskName, - namespaceName: NamespaceName, - project: GoogleProject, - zone: ZoneName - )(implicit traceId: Ask[F, AppContext]): F[Option[Disk]] = - for { - postgresDiskOpt <- googleDiskService - .getDisk( - project, - zone, - getGalaxyPostgresDiskName(diskName, config.galaxyDiskConfig.postgresDiskNameSuffix) - ) - res <- postgresDiskOpt match { - case Some(disk) => F.pure(Some(disk)) - case None => - googleDiskService.getDisk( - project, - zone, - getOldStyleGalaxyPostgresDiskName(namespaceName, config.galaxyDiskConfig.postgresDiskNameSuffix) - ) - } - } yield res - private[util] def installNginx(dbCluster: KubernetesCluster, googleCluster: Cluster)(implicit ev: Ask[F, AppContext] ): F[IP] = @@ -1085,94 +1025,204 @@ class GKEInterpreter[F[_]]( ) } yield loadBalancerIp - private[util] def installGalaxy(helmAuthContext: AuthContext, - appName: AppName, - release: Release, - chart: Chart, - dbCluster: KubernetesCluster, - nodepoolName: NodepoolName, - namespaceName: NamespaceName, - userEmail: WorkbenchEmail, - customEnvironmentVariables: Map[String, String], - kubernetesServiceAccount: ServiceAccountName, - nfsDisk: PersistentDisk, - machineType: AppMachineType, - galaxyRestore: Option[AppRestore.GalaxyRestore] - )(implicit - ev: Ask[F, AppContext] - ): F[Unit] = + private[util] def installGalaxyVm( + dbCluster: KubernetesCluster, + app: App, + nfsDisk: PersistentDisk, + googleProject: GoogleProject + )(implicit ev: Ask[F, AppContext]): F[Unit] = for { ctx <- ev.ask _ <- logger.info(ctx.loggingCtx)( - s"Installing helm chart $chart for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}" - ) - googleProject <- F.fromOption( - LeoLenses.cloudContextToGoogleProject.get(nfsDisk.cloudContext), - new RuntimeException("this should never happen. Galaxy disk's cloud context should be a google project") + s"Installing Galaxy VM for app ${app.appName.value} in project ${googleProject.value}" ) - postgresDiskNameOpt <- for { - disk <- getGalaxyPostgresDisk(nfsDisk.name, namespaceName, googleProject, nfsDisk.zone) - } yield disk.map(x => DiskName(x.getName)) - postgresDiskName <- F.fromOption( - postgresDiskNameOpt, - AppCreationException(s"No postgres disk found in google for app ${appName.value} ", traceId = Some(ctx.traceId)) + zoneParam = nfsDisk.zone + regionParam = RegionName(zoneParam.value.dropRight(2)) + + // Set up VPC and firewall + (network, subnetwork) <- vpcAlg.setUpProjectNetworkAndFirewalls( + SetUpProjectNetworkParams(googleProject, regionParam) ) - chartValues = buildGalaxyChartOverrideValuesString( - config, - appName, - release, - dbCluster, - nodepoolName, - userEmail, - customEnvironmentVariables, - kubernetesServiceAccount, - namespaceName, - nfsDisk, - postgresDiskName, - machineType, - galaxyRestore + // Load cloud-config content bundled from galaxy-k8s-boot bin/user_data.sh. + // Intentionally not fetched at runtime to avoid unexpected production changes. + // To update, sync manually from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh + userDataContent = scala.io.Source + .fromResource("init-resources/galaxy-user-data.sh") + .getLines() + .toList + .mkString("\n") + + // Derive postgres disk name using the same naming convention as the subscriber + postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, config.galaxyDiskConfig.postgresDiskNameSuffix) + + // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk) + pvSizeGi = math.max(1, nfsDisk.size.gb - 11) + pvSize = s"${pvSizeGi}Gi" + + // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default + gcpBatchSa = app.customEnvironmentVariables.getOrElse( + "gcp_batch_service_account_email", + config.galaxyVmConfig.gcpBatchServiceAccountEmail ) + // restore_galaxy flag + restoreGalaxy = app.customEnvironmentVariables.getOrElse("restore_galaxy", "false") + + // Disks + bootDisk = AttachedDisk + .newBuilder() + .setBoot(true) + .setAutoDelete(true) + .setInitializeParams( + AttachedDiskInitializeParams + .newBuilder() + .setSourceImage(config.galaxyVmConfig.sourceImage.asString) + .setDiskSizeGb(config.galaxyVmConfig.bootDiskSizeGb.gb) + .putAllLabels(Map("leonardo" -> "true").asJava) + .build() + ) + .build() + + // Galaxy data disk — device name must match what the bootstrap script expects + dataDisk = AttachedDisk + .newBuilder() + .setBoot(false) + .setDeviceName("galaxy-data") + .setAutoDelete(false) + .setInitializeParams( + AttachedDiskInitializeParams + .newBuilder() + .setDiskName(nfsDisk.name.value) + .setDiskSizeGb(nfsDisk.size.gb) + .setDiskType(nfsDisk.diskType.googleString(googleProject, zoneParam)) + .putAllLabels(Map("leonardo" -> "true").asJava) + .build() + ) + .build() + + // PostgreSQL disk — device name must match what the bootstrap script expects + postgresDisk = AttachedDisk + .newBuilder() + .setBoot(false) + .setDeviceName("galaxy-postgres-data") + .setAutoDelete(false) + .setInitializeParams( + AttachedDiskInitializeParams + .newBuilder() + .setDiskName(postgresDiskName.value) + .setDiskSizeGb(config.galaxyVmConfig.postgresDiskSizeGb.gb) + .putAllLabels(Map("leonardo" -> "true").asJava) + .build() + ) + .build() + + // Network interface with external IP + networkInterface = NetworkInterface + .newBuilder() + .setSubnetwork( + buildSubnetworkUri(googleProject, regionParam, subnetwork) + ) + .addAccessConfigs(AccessConfig.newBuilder().setName("Leonardo Galaxy VM external IP").build()) + .build() + + instanceName = InstanceName(s"galaxy-${app.appName.value}") + + instance = Instance + .newBuilder() + .setName(instanceName.value) + .setDescription("Leonardo Galaxy VM") + .setTags(Tags.newBuilder().addItems(config.vpcNetworkTag.value).build()) + .setMachineType(buildMachineTypeUri(zoneParam, config.galaxyVmConfig.machineType)) + .addNetworkInterfaces(networkInterface) + .addAllDisks(List(bootDisk, dataDisk, postgresDisk).asJava) + .addServiceAccounts( + ServiceAccount + .newBuilder() + .setEmail(app.googleServiceAccount.value) + .addAllScopes( + List( + "https://www.googleapis.com/auth/cloud-platform", + "https://www.googleapis.com/auth/logging.write" + ).asJava + ) + .build() + ) + .setMetadata( + Metadata + .newBuilder() + .addItems(Items.newBuilder().setKey("user-data").setValue(userDataContent).build()) + .addItems(Items.newBuilder().setKey("google-logging-enabled").setValue("true").build()) + .addItems(Items.newBuilder().setKey("gcp_batch_service_account_email").setValue(gcpBatchSa).build()) + .addItems(Items.newBuilder().setKey("persistent-volume-size").setValue(pvSize).build()) + .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restoreGalaxy).build()) + .addItems(Items.newBuilder().setKey("git-repo").setValue(config.galaxyVmConfig.gitRepo).build()) + .addItems(Items.newBuilder().setKey("git-branch").setValue(config.galaxyVmConfig.gitBranch).build()) + .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build()) + .addItems(Items.newBuilder().setKey("gcp-network").setValue(network.value).build()) + .addItems(Items.newBuilder().setKey("gcp-subnet").setValue(subnetwork.value).build()) + .build() + ) + .putAllLabels(Map("leonardo" -> "true").asJava) + .build() + + _ <- computeService.createInstance(googleProject, zoneParam, instance) + _ <- logger.info(ctx.loggingCtx)( - s"Chart override values are: ${chartValues.map(s => - if (s.contains("galaxyDatabasePassword")) "persistence.postgres.galaxyDatabasePassword=" - else s - )}" + s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP" ) - // Invoke helm - helmInstall = helmClient - .installChart( - release, - chart.name, - chart.version, - org.broadinstitute.dsp.Values(chartValues.mkString(",")), - false + // Poll until the instance has an external IP, then store it as the cluster's load balancer IP + // so that KubernetesDnsCache can resolve the proxy host. + externalIpOpt <- streamFUntilDone( + computeService.getInstance(googleProject, zoneParam, instanceName).map { instanceOpt => + instanceOpt.flatMap { inst => + import scala.jdk.CollectionConverters._ + for { + iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption) + cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption) + natIp <- Option(cfg.getNatIP).filter(_.nonEmpty) + } yield IP(natIp) + } + }, + config.monitorConfig.createApp.maxAttempts, + config.monitorConfig.createApp.interval + ).compile.lastOrError + + externalIp <- F.fromOption( + externalIpOpt, + AppCreationException( + s"Galaxy VM ${instanceName.value} did not obtain an external IP after ${config.monitorConfig.createApp.interruptAfter}", + traceId = Some(ctx.traceId) ) - .run(helmAuthContext) + ) - // Currently we always retry. - // The main failure mode here is helm install, which does not have easily interpretable error codes - retryConfig = RetryPredicates.retryAllConfig - _ <- tracedRetryF(retryConfig)( - helmInstall, - s"helm install for app ${appName.value} in project ${dbCluster.cloudContext.asString}" - ).compile.lastOrError + _ <- logger.info(ctx.loggingCtx)( + s"Galaxy VM ${instanceName.value} has external IP ${externalIp.asString}; storing in cluster async fields" + ) - googleProject <- F.fromOption( - LeoLenses.cloudContextToGoogleProject.get(dbCluster.cloudContext), - new RuntimeException("trying to create a non google runtime in GKEInterpreter. This should never happen") + // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache + _ <- kubernetesClusterQuery + .updateAsyncFields( + dbCluster.id, + KubernetesClusterAsyncFields( + externalIp, + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) + ) + ) + .transaction + _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction + + _ <- logger.info(ctx.loggingCtx)( + s"Polling Galaxy readiness for app ${app.appName.value} at ${externalIp.asString}:80" ) - // Poll galaxy until it starts up - // TODO potentially add other status checks for pod readiness, beyond just HTTP polling the galaxy-web service - // Wait a bit before starting polling for the app status check as the certificates might not be quite ready yet - // This seems to only impact galaxy, See https://broadworkbench.atlassian.net/browse/IA-4551 - _ <- F.sleep(60 seconds) + + // Wait for Galaxy's nginx ingress to respond on port 80 isDone <- streamFUntilDone( - appDao.isProxyAvailable(googleProject, appName, ServiceName("galaxy"), ctx.traceId), + appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId), config.monitorConfig.createApp.maxAttempts, config.monitorConfig.createApp.interval ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError @@ -1180,9 +1230,9 @@ class GKEInterpreter[F[_]]( _ <- if (!isDone) { val msg = - s"Galaxy installation has failed or timed out for app ${appName.value} in cluster ${dbCluster.getClusterId.toString}" + s"Galaxy VM installation has failed or timed out for app ${app.appName.value} in project ${googleProject.value}" logger.error(ctx.loggingCtx)(msg) >> - F.raiseError[Unit](AppCreationException(msg)) + F.raiseError[Unit](AppCreationException(msg, traceId = Some(ctx.traceId))) } else F.unit } yield () @@ -1833,14 +1883,14 @@ final case class GKEInterpreterConfig(leoUrlBase: URL, vpcNetworkTag: NetworkTag, terraAppSetupChartConfig: TerraAppSetupChartConfig, ingressConfig: KubernetesIngressConfig, - galaxyAppConfig: GalaxyAppConfig, cromwellAppConfig: CromwellAppConfig, customAppConfig: CustomAppConfig, allowedAppConfig: AllowedAppConfig, monitorConfig: AppMonitorConfig, clusterConfig: KubernetesClusterConfig, proxyConfig: ProxyConfig, - galaxyDiskConfig: GalaxyDiskConfig + galaxyDiskConfig: GalaxyDiskConfig, + galaxyVmConfig: GalaxyVmConfig ) final case class TerraAppSetupChartConfig( diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala index 6ea504a907..a408cd4559 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/BuildHelmChartValuesSpec.scala @@ -4,7 +4,6 @@ package util import org.broadinstitute.dsde.workbench.google2.DiskName import org.broadinstitute.dsde.workbench.google2.GKEModels.NodepoolName import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.{NamespaceName, ServiceAccountName} -import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.CommonTestData.{makePersistentDisk, userEmail, userEmail2} import org.broadinstitute.dsde.workbench.leonardo.KubernetesTestData.{makeCustomAppService, makeKubeCluster} import org.broadinstitute.dsde.workbench.leonardo.config.Config @@ -16,177 +15,6 @@ import org.scalatest.flatspec.AnyFlatSpecLike class BuildHelmChartValuesSpec extends AnyFlatSpecLike with LeonardoTestSuite { - it should "build Galaxy override values string" in { - val savedCluster1 = makeKubeCluster(1) - val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy)) - val res = buildGalaxyChartOverrideValuesString( - Config.gkeInterpConfig, - AppName("app1"), - Release("app1-galaxy-rls"), - savedCluster1, - NodepoolName("pool1"), - userEmail, - Map("WORKSPACE_NAME" -> "test-workspace", - "WORKSPACE_BUCKET" -> "gs://test-bucket", - "WORKSPACE_NAMESPACE" -> "dsp-leo-test1" - ), - ServiceAccountName("app1-galaxy-ksa"), - NamespaceName("ns"), - savedDisk1, - DiskName("disk1-gxy-postres-disk"), - AppMachineType(23, 7), - None - ) - - res.mkString( - "," - ) shouldBe - """nfs.storageClass.name=nfs-app1-galaxy-rls,""" + - """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" + - """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" + - """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" + - """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.tls[0].secretName=tls-secret,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" + - """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" + - """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" + - """galaxy.terra.launch.workspace=test-workspace,""" + - """galaxy.terra.launch.namespace=dsp-leo-test1,""" + - """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" + - """galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" + - """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" + - """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" + - """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" + - """galaxy.jobs.maxLimits.memory=23,""" + - """galaxy.jobs.maxLimits.cpu=7,""" + - """galaxy.jobs.maxRequests.memory=1,""" + - """galaxy.jobs.maxRequests.cpu=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" + - """galaxy.serviceAccount.create=false,""" + - """galaxy.serviceAccount.name=app1-galaxy-ksa,""" + - """rbac.serviceAccount=app1-galaxy-ksa,persistence.nfs.name=ns-nfs-disk,""" + - """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,""" + - """persistence.nfs.size=250Gi,""" + - """persistence.postgres.name=ns-postgres-disk,""" + - """galaxy.postgresql.galaxyDatabasePassword=replace-me,""" + - """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres-disk,""" + - """persistence.postgres.size=10Gi,""" + - """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" + - """nfs.persistence.size=250Gi,""" + - """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" + - """galaxy.persistence.size=200Gi,""" + - """configs.WORKSPACE_NAME=test-workspace,""" + - """extraEnv[0].name=WORKSPACE_NAME,extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" + - """configs.WORKSPACE_BUCKET=gs://test-bucket,""" + - """extraEnv[1].name=WORKSPACE_BUCKET,""" + - """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" + - """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" + - """extraEnv[2].name=WORKSPACE_NAMESPACE,""" + - """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE""" - } - - it should "build Galaxy override values string with restore info" in { - val savedCluster1 = makeKubeCluster(1) - val savedDisk1 = makePersistentDisk(Some(DiskName("disk1")), Some(FormattedBy.Galaxy)) - val result = - buildGalaxyChartOverrideValuesString( - Config.gkeInterpConfig, - AppName("app1"), - Release("app1-galaxy-rls"), - savedCluster1, - NodepoolName("pool1"), - userEmail, - Map("WORKSPACE_NAME" -> "test-workspace", - "WORKSPACE_BUCKET" -> "gs://test-bucket", - "WORKSPACE_NAMESPACE" -> "dsp-leo-test1" - ), - ServiceAccountName("app1-galaxy-ksa"), - NamespaceName("ns"), - savedDisk1, - DiskName("disk1-gxy-postres"), - AppMachineType(23, 7), - Some( - GalaxyRestore(PvcId("galaxy-pvc-id"), AppId(123)) - ) - ) - result.mkString( - "," - ) shouldBe - """nfs.storageClass.name=nfs-app1-galaxy-rls,""" + - """galaxy.persistence.storageClass=nfs-app1-galaxy-rls,""" + - """galaxy.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """nfs.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.configs.job_conf\.yml.runners.k8s.k8s_node_selector=cloud.google.com/gke-nodepool: pool1,""" + - """galaxy.postgresql.master.nodeSelector.cloud\.google\.com/gke-nodepool=pool1,""" + - """galaxy.ingress.path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-from=https://1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.annotations.nginx\.ingress\.kubernetes\.io/proxy-redirect-to=https://leo,""" + - """galaxy.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy,""" + - """galaxy.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.ingress.tls[0].secretName=tls-secret,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.storageClass=nfs-app1-galaxy-rls,""" + - """cvmfs.cvmfscsi.cache.alien.pvc.name=cvmfs-alien-cache,""" + - """galaxy.configs.galaxy\.yml.galaxy.single_user=user1@example.com,""" + - """galaxy.configs.galaxy\.yml.galaxy.admin_users=user1@example.com,""" + - """galaxy.terra.launch.workspace=test-workspace,""" + - """galaxy.terra.launch.namespace=dsp-leo-test1,""" + - """galaxy.terra.launch.apiURL=https://firecloud-orchestration.dsde-dev.broadinstitute.org/api/,""" + - """galaxy.terra.launch.drsURL=https://drshub.dsde-dev.broadinstitute.org/api/v4/drs/resolve,""" + - """galaxy.tusd.ingress.hosts[0].host=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.hosts[0].paths[0].path=/proxy/google/v1/apps/dsp-leo-test1/app1/galaxy/api/upload/resumable_upload,""" + - """galaxy.tusd.ingress.tls[0].hosts[0]=1455694897.jupyter.firecloud.org,""" + - """galaxy.tusd.ingress.tls[0].secretName=tls-secret,""" + - """galaxy.rabbitmq.persistence.storageClassName=nfs-app1-galaxy-rls,""" + - """galaxy.jobs.maxLimits.memory=23,""" + - """galaxy.jobs.maxLimits.cpu=7,""" + - """galaxy.jobs.maxRequests.memory=1,""" + - """galaxy.jobs.maxRequests.cpu=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_mem=1,""" + - """galaxy.jobs.rules.tpv_rules_local\.yml.destinations.k8s.max_cores=1,""" + - """galaxy.serviceAccount.create=false,""" + - """galaxy.serviceAccount.name=app1-galaxy-ksa,""" + - """rbac.serviceAccount=app1-galaxy-ksa,""" + - """persistence.nfs.name=ns-nfs-disk,""" + - """persistence.nfs.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1,persistence.nfs.size=250Gi,""" + - """persistence.postgres.name=ns-postgres-disk,""" + - """galaxy.postgresql.galaxyDatabasePassword=replace-me,""" + - """persistence.postgres.persistentVolume.extraSpec.gcePersistentDisk.pdName=disk1-gxy-postres,""" + - """persistence.postgres.size=10Gi,""" + - """nfs.persistence.existingClaim=ns-nfs-disk-pvc,""" + - """nfs.persistence.size=250Gi,""" + - """galaxy.postgresql.persistence.existingClaim=ns-postgres-disk-pvc,""" + - """galaxy.persistence.size=200Gi,""" + - """configs.WORKSPACE_NAME=test-workspace,""" + - """extraEnv[0].name=WORKSPACE_NAME,""" + - """extraEnv[0].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[0].valueFrom.configMapKeyRef.key=WORKSPACE_NAME,""" + - """configs.WORKSPACE_BUCKET=gs://test-bucket,""" + - """extraEnv[1].name=WORKSPACE_BUCKET,""" + - """extraEnv[1].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[1].valueFrom.configMapKeyRef.key=WORKSPACE_BUCKET,""" + - """configs.WORKSPACE_NAMESPACE=dsp-leo-test1,""" + - """extraEnv[2].name=WORKSPACE_NAMESPACE,""" + - """extraEnv[2].valueFrom.configMapKeyRef.name=app1-galaxy-rls-galaxykubeman-configs,""" + - """extraEnv[2].valueFrom.configMapKeyRef.key=WORKSPACE_NAMESPACE,""" + - """restore.persistence.nfs.galaxy.pvcID=galaxy-pvc-id,""" + - """galaxy.persistence.existingClaim=app1-galaxy-rls-galaxy-galaxy-pvc""".stripMargin - } - it should "build Cromwell override values string" in { val savedCluster1 = makeKubeCluster(1) val savedDisk1 = makePersistentDisk(Some(DiskName("disk1"))) From 8b9dffd1c3eb8828e3f5eab48b1def580bb3f7d0 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 7 Apr 2026 13:51:53 -0400 Subject: [PATCH 02/17] fix format --- .../leonardo/dao/HttpJupyterDAO.scala | 4 +- .../leonardo/util/GKEInterpreter.scala | 88 +++++++++---------- 2 files changed, 47 insertions(+), 45 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala index 0c55f089db..4d09a08109 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpJupyterDAO.scala @@ -36,7 +36,9 @@ class HttpJupyterDAO[F[_]](val runtimeDnsCache: RuntimeDnsCache[F], client: Clie headers = Headers.empty ) ) - .handleErrorWith(e => logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false)) + .handleErrorWith(e => + logger.warn(e)(s"isProxyAvailable failed for ${cloudContext}/${runtimeName}").as(false) + ) case _ => F.pure(false) } } yield res diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index b42d783d04..782f4adabe 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -701,48 +701,46 @@ class GKEInterpreter[F[_]]( case _ => // GKE/Helm path for all other app types - googleClusterOpt - .traverse { googleCluster => - val uninstallCharts = for { - helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) + googleClusterOpt.traverse { googleCluster => + val uninstallCharts = for { + helmAuthContext <- getHelmAuthContext(googleCluster, dbCluster, namespaceName) - _ <- logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" - ) + _ <- logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}" + ) - _ <- helmClient - .uninstall(app.release, true) - .run(helmAuthContext) - - last <- streamFUntilDone( - kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), - config.monitorConfig.deleteApp.maxAttempts, - config.monitorConfig.deleteApp.interval - ).compile.lastOrError - - _ <- - if (!podDoneCheckable.isDone(last)) { - val msg = - s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last - .filterNot(isPodDone) - .map(_.name.value) - .mkString(", ")}" - logger.error(ctx.loggingCtx)(msg) >> - F.raiseError[Unit](AppDeletionException(msg)) - } else F.unit - - _ <- helmClient - .uninstall(getTerraAppSetupChartReleaseName(app.release), true) - .run(helmAuthContext) - } yield () - - uninstallCharts.handleErrorWith { e => - logger.info(ctx.loggingCtx)( - s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" - ) - } + _ <- helmClient + .uninstall(app.release, true) + .run(helmAuthContext) + + last <- streamFUntilDone( + kubeService.listPodStatus(dbCluster.getClusterId, KubernetesNamespace(namespaceName)), + config.monitorConfig.deleteApp.maxAttempts, + config.monitorConfig.deleteApp.interval + ).compile.lastOrError + + _ <- + if (!podDoneCheckable.isDone(last)) { + val msg = + s"Helm deletion has failed or timed out for app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString}. The following pods are not in a terminal state: ${last + .filterNot(isPodDone) + .map(_.name.value) + .mkString(", ")}" + logger.error(ctx.loggingCtx)(msg) >> + F.raiseError[Unit](AppDeletionException(msg)) + } else F.unit + + _ <- helmClient + .uninstall(getTerraAppSetupChartReleaseName(app.release), true) + .run(helmAuthContext) + } yield () + + uninstallCharts.handleErrorWith { e => + logger.info(ctx.loggingCtx)( + s"Uninstalling release ${app.release.asString} for ${app.appType.toString} app ${app.appName.value} in cluster ${dbCluster.getClusterId.toString} failed with error ${e.getMessage}" + ) } - .void >> + }.void >> kubeService .deleteNamespace(dbApp.cluster.getClusterId, KubernetesNamespace(dbApp.app.appResources.namespace)) >> streamUntilDoneOrTimeout( @@ -1056,11 +1054,13 @@ class GKEInterpreter[F[_]]( .mkString("\n") // Derive postgres disk name using the same naming convention as the subscriber - postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, config.galaxyDiskConfig.postgresDiskNameSuffix) + postgresDiskName = GKEAlgebra.getGalaxyPostgresDiskName(nfsDisk.name, + config.galaxyDiskConfig.postgresDiskNameSuffix + ) // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk) pvSizeGi = math.max(1, nfsDisk.size.gb - 11) - pvSize = s"${pvSizeGi}Gi" + pvSize = s"${pvSizeGi}Gi" // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default gcpBatchSa = app.customEnvironmentVariables.getOrElse( @@ -1181,9 +1181,9 @@ class GKEInterpreter[F[_]]( instanceOpt.flatMap { inst => import scala.jdk.CollectionConverters._ for { - iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption) - cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption) - natIp <- Option(cfg.getNatIP).filter(_.nonEmpty) + iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption) + cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption) + natIp <- Option(cfg.getNatIP).filter(_.nonEmpty) } yield IP(natIp) } }, From ee1c16a98befa34f6d983caa5231c90f1fc54164 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 7 Apr 2026 14:41:45 -0400 Subject: [PATCH 03/17] Grant pet SA GCP Batch IAM roles at Galaxy VM creation time At Galaxy VM creation, grant the pet SA roles/batch.jobsEditor on the user's project so it can submit and monitor GCP Batch jobs. When the Batch SA lives in the same project, also grant serviceAccountUser on it; cross-project Batch SAs must have that binding configured externally. Co-Authored-By: Claude Sonnet 4.6 --- .../leonardo/util/GKEInterpreter.scala | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 782f4adabe..335e88fb24 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -60,6 +60,7 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{ import org.broadinstitute.dsde.workbench.leonardo.model.LeoException import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._ import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject} +import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail} import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics import org.broadinstitute.dsp._ @@ -1170,6 +1171,45 @@ class GKEInterpreter[F[_]]( _ <- computeService.createInstance(googleProject, zoneParam, instance) + // Grant the pet SA permission to submit and monitor GCP Batch jobs in this project. + // Galaxy uses the VM's attached SA (pet SA) to call the Batch API. + _ <- { + val call = F.fromFuture( + F.delay( + googleIamDAO + .addRoles(googleProject, app.googleServiceAccount, IamMemberTypes.ServiceAccount, Set("roles/batch.jobsEditor")) + .void + ) + ) + val retryConfig = RetryPredicates.retryConfigWithPredicates(when409) + tracedRetryF(retryConfig)( + call, + s"googleIamDAO.addRoles(batch.jobsEditor) for pet SA ${app.googleServiceAccount.value} in project ${googleProject.value}" + ).compile.lastOrError + } + + // Grant the pet SA serviceAccountUser on the Batch SA so it can specify it as the job runner identity. + // Only attempted when the Batch SA lives in the same project as the user (i.e. not a shared platform SA). + // For cross-project Batch SAs, this binding must be set up externally (e.g. via Terraform). + gcpBatchSaProject = GoogleProject(gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", "")) + _ <- + if (gcpBatchSaProject == googleProject) + F.fromFuture( + F.delay( + googleIamDAO.addIamPolicyBindingOnServiceAccount( + googleProject, + WorkbenchEmail(gcpBatchSa), + app.googleServiceAccount, + Set("roles/iam.serviceAccountUser") + ) + ) + ) + else + logger.info(ctx.loggingCtx)( + s"Batch SA $gcpBatchSa is in a different project ($gcpBatchSaProject) than ${googleProject.value}; " + + s"skipping serviceAccountUser binding — must be configured externally" + ) + _ <- logger.info(ctx.loggingCtx)( s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP" ) From d48a60f6afee52371935452ce9567f310362680f Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 7 Apr 2026 16:09:40 -0400 Subject: [PATCH 04/17] add SA permissions and fix unit tests --- .../leonardo/util/GKEInterpreter.scala | 10 ++- .../LeoPubsubMessageSubscriberSpec.scala | 87 +++++++++++-------- .../leonardo/util/VPCInterpreterSpec.scala | 10 ++- 3 files changed, 68 insertions(+), 39 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 335e88fb24..b0f52276c1 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1177,7 +1177,11 @@ class GKEInterpreter[F[_]]( val call = F.fromFuture( F.delay( googleIamDAO - .addRoles(googleProject, app.googleServiceAccount, IamMemberTypes.ServiceAccount, Set("roles/batch.jobsEditor")) + .addRoles(googleProject, + app.googleServiceAccount, + IamMemberTypes.ServiceAccount, + Set("roles/batch.jobsEditor") + ) .void ) ) @@ -1191,7 +1195,9 @@ class GKEInterpreter[F[_]]( // Grant the pet SA serviceAccountUser on the Batch SA so it can specify it as the job runner identity. // Only attempted when the Batch SA lives in the same project as the user (i.e. not a shared platform SA). // For cross-project Batch SAs, this binding must be set up externally (e.g. via Terraform). - gcpBatchSaProject = GoogleProject(gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", "")) + gcpBatchSaProject = GoogleProject( + gcpBatchSa.split("@").lastOption.getOrElse("").replace(".iam.gserviceaccount.com", "") + ) _ <- if (gcpBatchSaProject == googleProject) F.fromFuture( diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala index d932498dc3..d7f351488d 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala @@ -10,7 +10,7 @@ import cats.mtl.Ask import cats.syntax.all._ import com.github.benmanes.caffeine.cache.Caffeine import com.google.api.gax.longrunning.OperationFuture -import com.google.cloud.compute.v1.{Disk, Operation} +import com.google.cloud.compute.v1.{AccessConfig, Disk, Instance, NetworkInterface, Operation} import com.google.protobuf.Timestamp import fs2.Stream import org.broadinstitute.dsde.workbench.google.GoogleStorageDAO @@ -21,13 +21,16 @@ import org.broadinstitute.dsde.workbench.google2.mock.{MockKubernetesService => import org.broadinstitute.dsde.workbench.google2.{ DiskName, GKEModels, + GoogleComputeService, GoogleDiskService, GoogleStorageService, KubernetesModels, MachineTypeName, - RegionName, + NetworkName, + SubnetworkName, ZoneName } +import org.broadinstitute.dsde.workbench.util2.InstanceName import org.broadinstitute.dsde.workbench.leonardo.AppRestore.GalaxyRestore import org.broadinstitute.dsde.workbench.leonardo.AsyncTaskProcessor.Task import org.broadinstitute.dsde.workbench.leonardo.CommonTestData._ @@ -97,6 +100,24 @@ class LeoPubsubMessageSubscriberSpec ): Future[Unit] = Future.successful(()) } val iamDAO = new MockGoogleIamDAO + + // Returns a GCE instance with external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests. + val galaxyComputeService: GoogleComputeService[IO] = new FakeGoogleComputeService { + override def getInstance(project: GoogleProject, zone: ZoneName, instanceName: InstanceName)(implicit + ev: Ask[IO, TraceId] + ): IO[Option[Instance]] = { + val inst = Instance + .newBuilder() + .addNetworkInterfaces( + NetworkInterface + .newBuilder() + .addAccessConfigs(AccessConfig.newBuilder().setNatIP("1.2.3.4").build()) + .build() + ) + .build() + IO.pure(Some(inst)) + } + } val resourceService = new FakeGoogleResourceService { override def getProjectNumber(project: GoogleProject)(implicit ev: Ask[IO, TraceId]): IO[Option[Long]] = IO(Some(1L)) @@ -903,32 +924,29 @@ class LeoPubsubMessageSubscriberSpec getDisk = getDiskOpt.get appRestore <- persistentDiskQuery.getAppDiskRestore(savedApp1.appResources.disk.get.id).transaction galaxyRestore = appRestore.map(_.asInstanceOf[GalaxyRestore]) - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getCluster.status shouldBe KubernetesClusterStatus.Running getCluster.nodepools.size shouldBe 2 - getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified getApp.app.errors shouldBe List.empty getApp.app.status shouldBe AppStatus.Running getApp.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) getApp.cluster.status shouldBe KubernetesClusterStatus.Running - getApp.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp.nodepool.status shouldBe NodepoolStatus.Unspecified + // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated getApp.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getDisk.status shouldBe DiskStatus.Ready - galaxyRestore shouldBe Some( - GalaxyRestore(PvcId(s"nfs-pvc-id1"), getApp.app.id) - ) + // Galaxy VM path does not use PVCs — no GalaxyRestore is recorded + galaxyRestore shouldBe None } implicit val gkeAlg: GKEAlgebra[IO] = makeGKEInterp(nodepoolLock, List(savedApp1.release)) @@ -1062,25 +1080,22 @@ class LeoPubsubMessageSubscriberSpec .transaction getApp1 = getAppOpt1.get getApp2 = getAppOpt2.get - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getApp1.cluster.status shouldBe KubernetesClusterStatus.Running getApp2.cluster.status shouldBe KubernetesClusterStatus.Running - getApp1.nodepool.status shouldBe NodepoolStatus.Running - getApp2.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp1.nodepool.status shouldBe NodepoolStatus.Unspecified + getApp2.nodepool.status shouldBe NodepoolStatus.Unspecified getApp1.app.errors shouldBe List() getApp1.app.status shouldBe AppStatus.Running getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) + // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated getApp1.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getApp2.app.errors shouldBe List() @@ -1298,7 +1313,9 @@ class LeoPubsubMessageSubscriberSpec it should "handle an error in delete app" in isolatedDbTest { val savedCluster1 = makeKubeCluster(1).save() val savedNodepool1 = makeNodepool(1, savedCluster1.id).save() - val savedApp1 = makeApp(1, savedNodepool1.id).save() + // Use Cromwell (GKE/Helm path) so the deleteNamespace error triggers AppStatus.Error. + // Galaxy now uses the VM path which swallows deleteInstance errors gracefully. + val savedApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell).save() val mockAckConsumer = mock[AckHandler] val assertions = for { @@ -1676,7 +1693,9 @@ class LeoPubsubMessageSubscriberSpec savedApp1.appName, None, Map.empty, - AppType.Galaxy, + // Use Cromwell so the GKE cluster creation path is taken and mockGKEService.createCluster can throw. + // Galaxy skips cluster creation (VM path), so the mock would have no effect. + AppType.Cromwell, savedApp1.appResources.namespace, None, Some(tr), @@ -1821,26 +1840,24 @@ class LeoPubsubMessageSubscriberSpec getApp = getAppOpt.get getDiskOpt <- persistentDiskQuery.getById(savedApp1.appResources.disk.get.id).transaction getDisk = getDiskOpt.get - ipRange = Config.vpcConfig.subnetworkRegionIpRangeMap - .getOrElse(RegionName("us-central1"), throw new Exception(s"Unsupported Region us-central1")) } yield { getCluster.status shouldBe KubernetesClusterStatus.Running getCluster.nodepools.size shouldBe 2 - getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getCluster.nodepools.filter(_.isDefault).head.status shouldBe NodepoolStatus.Unspecified getApp.app.errors shouldBe List() getApp.app.status shouldBe AppStatus.Running getApp.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) getApp.cluster.status shouldBe KubernetesClusterStatus.Running - getApp.nodepool.status shouldBe NodepoolStatus.Running + // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified + getApp.nodepool.status shouldBe NodepoolStatus.Unspecified + // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated getApp.cluster.asyncFields shouldBe Some( KubernetesClusterAsyncFields(IP("1.2.3.4"), - IP("0.0.0.0"), - NetworkFields(Config.vpcConfig.networkName, - Config.vpcConfig.subnetworkName, - ipRange - ) + IP(""), + NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) ) getDisk.status shouldBe DiskStatus.Ready @@ -2024,7 +2041,7 @@ class LeoPubsubMessageSubscriberSpec MockAppDescriptorDAO, lock, resourceService, - FakeGoogleComputeService + galaxyComputeService ) def makeLeoSubscriber( diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala index c6ad02ccc5..5b6a8b2190 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/util/VPCInterpreterSpec.scala @@ -87,7 +87,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { SetUpProjectFirewallsParams(project, vpcConfig.networkName, RegionName("us-central1"), Map.empty) ) .unsafeRunSync() - computeService.firewallMap.size shouldBe 4 + computeService.firewallMap.size shouldBe 5 vpcConfig.firewallsToAdd.foreach { fwConfig => val fw = computeService.firewallMap.get(FirewallRuleName(s"${fwConfig.namePrefix}-us-central1")) fw shouldBe defined @@ -155,6 +155,12 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { ) val test = new VPCInterpreter(Config.vpcInterpreterConfig, stubResourceService(Map.empty), computeService) + val expectedHttpFirewallRules = FirewallRuleConfig( + "leonardo-allow-http", + None, + allSupportedRegions.map(r => r -> List(IpRange("0.0.0.0/0"))).toMap, + List(Allowed("tcp", Some("80"))) + ) test .firewallRulesToAdd( Map( @@ -162,7 +168,7 @@ class VPCInterpreterSpec extends AnyFlatSpecLike with LeonardoTestSuite { "leonardo-allow-https-firewall-name" -> "leonardo-ssl" ) ) - .toSet shouldBe Set(expectedSshFirewallRules, expectedIapFirewallRules) + .toSet shouldBe Set(expectedHttpFirewallRules, expectedSshFirewallRules, expectedIapFirewallRules) } private def stubResourceService(labels: Map[String, String]): FakeGoogleResourceService = From 8bf243289e3a44e77877a1ca925cabf52d206981 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Wed, 8 Apr 2026 10:30:46 -0400 Subject: [PATCH 05/17] fix the last two unit tests --- .../leonardo/util/GKEInterpreter.scala | 54 +++++++++++-------- .../LeoPubsubMessageSubscriberSpec.scala | 7 +-- 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index b0f52276c1..75dadd0df1 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -401,34 +401,44 @@ class GKEInterpreter[F[_]]( app = dbApp.app dbCluster = dbApp.cluster googleProject = params.googleProject + // Idempotency: if the app is already Running, skip creation to avoid double-recording usage + _ <- + if (app.status == AppStatus.Running) + logger.info(ctx.loggingCtx)( + s"App ${app.appName.value} is already Running, skipping creation (idempotent)" + ) + else + for { + diskOpt <- appQuery.getDiskId(app.id).transaction + diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId)) - diskOpt <- appQuery.getDiskId(app.id).transaction - diskId <- F.fromOption(diskOpt, DiskNotFoundForAppException(app.id, ctx.traceId)) - - _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.") + _ <- logger.info(ctx.loggingCtx)(s"Begin App(${app.appName.value}) Creation.") - nfsDisk <- F.fromOption( - dbApp.app.appResources.disk, - AppCreationException(s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}") - ) + nfsDisk <- F.fromOption( + dbApp.app.appResources.disk, + AppCreationException( + s"NFS disk not found in DB for app ${app.appName.value} | trace id: ${ctx.traceId}" + ) + ) - // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path. - _ <- app.appType match { - case AppType.Galaxy => - installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >> - persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void + // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path. + _ <- app.appType match { + case AppType.Galaxy => + installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >> + persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void - case _ => - createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx) - } + case _ => + createAndPollAppViaHelm(params, dbApp, app, dbCluster, nfsDisk, diskId, googleProject, ctx) + } - _ <- logger.info(ctx.loggingCtx)( - s"Finished app creation for app ${app.appName.value}" - ) + _ <- logger.info(ctx.loggingCtx)( + s"Finished app creation for app ${app.appName.value}" + ) - readyTime <- F.realTimeInstant - _ <- appUsageQuery.recordStart(params.appId, readyTime) - _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction + readyTime <- F.realTimeInstant + _ <- appUsageQuery.recordStart(params.appId, readyTime) + _ <- appQuery.updateStatus(params.appId, AppStatus.Running).transaction + } yield () } yield () // GKE/Helm path for non-Galaxy app types (Cromwell, Allowed, Custom). diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala index d7f351488d..dd542cbd3a 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala @@ -1449,7 +1449,7 @@ class LeoPubsubMessageSubscriberSpec val savedNodepool1 = makeNodepool(1, savedCluster1.id).save() val disk = makePersistentDisk(None).save().unsafeRunSync()(cats.effect.unsafe.IORuntime.global) - val makeApp1 = makeApp(1, savedNodepool1.id) + val makeApp1 = makeApp(1, savedNodepool1.id, appType = AppType.Cromwell) val savedApp1 = makeApp1 .copy(appResources = makeApp1.appResources.copy( @@ -1540,7 +1540,7 @@ class LeoPubsubMessageSubscriberSpec savedApp1.appName, Some(disk.id), Map.empty, - AppType.Galaxy, + AppType.Cromwell, savedApp1.appResources.namespace, None, Some(tr), @@ -1889,7 +1889,8 @@ class LeoPubsubMessageSubscriberSpec false, Some(GcsBucketName("fc-bucket")) ) - asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 10), queue) + // maxConcurrentTasks=1 ensures tasks run sequentially so the idempotency check fires for the 2nd task + asyncTaskProcessor = AsyncTaskProcessor(AsyncTaskProcessor.Config(10, 1), queue) // send message twice _ <- leoSubscriber.handleCreateAppMessage(msg) _ <- leoSubscriber.handleCreateAppMessage(msg) From 8196bea7cdf1b666a9d816ada9997caba0d3ec17 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Fri, 10 Apr 2026 11:17:28 -0400 Subject: [PATCH 06/17] handle Batch SA creation and fix restore Galaxy --- http/src/main/resources/reference.conf | 1 - .../workbench/leonardo/config/Config.scala | 1 - .../leonardo/config/KubernetesAppConfig.scala | 1 - .../monitor/LeoPubsubMessageSubscriber.scala | 11 +- .../workbench/leonardo/util/GKEAlgebra.scala | 3 +- .../leonardo/util/GKEInterpreter.scala | 107 +++++++++++++----- 6 files changed, 89 insertions(+), 35 deletions(-) diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf index 7a9c841302..64b9b732fe 100644 --- a/http/src/main/resources/reference.conf +++ b/http/src/main/resources/reference.conf @@ -442,7 +442,6 @@ galaxyVm { # Suffix appended to the NFS disk name to derive the postgres disk name. # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix). postgresDiskNameSuffix = ${gke.galaxyDisk.postgresDiskNameSuffix} - gcpBatchServiceAccountEmail = "" gitRepo = "https://github.com/galaxyproject/galaxy-k8s-boot.git" gitBranch = "master" } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala index 42c7a53737..08eb73020d 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/Config.scala @@ -138,7 +138,6 @@ object Config { config.as[DiskSize]("bootDiskSizeGb"), config.as[DiskSize]("postgresDiskSizeGb"), config.as[String]("postgresDiskNameSuffix"), - config.as[String]("gcpBatchServiceAccountEmail"), config.as[String]("gitRepo"), config.as[String]("gitBranch") ) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala index 53e2562db6..bd6712279f 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/config/KubernetesAppConfig.scala @@ -100,7 +100,6 @@ final case class GalaxyVmConfig( bootDiskSizeGb: DiskSize, postgresDiskSizeGb: DiskSize, postgresDiskNameSuffix: String, - gcpBatchServiceAccountEmail: String, gitRepo: String, gitBranch: String ) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala index 97a6c984bd..fb71a5603d 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriber.scala @@ -980,9 +980,14 @@ class LeoPubsubMessageSubscriber[F[_]]( ) .void - // create second Galaxy disk asynchronously + // Restore mode: disk already exists (no new disk to create) for a Galaxy app. + // In this case we skip creating new disks and pass restore=true so the VM + // runs the Ansible restore playbook instead of a fresh install. + restore = msg.appType == AppType.Galaxy && msg.createDisk.isEmpty + + // create second Galaxy disk asynchronously (only for fresh installs) createSecondDiskOp = - if (msg.appType == AppType.Galaxy && disk.isDefined) { + if (msg.appType == AppType.Galaxy && disk.isDefined && !restore) { val d = disk.get // it's safe to do `.get` here because we've verified for { res <- createGalaxyPostgresDiskOnlyInGoogle(msg.project, ZoneName("us-central1-a"), msg.appName, d.name) @@ -1012,7 +1017,7 @@ class LeoPubsubMessageSubscriber[F[_]]( // create and monitor app _ <- getGkeAlgFromRegistry() .createAndPollApp( - CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount) + CreateAppParams(msg.appId, msg.project, msg.appName, msg.machineType, msg.bucketNameToMount, restore) ) .onError { case e => cleanUpAfterCreateAppError(msg.appId, msg.appName, msg.project, msg.createDisk, e) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala index 22108113e1..9a81c53064 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEAlgebra.scala @@ -125,7 +125,8 @@ final case class CreateAppParams(appId: AppId, googleProject: GoogleProject, appName: AppName, appMachineType: Option[AppMachineType], - bucketNameToMount: Option[GcsBucketName] + bucketNameToMount: Option[GcsBucketName], + restore: Boolean = false ) final case class DeleteClusterParams(clusterId: KubernetesClusterLeoId, googleProject: GoogleProject) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 75dadd0df1..4bb7017ccd 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -8,8 +8,10 @@ import cats.syntax.all._ import com.google.auth.oauth2.GoogleCredentials import com.google.cloud.compute.v1.{ AccessConfig, + Allowed, AttachedDisk, AttachedDiskInitializeParams, + Firewall, Instance, Items, Metadata, @@ -37,6 +39,7 @@ import org.broadinstitute.dsde.workbench.google2.{ streamFUntilDone, streamUntilDoneOrTimeout, tracedRetryF, + FirewallRuleName, GoogleComputeService, GoogleDiskService, GoogleResourceService, @@ -59,7 +62,12 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{ } import org.broadinstitute.dsde.workbench.leonardo.model.LeoException import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._ -import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject} +import org.broadinstitute.dsde.workbench.model.google.{ + GcsBucketName, + GoogleProject, + ServiceAccountDisplayName, + ServiceAccountName +} import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail} import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics @@ -424,7 +432,7 @@ class GKEInterpreter[F[_]]( // Galaxy uses a VM-based deployment; all other app types use the GKE/Helm path. _ <- app.appType match { case AppType.Galaxy => - installGalaxyVm(dbCluster, app, nfsDisk, googleProject) >> + installGalaxyVm(dbCluster, app, nfsDisk, googleProject, params.restore) >> persistentDiskQuery.updateLastUsedBy(diskId, app.id).transaction.void case _ => @@ -1038,7 +1046,8 @@ class GKEInterpreter[F[_]]( dbCluster: KubernetesCluster, app: App, nfsDisk: PersistentDisk, - googleProject: GoogleProject + googleProject: GoogleProject, + restore: Boolean )(implicit ev: Ask[F, AppContext]): F[Unit] = for { ctx <- ev.ask @@ -1073,16 +1082,23 @@ class GKEInterpreter[F[_]]( pvSizeGi = math.max(1, nfsDisk.size.gb - 11) pvSize = s"${pvSizeGi}Gi" - // GCP Batch SA: prefer value from customEnvironmentVariables, fall back to config default - gcpBatchSa = app.customEnvironmentVariables.getOrElse( - "gcp_batch_service_account_email", - config.galaxyVmConfig.gcpBatchServiceAccountEmail - ) - - // restore_galaxy flag - restoreGalaxy = app.customEnvironmentVariables.getOrElse("restore_galaxy", "false") + // Get or create the galaxy-batch-runner SA in the user's project. + gcpBatchSa <- F + .fromFuture( + F.delay( + googleIamDAO.getOrCreateServiceAccount( + googleProject, + ServiceAccountName("galaxy-batch-runner"), + ServiceAccountDisplayName("Galaxy Batch Runner"), + executionContext + ) + ) + ) + .map(sa => sa.email.value) - // Disks + // Disks — data and postgres disks are always pre-existing by the time this method runs + // (created by createDiskOp / createSecondDiskOp, or retained from a previous app). + // Use setSource to attach existing disks; only the boot disk is created fresh. bootDisk = AttachedDisk .newBuilder() .setBoot(true) @@ -1103,14 +1119,8 @@ class GKEInterpreter[F[_]]( .setBoot(false) .setDeviceName("galaxy-data") .setAutoDelete(false) - .setInitializeParams( - AttachedDiskInitializeParams - .newBuilder() - .setDiskName(nfsDisk.name.value) - .setDiskSizeGb(nfsDisk.size.gb) - .setDiskType(nfsDisk.diskType.googleString(googleProject, zoneParam)) - .putAllLabels(Map("leonardo" -> "true").asJava) - .build() + .setSource( + s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${nfsDisk.name.value}" ) .build() @@ -1120,13 +1130,8 @@ class GKEInterpreter[F[_]]( .setBoot(false) .setDeviceName("galaxy-postgres-data") .setAutoDelete(false) - .setInitializeParams( - AttachedDiskInitializeParams - .newBuilder() - .setDiskName(postgresDiskName.value) - .setDiskSizeGb(config.galaxyVmConfig.postgresDiskSizeGb.gb) - .putAllLabels(Map("leonardo" -> "true").asJava) - .build() + .setSource( + s"projects/${googleProject.value}/zones/${zoneParam.value}/disks/${postgresDiskName.value}" ) .build() @@ -1168,7 +1173,7 @@ class GKEInterpreter[F[_]]( .addItems(Items.newBuilder().setKey("google-logging-enabled").setValue("true").build()) .addItems(Items.newBuilder().setKey("gcp_batch_service_account_email").setValue(gcpBatchSa).build()) .addItems(Items.newBuilder().setKey("persistent-volume-size").setValue(pvSize).build()) - .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restoreGalaxy).build()) + .addItems(Items.newBuilder().setKey("restore_galaxy").setValue(restore.toString).build()) .addItems(Items.newBuilder().setKey("git-repo").setValue(config.galaxyVmConfig.gitRepo).build()) .addItems(Items.newBuilder().setKey("git-branch").setValue(config.galaxyVmConfig.gitBranch).build()) .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build()) @@ -1226,6 +1231,52 @@ class GKEInterpreter[F[_]]( s"skipping serviceAccountUser binding — must be configured externally" ) + // Grant the Batch SA the project-level roles it needs to run jobs and attach a service account to Batch VMs. + // See https://github.com/galaxyproject/galaxy-k8s-boot?tab=readme-ov-file#prerequisites + _ <- { + val call = F.fromFuture( + F.delay( + googleIamDAO + .addRoles(googleProject, + WorkbenchEmail(gcpBatchSa), + IamMemberTypes.ServiceAccount, + Set("roles/batch.jobsEditor", "roles/iam.serviceAccountUser") + ) + .void + ) + ) + val retryConfig = RetryPredicates.retryConfigWithPredicates(when409) + tracedRetryF(retryConfig)( + call, + s"googleIamDAO.addRoles(batch.jobsEditor, iam.serviceAccountUser) for Batch SA $gcpBatchSa in project ${googleProject.value}" + ).compile.lastOrError + } + + // Create an NFS firewall rule so GCP Batch VMs can reach the Galaxy VM's NFS server. + // Idempotent: skipped if the rule already exists. + nfsFwName = FirewallRuleName("leonardo-galaxy-allow-nfs-for-batch") + nfsFwExists <- computeService.getFirewallRule(googleProject, nfsFwName) + _ <- + if (nfsFwExists.isEmpty) { + val nfsFirewall = Firewall + .newBuilder() + .setName(nfsFwName.value) + .setNetwork(s"projects/${googleProject.value}/global/networks/${network.value}") + .addSourceRanges("10.0.0.0/8") + .addTargetTags(config.vpcNetworkTag.value) + .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("2049").build()) + .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("2049").build()) + .addAllowed(Allowed.newBuilder().setIPProtocol("tcp").addPorts("111").build()) + .addAllowed(Allowed.newBuilder().setIPProtocol("udp").addPorts("111").build()) + .build() + computeService + .addFirewallRule(googleProject, nfsFirewall) + .flatMap(op => F.blocking(op.get()).void) + } else + logger.info(ctx.loggingCtx)( + s"NFS firewall rule ${nfsFwName.value} already exists, skipping creation" + ) + _ <- logger.info(ctx.loggingCtx)( s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP" ) From 95fa6b84b049f85c25200441bef6d3fe04287bf9 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Fri, 10 Apr 2026 11:42:46 -0400 Subject: [PATCH 07/17] fix compiler issues --- .../workbench/leonardo/util/GKEInterpreter.scala | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 4bb7017ccd..719b57386b 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -62,12 +62,7 @@ import org.broadinstitute.dsde.workbench.leonardo.util.BuildHelmChartValues.{ } import org.broadinstitute.dsde.workbench.leonardo.model.LeoException import org.broadinstitute.dsde.workbench.leonardo.util.GKEAlgebra._ -import org.broadinstitute.dsde.workbench.model.google.{ - GcsBucketName, - GoogleProject, - ServiceAccountDisplayName, - ServiceAccountName -} +import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GoogleProject, ServiceAccountDisplayName} import org.broadinstitute.dsde.workbench.model.google.iam.IamMemberTypes import org.broadinstitute.dsde.workbench.model.{IP, TraceId, WorkbenchEmail} import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics @@ -1088,9 +1083,8 @@ class GKEInterpreter[F[_]]( F.delay( googleIamDAO.getOrCreateServiceAccount( googleProject, - ServiceAccountName("galaxy-batch-runner"), - ServiceAccountDisplayName("Galaxy Batch Runner"), - executionContext + org.broadinstitute.dsde.workbench.model.google.ServiceAccountName("galaxy-batch-runner"), + ServiceAccountDisplayName("Galaxy Batch Runner") ) ) ) From 86a49665ee73f4f9e904a08de9ed94b4885811d1 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Mon, 13 Apr 2026 08:55:22 -0400 Subject: [PATCH 08/17] Fix Leo proxy for Galaxy VM: use HTTP on port 80 via VM internal IP --- .../workbench/leonardo/dao/HttpAppDAO.scala | 2 +- .../workbench/leonardo/dao/ProxyDAO.scala | 5 +- .../leonardo/dns/KubernetesDnsCache.scala | 8 +- .../leonardo/http/service/ProxyService.scala | 86 +++++++++++++------ .../leonardo/util/GKEInterpreter.scala | 31 ++++--- .../LeoPubsubMessageSubscriberSpec.scala | 15 ++-- 6 files changed, 97 insertions(+), 50 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala index 84a04b45da..e846792f11 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala @@ -23,7 +23,7 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client: traceId: TraceId ): F[Boolean] = Proxy.getAppTargetHost[F](kubernetesDnsCache, CloudContext.Gcp(googleProject), appName) flatMap { - case HostReady(targetHost, _, _) => + case HostReady(targetHost, _, _, _) => val serviceUrl = serviceName match { case ServiceName("welder-service") => s"https://${targetHost.address}/proxy/google/v1/apps/${googleProject.value}/${appName.value}/${serviceName.value}/status/" diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala index 6f6251a07d..ee1b07ddb2 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/ProxyDAO.scala @@ -11,7 +11,10 @@ object HostStatus { final case object HostNotFound extends HostStatus final case object HostNotReady extends HostStatus final case object HostPaused extends HostStatus - final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider) extends HostStatus { + // useHttp = true means the proxy connects to the backend via plain HTTP (port 80) instead of + // HTTPS (proxyConfig.proxyPort). Used for Galaxy VM apps whose nginx serves HTTP only. + final case class HostReady(hostname: Host, path: String, cloudProvider: CloudProvider, useHttp: Boolean = false) + extends HostStatus { def toUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/proxy/${path}") def toNotebooksUri: Uri = Uri.unsafeFromString(s"https://${hostname.address()}/notebooks/${path}") diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala index 6f7ff4783c..4d286c7f4b 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala @@ -7,6 +7,7 @@ import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus import org.broadinstitute.dsde.workbench.leonardo.dao.HostStatus.{HostNotFound, HostNotReady, HostReady} import org.broadinstitute.dsde.workbench.leonardo.db.{DbReference, KubernetesServiceDbQueries} import org.broadinstitute.dsde.workbench.leonardo.http.{kubernetesProxyHost, GetAppResult} +import org.broadinstitute.dsde.workbench.leonardo.AppType.Galaxy import org.broadinstitute.dsde.workbench.leonardo.{AppName, CloudContext, CloudProvider} import org.broadinstitute.dsde.workbench.model.IP import org.broadinstitute.dsde.workbench.openTelemetry.OpenTelemetryMetrics @@ -48,8 +49,13 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics]( case None => F.pure[HostStatus](HostNotReady) case Some(ip) => val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain) + // Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP, + // and we map the fake hostname to the VM's internal IP (stored in loadBalancerIp). + val isGalaxyVm = appResult.app.appType == Galaxy hostToIpMapping .getAndUpdate(_ + (h.address -> ip)) - .as[HostStatus](HostReady(h, "", CloudProvider.Gcp)) // TODO: update this once we start support AKS + .as[HostStatus]( + HostReady(h, "", CloudProvider.Gcp, useHttp = isGalaxyVm) + ) // TODO: update this once we start support AKS } } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala index 5d989946bd..0ba9da08e0 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala @@ -256,7 +256,7 @@ class ProxyService( hostStatus <- getRuntimeTargetHost(cloudContext, runtimeName) _ <- hostStatus match { - case HostReady(_, _, _) => + case HostReady(_, _, _, _) => dateAccessUpdaterQueue.offer( UpdateDateAccessedMessage(UpdateTarget.Runtime(runtimeName), cloudContext, ctx.now) ) @@ -350,7 +350,7 @@ class ProxyService( } else IO.unit hostStatus <- getAppTargetHost(cloudContext, appName) _ <- hostStatus match { - case HostReady(_, _, _) => + case HostReady(_, _, _, _) => dateAccessUpdaterQueue.offer(UpdateDateAccessedMessage(UpdateTarget.App(appName), cloudContext, ctx.now)) case _ => IO.unit } @@ -376,16 +376,16 @@ class ProxyService( for { ctx <- ev.ask[AppContext] res <- hostContext.status match { - case HostReady(targetHost, _, _) => + case HostReady(targetHost, _, _, useHttp) => // If this is a WebSocket request (e.g. wss://leo:8080/...) then akka-http injects a // virtual UpgradeToWebSocket header which contains facilities to handle the WebSocket data. // The presence of this header distinguishes WebSocket from http requests. val res = for { response <- request.attribute(AttributeKeys.webSocketUpgrade) match { case Some(upgrade) => - IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade))) + IO.fromFuture(IO(handleWebSocketRequest(targetHost, request, upgrade, useHttp))) case None => - IO.fromFuture(IO(handleHttpRequest(targetHost, request))) + IO.fromFuture(IO(handleHttpRequest(targetHost, request, useHttp))) } r <- if (response.status.isFailure()) @@ -418,9 +418,10 @@ class ProxyService( } } yield res - private def handleHttpRequest(targetHost: Host, request: HttpRequest): Future[HttpResponse] = { - logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}") - + private def handleHttpRequest(targetHost: Host, + request: HttpRequest, + useHttp: Boolean = false + ): Future[HttpResponse] = { // A note on akka-http philosophy: // The Akka HTTP server is implemented on top of Streams and makes heavy use of it. Requests come // in as a Source[HttpRequest] and responses are returned as a Sink[HttpResponse]. The transformation @@ -429,12 +430,24 @@ class ProxyService( // Initializes a Flow representing a prospective connection to the given endpoint. The connection // is not made until a Source and Sink are plugged into the Flow (i.e. it is materialized). - val flow = Http() - .connectionTo(targetHost.address) - .toPort(proxyConfig.proxyPort) - .withCustomHttpsConnectionContext(httpsConnectionContext) - .withClientConnectionSettings(clientConnectionSettings) - .https() + // Galaxy VM apps use plain HTTP on port 80; all other backends use HTTPS on proxyConfig.proxyPort. + val flow = + if (useHttp) { + logger.debug(s"Opening http connection to ${targetHost.address}:80") + Http() + .connectionTo(targetHost.address) + .toPort(80) + .withClientConnectionSettings(clientConnectionSettings) + .http() + } else { + logger.debug(s"Opening https connection to ${targetHost.address}:${proxyConfig.proxyPort}") + Http() + .connectionTo(targetHost.address) + .toPort(proxyConfig.proxyPort) + .withCustomHttpsConnectionContext(httpsConnectionContext) + .withClientConnectionSettings(clientConnectionSettings) + .https() + } // Now build a Source[Request] out of the original HttpRequest. We need to make some modifications // to the original request in order for the proxy to work: @@ -492,7 +505,8 @@ class ProxyService( private def handleWebSocketRequest(targetHost: Host, request: HttpRequest, - upgrade: WebSocketUpgrade + upgrade: WebSocketUpgrade, + useHttp: Boolean = false ): Future[HttpResponse] = { logger.info(s"Opening websocket connection to ${targetHost.address}") @@ -509,19 +523,35 @@ class ProxyService( // Make a single WebSocketRequest to the notebook server, passing in our Flow. This returns a Future[WebSocketUpgradeResponse]. // Keep our publisher/subscriber (e.g. sink/source) for use later. These are returned because we specified Keep.both above. // Note that we are rewriting the paths for any requests that are routed to /proxy/*/*/jupyter/ - val (responseFuture, (publisher, subscriber)) = Http().singleWebSocketRequest( - WebSocketRequest( - request.uri.copy(path = rewriteJupyterPath(request.uri.path), - authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort), - scheme = "wss" - ), - extraHeaders = filterHeaders(request.headers), - upgrade.requestedProtocols.headOption - ), - flow, - httpsConnectionContext, - settings = clientConnectionSettings - ) + // Galaxy VM apps use ws:// on port 80; all other backends use wss:// on proxyConfig.proxyPort. + val (responseFuture, (publisher, subscriber)) = + if (useHttp) + Http().singleWebSocketRequest( + WebSocketRequest( + request.uri.copy(path = rewriteJupyterPath(request.uri.path), + authority = request.uri.authority.copy(host = targetHost, port = 80), + scheme = "ws" + ), + extraHeaders = filterHeaders(request.headers), + upgrade.requestedProtocols.headOption + ), + flow, + settings = clientConnectionSettings + ) + else + Http().singleWebSocketRequest( + WebSocketRequest( + request.uri.copy(path = rewriteJupyterPath(request.uri.path), + authority = request.uri.authority.copy(host = targetHost, port = proxyConfig.proxyPort), + scheme = "wss" + ), + extraHeaders = filterHeaders(request.headers), + upgrade.requestedProtocols.headOption + ), + flow, + httpsConnectionContext, + settings = clientConnectionSettings + ) // If we got a valid WebSocketUpgradeResponse, call handleMessages with our publisher/subscriber, which are // already materialized from the HttpRequest. diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 719b57386b..0e9992a673 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1275,41 +1275,45 @@ class GKEInterpreter[F[_]]( s"Galaxy VM instance ${instanceName.value} submitted for project ${googleProject.value}; polling for external IP" ) - // Poll until the instance has an external IP, then store it as the cluster's load balancer IP - // so that KubernetesDnsCache can resolve the proxy host. - externalIpOpt <- streamFUntilDone( + // Poll until the instance has both internal and external IPs assigned. + // We store the internal IP as the proxy backend (KubernetesDnsCache loadBalancerIp) so that + // the Leo proxy connects to the VM over the internal VPC network using plain HTTP. + // The external IP is only used for the readiness health check (TCP to port 80). + ipPairOpt <- streamFUntilDone( computeService.getInstance(googleProject, zoneParam, instanceName).map { instanceOpt => instanceOpt.flatMap { inst => import scala.jdk.CollectionConverters._ for { iface <- Option(inst.getNetworkInterfacesList).flatMap(_.asScala.headOption) + internalIp = IP(iface.getNetworkIP) cfg <- Option(iface.getAccessConfigsList).flatMap(_.asScala.headOption) natIp <- Option(cfg.getNatIP).filter(_.nonEmpty) - } yield IP(natIp) + } yield (internalIp, IP(natIp)) } }, config.monitorConfig.createApp.maxAttempts, config.monitorConfig.createApp.interval ).compile.lastOrError - externalIp <- F.fromOption( - externalIpOpt, + (internalIp, externalIp) <- F.fromOption( + ipPairOpt, AppCreationException( - s"Galaxy VM ${instanceName.value} did not obtain an external IP after ${config.monitorConfig.createApp.interruptAfter}", + s"Galaxy VM ${instanceName.value} did not obtain an IP after ${config.monitorConfig.createApp.interruptAfter}", traceId = Some(ctx.traceId) ) ) _ <- logger.info(ctx.loggingCtx)( - s"Galaxy VM ${instanceName.value} has external IP ${externalIp.asString}; storing in cluster async fields" + s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing internal IP in cluster async fields" ) - // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache + // Store the VM's internal IP as the cluster load balancer IP consumed by KubernetesDnsCache. + // The proxy will connect to this IP via HTTP on port 80 (Galaxy VM serves HTTP, not HTTPS). _ <- kubernetesClusterQuery .updateAsyncFields( dbCluster.id, KubernetesClusterAsyncFields( - externalIp, + internalIp, IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1318,10 +1322,13 @@ class GKEInterpreter[F[_]]( _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction _ <- logger.info(ctx.loggingCtx)( - s"Polling Galaxy readiness for app ${app.appName.value} at ${externalIp.asString}:80" + s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${internalIp.asString}:80)" ) - // Wait for Galaxy's nginx ingress to respond on port 80 + // Wait for Galaxy's nginx to respond. + // Uses isProxyAvailable which routes through the Leo proxy. The proxy now connects to the + // VM's internal IP on port 80 (plain HTTP) because KubernetesDnsCache sets useHttp=true for + // Galaxy apps and stores the internal IP as loadBalancerIp. isDone <- streamFUntilDone( appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId), config.monitorConfig.createApp.maxAttempts, diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala index dd542cbd3a..3b368a36a8 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala @@ -101,7 +101,7 @@ class LeoPubsubMessageSubscriberSpec } val iamDAO = new MockGoogleIamDAO - // Returns a GCE instance with external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests. + // Returns a GCE instance with internal IP "10.0.0.1" and external IP "1.2.3.4" so Galaxy VM IP polling succeeds in tests. val galaxyComputeService: GoogleComputeService[IO] = new FakeGoogleComputeService { override def getInstance(project: GoogleProject, zone: ZoneName, instanceName: InstanceName)(implicit ev: Ask[IO, TraceId] @@ -111,6 +111,7 @@ class LeoPubsubMessageSubscriberSpec .addNetworkInterfaces( NetworkInterface .newBuilder() + .setNetworkIP("10.0.0.1") .addAccessConfigs(AccessConfig.newBuilder().setNatIP("1.2.3.4").build()) .build() ) @@ -937,9 +938,9 @@ class LeoPubsubMessageSubscriberSpec getApp.cluster.status shouldBe KubernetesClusterStatus.Running // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified getApp.nodepool.status shouldBe NodepoolStatus.Unspecified - // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated + // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated getApp.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("1.2.3.4"), + KubernetesClusterAsyncFields(IP("10.0.0.1"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1091,9 +1092,9 @@ class LeoPubsubMessageSubscriberSpec getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) - // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated + // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated getApp1.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("1.2.3.4"), + KubernetesClusterAsyncFields(IP("10.0.0.1"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1853,9 +1854,9 @@ class LeoPubsubMessageSubscriberSpec getApp.cluster.status shouldBe KubernetesClusterStatus.Running // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified getApp.nodepool.status shouldBe NodepoolStatus.Unspecified - // Galaxy VM path stores external IP as loadBalancerIp; network fields are not populated + // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated getApp.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("1.2.3.4"), + KubernetesClusterAsyncFields(IP("10.0.0.1"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) From bea6e9d556a4afa0778fca21568f9228266fafbf Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Mon, 13 Apr 2026 09:39:49 -0400 Subject: [PATCH 09/17] Fix Galaxy VM readiness check: use direct HTTP to VM internal IP --- .../dsde/workbench/leonardo/dao/HttpAppDAO.scala | 16 ++++++++++++++++ .../workbench/leonardo/util/GKEInterpreter.scala | 8 ++++---- .../dsde/workbench/leonardo/dao/MockAppDAO.scala | 5 ++++- 3 files changed, 24 insertions(+), 5 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala index e846792f11..7fc78868ef 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dao/HttpAppDAO.scala @@ -44,6 +44,20 @@ class HttpAppDAO[F[_]: Async](kubernetesDnsCache: KubernetesDnsCache[F], client: ) case _ => Async[F].pure(false) // Update once we support Relay for apps } + + def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] = + client + .status( + Request[F]( + method = Method.GET, + uri = Uri.unsafeFromString(s"http://${ip.asString}:${port}/"), + headers = Headers(Header.Raw(CIString("X-Request-ID"), traceId.asString)) + ) + ) + .map(status => status.code < 500) + .handleErrorWith(t => + logger.error(Map("traceId" -> traceId.asString), t)("Fail to check if VM is reachable").as(false) + ) } trait AppDAO[F[_]] { @@ -52,4 +66,6 @@ trait AppDAO[F[_]] { serviceName: ServiceName, traceId: TraceId ): F[Boolean] + + def isVmReachable(ip: org.broadinstitute.dsde.workbench.model.IP, port: Int, traceId: TraceId): F[Boolean] } diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 0e9992a673..40fc8c437e 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1326,11 +1326,11 @@ class GKEInterpreter[F[_]]( ) // Wait for Galaxy's nginx to respond. - // Uses isProxyAvailable which routes through the Leo proxy. The proxy now connects to the - // VM's internal IP on port 80 (plain HTTP) because KubernetesDnsCache sets useHttp=true for - // Galaxy apps and stores the internal IP as loadBalancerIp. + // Uses a direct HTTP check to the VM's internal IP on port 80, bypassing the Leo proxy + // hostname chain (which would require the proxy wildcard DNS to be reachable from within + // the Leo pod — unreliable in BEE environments due to hairpin NAT). isDone <- streamFUntilDone( - appDao.isProxyAvailable(googleProject, app.appName, ServiceName("galaxy"), ctx.traceId), + appDao.isVmReachable(internalIp, 80, ctx.traceId), config.monitorConfig.createApp.maxAttempts, config.monitorConfig.createApp.interval ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala index 8ee7c2c2cb..647f3cc77f 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/dao/MockAppDAO.scala @@ -3,7 +3,7 @@ package org.broadinstitute.dsde.workbench.leonardo.dao import cats.effect.IO import org.broadinstitute.dsde.workbench.google2.KubernetesSerializableName.ServiceName import org.broadinstitute.dsde.workbench.leonardo.AppName -import org.broadinstitute.dsde.workbench.model.TraceId +import org.broadinstitute.dsde.workbench.model.{IP, TraceId} import org.broadinstitute.dsde.workbench.model.google.GoogleProject class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] { @@ -13,5 +13,8 @@ class MockAppDAO(isUp: Boolean = true) extends AppDAO[IO] { traceId: TraceId ): IO[Boolean] = IO.pure(isUp) + + override def isVmReachable(ip: IP, port: Int, traceId: TraceId): IO[Boolean] = + IO.pure(isUp) } object MockAppDAO extends MockAppDAO(isUp = true) From c28fcca00d9aa1f51120f0228a0edb1e819c2302 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Mon, 13 Apr 2026 12:46:58 -0400 Subject: [PATCH 10/17] Fix Galaxy VM proxy: use external IP instead of internal IP --- .../leonardo/dns/KubernetesDnsCache.scala | 3 ++- .../workbench/leonardo/util/GKEInterpreter.scala | 16 ++++++++++------ .../monitor/LeoPubsubMessageSubscriberSpec.scala | 12 ++++++------ 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala index 4d286c7f4b..a0bc7196fb 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/dns/KubernetesDnsCache.scala @@ -50,7 +50,8 @@ final class KubernetesDnsCache[F[_]: Logger: OpenTelemetryMetrics]( case Some(ip) => val h = kubernetesProxyHost(appResult.cluster, proxyConfig.proxyDomain) // Galaxy VM apps serve HTTP on port 80. The proxy should connect via plain HTTP, - // and we map the fake hostname to the VM's internal IP (stored in loadBalancerIp). + // and we map the fake hostname to the VM's external IP (stored in loadBalancerIp). + // External IP is used because Leo's pod is in a different VPC from the user's workspace project. val isGalaxyVm = appResult.app.appType == Galaxy hostToIpMapping .getAndUpdate(_ + (h.address -> ip)) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 40fc8c437e..322931d48b 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1304,16 +1304,20 @@ class GKEInterpreter[F[_]]( ) _ <- logger.info(ctx.loggingCtx)( - s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing internal IP in cluster async fields" + s"Galaxy VM ${instanceName.value} has internal IP ${internalIp.asString} / external IP ${externalIp.asString}; storing external IP in cluster async fields for proxy access" ) - // Store the VM's internal IP as the cluster load balancer IP consumed by KubernetesDnsCache. + // Store the VM's external IP as the cluster load balancer IP consumed by KubernetesDnsCache. // The proxy will connect to this IP via HTTP on port 80 (Galaxy VM serves HTTP, not HTTPS). + // We use the external IP because Leo's GKE cluster is in Leo's GCP project while the Galaxy VM + // is in the user's workspace project — the two VPCs are not peered, so the internal IP is + // not routable from Leo's pod. The leonardo-allow-http firewall rule (0.0.0.0/0 → port 80, + // targeting VMs with the "leonardo" tag) allows Leo to reach the VM on its external IP. _ <- kubernetesClusterQuery .updateAsyncFields( dbCluster.id, KubernetesClusterAsyncFields( - internalIp, + externalIp, IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1322,15 +1326,15 @@ class GKEInterpreter[F[_]]( _ <- kubernetesClusterQuery.updateStatus(dbCluster.id, KubernetesClusterStatus.Running).transaction _ <- logger.info(ctx.loggingCtx)( - s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${internalIp.asString}:80)" + s"Polling Galaxy readiness for app ${app.appName.value} via proxy (backend: ${externalIp.asString}:80)" ) // Wait for Galaxy's nginx to respond. - // Uses a direct HTTP check to the VM's internal IP on port 80, bypassing the Leo proxy + // Uses a direct HTTP check to the VM's external IP on port 80, bypassing the Leo proxy // hostname chain (which would require the proxy wildcard DNS to be reachable from within // the Leo pod — unreliable in BEE environments due to hairpin NAT). isDone <- streamFUntilDone( - appDao.isVmReachable(internalIp, 80, ctx.traceId), + appDao.isVmReachable(externalIp, 80, ctx.traceId), config.monitorConfig.createApp.maxAttempts, config.monitorConfig.createApp.interval ).interruptAfter(config.monitorConfig.createApp.interruptAfter).compile.lastOrError diff --git a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala index 3b368a36a8..3da67b1856 100644 --- a/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala +++ b/http/src/test/scala/org/broadinstitute/dsde/workbench/leonardo/monitor/LeoPubsubMessageSubscriberSpec.scala @@ -938,9 +938,9 @@ class LeoPubsubMessageSubscriberSpec getApp.cluster.status shouldBe KubernetesClusterStatus.Running // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified getApp.nodepool.status shouldBe NodepoolStatus.Unspecified - // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("10.0.0.1"), + KubernetesClusterAsyncFields(IP("1.2.3.4"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1092,9 +1092,9 @@ class LeoPubsubMessageSubscriberSpec getApp1.app.appResources.kubernetesServiceAccountName shouldBe Some( ServiceAccountName("gxy-ksa") ) - // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp1.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("10.0.0.1"), + KubernetesClusterAsyncFields(IP("1.2.3.4"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) @@ -1854,9 +1854,9 @@ class LeoPubsubMessageSubscriberSpec getApp.cluster.status shouldBe KubernetesClusterStatus.Running // Galaxy VM path does not create/poll GKE nodepools — their status stays Unspecified getApp.nodepool.status shouldBe NodepoolStatus.Unspecified - // Galaxy VM path stores internal IP as loadBalancerIp (proxy routes to VM via HTTP on port 80); network fields are not populated + // Galaxy VM path stores external IP as loadBalancerIp (proxy uses external IP because Leo VPC ≠ user VPC); network fields are not populated getApp.cluster.asyncFields shouldBe Some( - KubernetesClusterAsyncFields(IP("10.0.0.1"), + KubernetesClusterAsyncFields(IP("1.2.3.4"), IP(""), NetworkFields(NetworkName(""), SubnetworkName(""), IpRange("")) ) From d8683d8b402f1352a2269a20467100ffcc031a07 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Mon, 13 Apr 2026 16:20:20 -0400 Subject: [PATCH 11/17] Fix Galaxy VM bootstrap: use galaxy-k8s-boot image --- http/src/main/resources/init-resources/galaxy-user-data.sh | 2 +- http/src/main/resources/reference.conf | 6 ++++-- .../dsde/workbench/leonardo/util/GKEInterpreter.scala | 4 +++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh index 3b7fbb640b..fb3306e6c9 100644 --- a/http/src/main/resources/init-resources/galaxy-user-data.sh +++ b/http/src/main/resources/init-resources/galaxy-user-data.sh @@ -1,6 +1,6 @@ +#cloud-config # Sourced from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh # When updating this file, sync it manually from that repository and verify the changes. -#cloud-config write_files: - path: /usr/local/bin/galaxy_bootstrap.sh permissions: '0755' diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf index 64b9b732fe..d10e1fbdb2 100644 --- a/http/src/main/resources/reference.conf +++ b/http/src/main/resources/reference.conf @@ -434,8 +434,10 @@ groups { } galaxyVm { - # Debian 12 image — galaxy-k8s-boot bootstrap requires a standard Debian VM, not COS - sourceImage = "projects/debian-cloud/global/images/family/debian-12" + # Pre-built galaxy-k8s-boot image with all dependencies (Ansible, RKE2, etc.) pre-installed. + # Has cloud-init, which processes the "user-data" metadata key on first boot. + # Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team) + sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25" machineType = "n1-highmem-8" bootDiskSizeGb = 50 postgresDiskSizeGb = 10 diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 322931d48b..73b8751fa5 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1060,7 +1060,9 @@ class GKEInterpreter[F[_]]( ) // Load cloud-config content bundled from galaxy-k8s-boot bin/user_data.sh. - // Intentionally not fetched at runtime to avoid unexpected production changes. + // Passed as the "user-data" metadata key, processed by cloud-init on first boot only. + // The galaxy-k8s-boot custom image has cloud-init pre-installed; "#cloud-config" must be + // the first line for cloud-init to recognise the file format. // To update, sync manually from https://github.com/galaxyproject/galaxy-k8s-boot/blob/dev/bin/user_data.sh userDataContent = scala.io.Source .fromResource("init-resources/galaxy-user-data.sh") From 66135f30d70a5e9296671c9027cc1ab138677e79 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 14 Apr 2026 10:15:25 -0400 Subject: [PATCH 12/17] increase boot disk size to Galaxy VM disk image reqs --- http/src/main/resources/reference.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/http/src/main/resources/reference.conf b/http/src/main/resources/reference.conf index d10e1fbdb2..d4ea61e98c 100644 --- a/http/src/main/resources/reference.conf +++ b/http/src/main/resources/reference.conf @@ -439,7 +439,7 @@ galaxyVm { # Source: https://github.com/galaxyproject/galaxy-k8s-boot (image built by the Galaxy team) sourceImage = "projects/anvil-and-terra-development/global/images/galaxy-k8s-boot-v2026-02-25" machineType = "n1-highmem-8" - bootDiskSizeGb = 50 + bootDiskSizeGb = 100 postgresDiskSizeGb = 10 # Suffix appended to the NFS disk name to derive the postgres disk name. # Must match the value used in LeoPubsubMessageSubscriber (galaxyDisk.postgresDiskNameSuffix). From 98ec831f553233d85a086b4e8afec9805eb65e6f Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Tue, 14 Apr 2026 16:34:14 -0400 Subject: [PATCH 13/17] mark cluster as deleted and wait for iam to propagate --- .../dsde/workbench/leonardo/util/GKEInterpreter.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 73b8751fa5..8e110f2cb1 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -711,6 +711,10 @@ class GKEInterpreter[F[_]]( s"Failed to delete Galaxy VM ${instanceName.value}: ${e.getMessage}. Continuing with app deletion." ) } + // Mark the cluster DB record as deleted so future app creation in this project is not blocked. + // For Galaxy, the "cluster" is a pure DB abstraction (no real GKE cluster); it must be + // cleaned up here because no separate cluster-deletion pubsub message is sent. + _ <- kubernetesClusterQuery.markAsDeleted(dbCluster.id, ctx.now).transaction } yield () case _ => @@ -1196,7 +1200,7 @@ class GKEInterpreter[F[_]]( .void ) ) - val retryConfig = RetryPredicates.retryConfigWithPredicates(when409) + val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist) tracedRetryF(retryConfig)( call, s"googleIamDAO.addRoles(batch.jobsEditor) for pet SA ${app.googleServiceAccount.value} in project ${googleProject.value}" @@ -1241,7 +1245,7 @@ class GKEInterpreter[F[_]]( .void ) ) - val retryConfig = RetryPredicates.retryConfigWithPredicates(when409) + val retryConfig = RetryPredicates.retryConfigWithPredicates(when409, whenGroupDoesNotExist) tracedRetryF(retryConfig)( call, s"googleIamDAO.addRoles(batch.jobsEditor, iam.serviceAccountUser) for Batch SA $gcpBatchSa in project ${googleProject.value}" From b3502847e00a2a615780395579b9da59ecfd9ab3 Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Wed, 15 Apr 2026 09:59:20 -0400 Subject: [PATCH 14/17] fix Gb to Gib conversion --- .../dsde/workbench/leonardo/util/GKEInterpreter.scala | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index 8e110f2cb1..c823e255ae 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1079,8 +1079,13 @@ class GKEInterpreter[F[_]]( config.galaxyDiskConfig.postgresDiskNameSuffix ) - // Persistent-volume-size passed to ansible-pull (leave ~11 GiB for filesystem overhead on 150 GB disk) - pvSizeGi = math.max(1, nfsDisk.size.gb - 11) + // Persistent-volume-size passed to ansible-pull. + // nfsDisk.size.gb is in decimal GB; convert to binary GiB before subtracting filesystem overhead. + // Example: a 500 GB disk = (500 * 10^9) / 2^30 ≈ 465 GiB, so we request 465 - 11 = 454 GiB. + // Using raw gb - 11 (treating GB as GiB) overestimates by ~23 GiB on a 500 GB disk and + // causes the NFS provisioner to fail with "insufficient available space". + diskSizeGiB = (nfsDisk.size.gb.toLong * 1000L * 1000L * 1000L) / (1024L * 1024L * 1024L) + pvSizeGi = math.max(1, diskSizeGiB - 11) pvSize = s"${pvSizeGi}Gi" // Get or create the galaxy-batch-runner SA in the user's project. From 25c66051748539e0fc0d71e2cfc120727fb26cbb Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Wed, 15 Apr 2026 12:15:45 -0400 Subject: [PATCH 15/17] strip leo proxy prefix --- .../leonardo/http/service/ProxyService.scala | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala index 0ba9da08e0..841e75f7d0 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala @@ -355,7 +355,20 @@ class ProxyService( case _ => IO.unit } hostContext = HostContext(hostStatus, s"${cloudContext.asString}/${appName.value}/${serviceName.value}") - r <- proxyInternal(hostContext, request) + // Galaxy VM apps serve at /, but Leo forwards the full proxy path + // (e.g. /proxy/google/v1/apps/{project}/{app}/galaxy). Strip the Leo + // prefix so Galaxy's nginx sees requests rooted at /. + adjustedRequest = hostStatus match { + case HostReady(_, _, _, useHttp) if useHttp => + val prefix = s"/proxy/google/v1/apps/${cloudContext.asString}/${appName.value}/${serviceName.value}" + val stripped = request.uri.path.toString.stripPrefix(prefix) match { + case "" | "/" => "/" + case p => p + } + request.withUri(request.uri.withPath(Uri.Path(stripped))) + case _ => request + } + r <- proxyInternal(hostContext, adjustedRequest) appType <- appQuery.getAppType(appName).transaction result = if (r.status.isSuccess()) "success" else "failure" _ <- metrics.incrementCounter( From 2739d88abdd466dc3d547cabfd6dfbf1f748973d Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Wed, 15 Apr 2026 14:28:17 -0400 Subject: [PATCH 16/17] pass the galaxy_url_prefix --- .../resources/init-resources/galaxy-user-data.sh | 12 ++++++++++++ .../workbench/leonardo/util/GKEInterpreter.scala | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh index fb3306e6c9..c21013f438 100644 --- a/http/src/main/resources/init-resources/galaxy-user-data.sh +++ b/http/src/main/resources/init-resources/galaxy-user-data.sh @@ -88,6 +88,13 @@ write_files: GCP_BATCH_SERVICE_ACCOUNT_EMAIL=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/gcp_batch_service_account_email" -H "Metadata-Flavor: Google" 2>/dev/null || echo "galaxy-batch-runner@anvil-and-terra-development.iam.gserviceaccount.com") echo "[$(date)] - GCP Batch service account email: ${GCP_BATCH_SERVICE_ACCOUNT_EMAIL}" + # Leo proxy path prefix for this Galaxy app (e.g. /proxy/google/v1/apps/{project}/{appName}/galaxy). + # Passed to ansible as galaxy_url_prefix so Galaxy generates correct absolute links (JS/CSS/API) + # that include the full proxy path. Without this Galaxy emits links rooted at / which the + # browser resolves against Leo's host and gets 404s → blank page. + GALAXY_URL_PREFIX=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/galaxy-url-prefix" -H "Metadata-Flavor: Google" 2>/dev/null || echo "") + echo "[$(date)] - Galaxy URL prefix: ${GALAXY_URL_PREFIX}" + GIT_REPO=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-repo" -H "Metadata-Flavor: Google" 2>/dev/null || echo "https://github.com/galaxyproject/galaxy-k8s-boot.git") GIT_BRANCH=$(curl -s -f "http://metadata.google.internal/computeMetadata/v1/instance/attributes/git-branch" -H "Metadata-Flavor: Google" 2>/dev/null || echo "master") @@ -108,6 +115,11 @@ write_files: echo "[$(date)] - Galaxy Restore Mode: Disabled" fi + if [ -n "$GALAXY_URL_PREFIX" ]; then + PULL_ARGS+=(--extra-vars "galaxy_url_prefix=${GALAXY_URL_PREFIX}") + echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}" + fi + PULL_ARGS+=(playbook.yml) mkdir -p /tmp/ansible-inventory diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala index c823e255ae..1901685b67 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/util/GKEInterpreter.scala @@ -1184,6 +1184,20 @@ class GKEInterpreter[F[_]]( .addItems(Items.newBuilder().setKey("gcp-region").setValue(regionParam.value).build()) .addItems(Items.newBuilder().setKey("gcp-network").setValue(network.value).build()) .addItems(Items.newBuilder().setKey("gcp-subnet").setValue(subnetwork.value).build()) + // Galaxy needs to know its public URL prefix so it generates correct absolute links + // (JS, CSS, API calls) that include the full Leo proxy path. + // galaxy-k8s-boot's ansible playbook must accept galaxy_url_prefix and set it in + // Galaxy's helm values (galaxy.yml). Without this, Galaxy generates links rooted at / + // which the browser resolves against Leo's host and gets 404s → blank page. + .addItems( + Items + .newBuilder() + .setKey("galaxy-url-prefix") + .setValue( + s"/proxy/google/v1/apps/${googleProject.value}/${app.appName.value}/galaxy" + ) + .build() + ) .build() ) .putAllLabels(Map("leonardo" -> "true").asJava) From a98b3534a90a437f6760f26e542ee67a9ead97ca Mon Sep 17 00:00:00 2001 From: Liz Baldo Date: Fri, 8 May 2026 13:21:29 -0400 Subject: [PATCH 17/17] use new proxy path env variable --- .../init-resources/galaxy-user-data.sh | 2 +- .../leonardo/http/service/ProxyService.scala | 19 +++++-------------- 2 files changed, 6 insertions(+), 15 deletions(-) diff --git a/http/src/main/resources/init-resources/galaxy-user-data.sh b/http/src/main/resources/init-resources/galaxy-user-data.sh index c21013f438..1aa42cb6c8 100644 --- a/http/src/main/resources/init-resources/galaxy-user-data.sh +++ b/http/src/main/resources/init-resources/galaxy-user-data.sh @@ -116,7 +116,7 @@ write_files: fi if [ -n "$GALAXY_URL_PREFIX" ]; then - PULL_ARGS+=(--extra-vars "galaxy_url_prefix=${GALAXY_URL_PREFIX}") + PULL_ARGS+=(--extra-vars "galaxy_prefix=${GALAXY_URL_PREFIX}") echo "[$(date)] - Galaxy URL prefix passed to ansible: ${GALAXY_URL_PREFIX}" fi diff --git a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala index 841e75f7d0..5f407ad424 100644 --- a/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala +++ b/http/src/main/scala/org/broadinstitute/dsde/workbench/leonardo/http/service/ProxyService.scala @@ -355,20 +355,11 @@ class ProxyService( case _ => IO.unit } hostContext = HostContext(hostStatus, s"${cloudContext.asString}/${appName.value}/${serviceName.value}") - // Galaxy VM apps serve at /, but Leo forwards the full proxy path - // (e.g. /proxy/google/v1/apps/{project}/{app}/galaxy). Strip the Leo - // prefix so Galaxy's nginx sees requests rooted at /. - adjustedRequest = hostStatus match { - case HostReady(_, _, _, useHttp) if useHttp => - val prefix = s"/proxy/google/v1/apps/${cloudContext.asString}/${appName.value}/${serviceName.value}" - val stripped = request.uri.path.toString.stripPrefix(prefix) match { - case "" | "/" => "/" - case p => p - } - request.withUri(request.uri.withPath(Uri.Path(stripped))) - case _ => request - } - r <- proxyInternal(hostContext, adjustedRequest) + // Galaxy VM apps: forward the full Leo proxy path to the VM unchanged. + // galaxy-k8s-boot configures nginx location blocks and galaxy_url_prefix + // using the ingress.path value (= the Leo proxy prefix), so the VM expects + // to receive the full path including the prefix. + r <- proxyInternal(hostContext, request) appType <- appQuery.getAppType(appName).transaction result = if (r.status.isSuccess()) "success" else "failure" _ <- metrics.incrementCounter(