From ef91dd574c26d2ac7ad396ab118601ca9293a74d Mon Sep 17 00:00:00 2001 From: skunnath Date: Thu, 11 Jun 2026 10:07:26 -0700 Subject: [PATCH 1/4] Add FallbackCandidate data class with validation and CasC/UI support Defines the data structure for a single fallback candidate (zone, machineType, region, subnetwork, template) with: - Input validation via doCheck methods in DescriptorImpl - Region auto-derivation from zone name - MAX_FALLBACK_CANDIDATES cap (10) to bound provisioner thread time - Help files for the Jenkins UI Co-authored-by: Cursor --- .../computeengine/FallbackCandidate.java | 171 ++++++++++++++++++ .../FallbackCandidate/config.jelly | 31 ++++ .../FallbackCandidate/help-machineType.html | 6 + .../FallbackCandidate/help-region.html | 6 + .../FallbackCandidate/help-subnetwork.html | 6 + .../FallbackCandidate/help-template.html | 4 + .../FallbackCandidate/help-zone.html | 5 + 7 files changed, 229 insertions(+) create mode 100644 src/main/java/com/google/jenkins/plugins/computeengine/FallbackCandidate.java create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/config.jelly create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-machineType.html create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-region.html create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-subnetwork.html create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-template.html create mode 100644 src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-zone.html diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/FallbackCandidate.java b/src/main/java/com/google/jenkins/plugins/computeengine/FallbackCandidate.java new file mode 100644 index 00000000..43a0cfd8 --- /dev/null +++ 
b/src/main/java/com/google/jenkins/plugins/computeengine/FallbackCandidate.java @@ -0,0 +1,171 @@ +/* + * Copyright 2026 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.google.jenkins.plugins.computeengine; + +import hudson.Extension; +import hudson.model.Describable; +import hudson.model.Descriptor; +import hudson.util.FormValidation; +import jenkins.model.Jenkins; +import lombok.EqualsAndHashCode; +import lombok.Getter; +import org.kohsuke.stapler.DataBoundConstructor; +import org.kohsuke.stapler.DataBoundSetter; +import org.kohsuke.stapler.QueryParameter; + +/** + * A single ordered fallback candidate for a {@link InstanceConfiguration}. + * + *

When the primary location/machine of an instance configuration cannot be provisioned because + * of a capacity-related error (for example {@code ZONE_RESOURCE_POOL_EXHAUSTED}), the plugin walks + * the configured list of fallback candidates in order, trying each one until provisioning succeeds + * or the list is exhausted. Each candidate may change the zone and machine type (and, for + * cross-region fallbacks, the subnetwork) while keeping the same logical Jenkins label. + * + *

{@code zone} is required; when an instance template is configured, {@code machineType} may be + * left blank (the template provides it). {@code region} is auto-derived from the zone when left + * blank but can be overridden for cross-region subnetwork routing. {@code subnetwork} and + * {@code template} are optional; when a field is left blank the value from the parent + * {@link InstanceConfiguration} is used. + * + *

The maximum number of fallback candidates per {@link InstanceConfiguration} is + * {@value #MAX_FALLBACK_CANDIDATES}. This bounds worst-case provisioner thread hold time. + */ +@Getter +@EqualsAndHashCode +public class FallbackCandidate implements Describable { + + /** + * Upper bound on fallback candidates per configuration. Prevents unbounded retry chains that + * could hold a provisioner thread for too long on a shared controller. + */ + public static final int MAX_FALLBACK_CANDIDATES = 10; + + private final String zone; + private final String machineType; + + private String region; + private String subnetwork; + private String template; + + @DataBoundConstructor + public FallbackCandidate(String zone, String machineType) { + this.zone = zone != null ? zone.trim() : ""; + this.machineType = machineType != null ? machineType.trim() : ""; + } + + @DataBoundSetter + public void setRegion(String region) { + this.region = region; + } + + @DataBoundSetter + public void setSubnetwork(String subnetwork) { + this.subnetwork = subnetwork; + } + + @DataBoundSetter + public void setTemplate(String template) { + this.template = template; + } + + /** + * Derives the region from the zone name. GCE zone names encode the region as the prefix + * before the last hyphen-letter segment (e.g. "us-west1-a" → "us-west1"). + * + * @return the derived region, or empty string if the zone is blank or has no recognizable suffix. + */ + public String getEffectiveRegion() { + if (region != null && !region.isEmpty()) { + return region; + } + return deriveRegionFromZone(zone); + } + + static String deriveRegionFromZone(String zoneName) { + if (zoneName == null || zoneName.isEmpty()) { + return ""; + } + String name = zoneName.contains("/") ? 
zoneName.substring(zoneName.lastIndexOf('/') + 1) : zoneName; + int lastDash = name.lastIndexOf('-'); + if (lastDash > 0 && lastDash < name.length() - 1) { + return name.substring(0, lastDash); + } + return ""; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append("zone=").append(zone).append(" machineType=").append(machineType); + if (region != null && !region.isEmpty()) { + sb.append(" region=").append(region); + } + if (subnetwork != null && !subnetwork.isEmpty()) { + sb.append(" subnetwork=").append(subnetwork); + } + if (template != null && !template.isEmpty()) { + sb.append(" template=").append(template); + } + return sb.toString(); + } + + @Override + @SuppressWarnings("unchecked") + public Descriptor getDescriptor() { + return Jenkins.get().getDescriptorOrDie(getClass()); + } + + @Extension + public static final class DescriptorImpl extends Descriptor { + @Override + public String getDisplayName() { + return "Fallback Candidate"; + } + + public FormValidation doCheckZone(@QueryParameter String value) { + if (value == null || value.trim().isEmpty()) { + return FormValidation.error("Zone is required for a fallback candidate."); + } + return FormValidation.ok(); + } + + public FormValidation doCheckMachineType( + @QueryParameter String value, @QueryParameter("template") String template) { + if ((value == null || value.trim().isEmpty()) + && (template == null || template.trim().isEmpty())) { + return FormValidation.warning( + "Machine type is required unless an instance template is specified."); + } + return FormValidation.ok(); + } + + public FormValidation doCheckRegion( + @QueryParameter String value, @QueryParameter("zone") String zone) { + if (value != null && !value.trim().isEmpty()) { + return FormValidation.ok(); + } + if (zone != null && !zone.trim().isEmpty()) { + String derived = deriveRegionFromZone(zone); + if (!derived.isEmpty()) { + return FormValidation.ok("Region will be auto-derived as: " + 
derived); + } + } + return FormValidation.ok(); + } + } +} diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/config.jelly b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/config.jelly new file mode 100644 index 00000000..4e9c819f --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/config.jelly @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-machineType.html b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-machineType.html new file mode 100644 index 00000000..bedbdbc9 --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-machineType.html @@ -0,0 +1,6 @@ +

+ The GCE machine type to request for this fallback candidate (for example + n4d-standard-32 or c4d-standard-32). Required unless an instance + template is specified. Short names are resolved against this candidate's zone, so a fallback + can change both zone and machine family/type. +
diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-region.html b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-region.html new file mode 100644 index 00000000..b9a03d30 --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-region.html @@ -0,0 +1,6 @@ +
+ Optional. The GCP region for this candidate (for example us-central1). When blank, it is + derived from the zone name. Informational for cross-region fallbacks; the zone determines the + region for machine-type resolution. When falling back to a different region with a non-default + network, also set the Subnetwork field. +
diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-subnetwork.html b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-subnetwork.html new file mode 100644 index 00000000..4dffe4d5 --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-subnetwork.html @@ -0,0 +1,6 @@ +
+ Optional. Overrides the subnetwork for this candidate. Required only when this candidate is in a + different region than the primary configuration and you are not using the default + network, because subnetworks are region-scoped. When blank, the primary configuration's + subnetwork is used. +
diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-template.html b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-template.html new file mode 100644 index 00000000..3ac66d72 --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-template.html @@ -0,0 +1,4 @@ +
+ Optional. A GCE instance template to use for this candidate instead of the primary + configuration's template. When blank, the primary configuration's template (if any) is used. +
diff --git a/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-zone.html b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-zone.html new file mode 100644 index 00000000..41bf772e --- /dev/null +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/FallbackCandidate/help-zone.html @@ -0,0 +1,5 @@ +
+ The GCP zone to try for this fallback candidate (for example us-west1-b). Required. + When the primary configuration fails with a capacity-related error, the plugin attempts this + zone with the candidate's machine type. +
From f4c8e8c06a72c09b8ac9cb71aa4b2158bed3dc2a Mon Sep 17 00:00:00 2001 From: skunnath Date: Thu, 11 Jun 2026 10:07:33 -0700 Subject: [PATCH 2/4] Add ProvisioningErrorClassifier for retryable vs non-retryable errors Classifies GCE operation errors into capacity-related (retryable via fallback) and non-retryable (abort immediately) buckets. - Conservative unknown-error policy: unrecognized codes are non-retryable - Covers ZONE_RESOURCE_POOL_EXHAUSTED, STOCKOUT, RESOURCE_NOT_READY - Explicitly excludes QUOTA errors from retry - Documents GCP error code reference URLs for maintainability Co-authored-by: Cursor --- .../ProvisioningErrorClassifier.java | 141 ++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100644 src/main/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifier.java diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifier.java b/src/main/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifier.java new file mode 100644 index 00000000..710d516d --- /dev/null +++ b/src/main/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifier.java @@ -0,0 +1,141 @@ +/* + * Copyright 2026 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.jenkins.plugins.computeengine; + +import com.google.api.services.compute.model.Operation; +import java.util.Locale; + +/** + * Classifies GCE provisioning failures into "retryable" (capacity-related) and "non-retryable" + * (configuration/permission/quota) buckets, so that ordered fallback is only attempted for + * transient capacity shortages. + * + *

Retryable conditions are zone/machine-family/machine-type capacity exhaustion, for example + * {@code ZONE_RESOURCE_POOL_EXHAUSTED} or a stockout message such as + * "does not have enough resources available to fulfill the request". + * + *

Explicitly non-retryable conditions include quota failures, authentication/permission + * failures, invalid configuration, invalid network configuration and invalid image/instance + * template — retrying these in another zone would not help. + * + *

Unknown error policy

+ *

Unrecognized error codes are treated as non-retryable (fallback is NOT attempted). + * Rationale: a conservative policy avoids masking real configuration bugs behind fallback + * retries. If GCP introduces new capacity-related error codes, they should be added to + * {@link #RETRYABLE_MARKERS} after verifying the GCP documentation. + * + *

GCP error code reference

+ *

The authoritative list of GCE operation error codes is maintained at: + * + * Troubleshooting VM creation and + * + * Zone Operations REST reference. New capacity-related codes should be added to + * {@link #RETRYABLE_MARKERS} as they appear. + * + * @see + * GCP: Troubleshooting VM creation + */ +public final class ProvisioningErrorClassifier { + + /** + * Substrings (matched case-insensitively against an error code or message) that indicate a + * capacity shortage where trying a different zone or machine type may succeed. + * + *

Sources: + *

    + *
  • {@code ZONE_RESOURCE_POOL_EXHAUSTED} — zone lacks capacity for the requested resource
  • + *
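  • {@code ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS} — same with extended detail payload (covered by the {@code ZONE_RESOURCE_POOL_EXHAUSTED} substring match, so it has no separate marker)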
  • + *
  • {@code RESOURCE_POOL_EXHAUSTED} — regional capacity shortage (less common)
  • + *
  • {@code STOCKOUT} — newer capacity error code observed in some regions
  • + *
  • {@code RESOURCE_NOT_READY} — transient resource readiness failure (e.g. host maintenance)
  • + *
  • "does not have enough resources" — free-text message accompanying capacity errors
  • + *
+ * + *

Maintenance: if GCP introduces additional capacity-related codes, add them here after + * confirming in the GCP documentation that retrying in another zone is appropriate. + */ + private static final String[] RETRYABLE_MARKERS = { + "ZONE_RESOURCE_POOL_EXHAUSTED", + "RESOURCE_POOL_EXHAUSTED", + "STOCKOUT", + "RESOURCE_NOT_READY", + "DOES NOT HAVE ENOUGH RESOURCES", + "DOES_NOT_HAVE_ENOUGH_RESOURCES" + }; + + private ProvisioningErrorClassifier() {} + + /** + * Classifies a GCE error code or message as retryable (capacity-related) or not. + * + *

Policy for unrecognized codes: returns {@code false} (non-retryable). This is a + * deliberate conservative choice — unknown errors abort immediately rather than silently + * retrying all fallback candidates, which could mask real configuration bugs. If you observe + * a new GCP capacity error code that should trigger fallback, add it to + * {@link #RETRYABLE_MARKERS}. + * + * @param codeOrMessage a GCE operation error code (e.g. {@code ZONE_RESOURCE_POOL_EXHAUSTED}) or + * a free-text error message. + * @return {@code true} if the failure looks like a transient capacity shortage that fallback + * should retry in another zone/machine type; {@code false} for {@code null}, unknown, or + * clearly non-capacity errors (including quota failures). + */ + public static boolean isRetryable(String codeOrMessage) { + if (codeOrMessage == null) { + return false; + } + String normalized = codeOrMessage.toUpperCase(Locale.ROOT); + if (normalized.contains("QUOTA")) { + return false; + } + for (String marker : RETRYABLE_MARKERS) { + if (normalized.contains(marker)) { + return true; + } + } + return false; + } + + /** @return {@code true} if the operation error carries at least one error entry. */ + public static boolean hasErrors(Operation.Error error) { + return error != null && error.getErrors() != null && !error.getErrors().isEmpty(); + } + + /** @return the first error code on the operation error, or {@code null} if none. */ + public static String firstErrorCode(Operation.Error error) { + if (!hasErrors(error)) { + return null; + } + return error.getErrors().get(0).getCode(); + } + + /** @return a short {@code CODE: message} summary of the first error, for logging. 
*/ + public static String errorSummary(Operation.Error error) { + if (!hasErrors(error)) { + return "unknown error"; + } + Operation.Error.Errors first = error.getErrors().get(0); + String code = first.getCode(); + String message = first.getMessage(); + StringBuilder sb = new StringBuilder(); + sb.append(code != null ? code : "ERROR"); + if (message != null && !message.isEmpty()) { + sb.append(": ").append(message); + } + return sb.toString(); + } +} From 813738fc864e57dd5a7ecee8d51c95b2e84b7fdd Mon Sep 17 00:00:00 2001 From: skunnath Date: Thu, 11 Jun 2026 10:07:42 -0700 Subject: [PATCH 3/4] Implement ordered fallback provisioning in InstanceConfiguration Refactors provision() to iterate through primary + fallback candidates: - Waits for GCE operation completion when fallback is configured - Retries next candidate on retryable capacity errors - Aborts immediately on non-retryable errors (quota, permission, config) - Best-effort cleanup of failed VMs before trying next candidate - Caps fallback list at MAX_FALLBACK_CANDIDATES; skips blank-zone entries - Re-zones disk type self-links for cross-zone fallback - Null-safe shortName() helper for logging template-based configs - UI section for configuring fallback candidates Co-authored-by: Cursor --- .../computeengine/InstanceConfiguration.java | 297 ++++++++++++++++-- .../InstanceConfiguration/config.jelly | 10 + 2 files changed, 289 insertions(+), 18 deletions(-) diff --git a/src/main/java/com/google/jenkins/plugins/computeengine/InstanceConfiguration.java b/src/main/java/com/google/jenkins/plugins/computeengine/InstanceConfiguration.java index 96c90239..b476310a 100644 --- a/src/main/java/com/google/jenkins/plugins/computeengine/InstanceConfiguration.java +++ b/src/main/java/com/google/jenkins/plugins/computeengine/InstanceConfiguration.java @@ -38,6 +38,7 @@ import com.google.api.services.compute.model.Zone; import com.google.cloud.graphite.platforms.plugin.client.ClientFactory; import 
com.google.cloud.graphite.platforms.plugin.client.ComputeClient; +import com.google.cloud.graphite.platforms.plugin.client.ComputeClient.OperationException; import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Strings; import com.google.jenkins.plugins.computeengine.client.ClientUtil; @@ -185,6 +186,17 @@ public class InstanceConfiguration implements Describable private MinimumNumberOfInstancesTimeRangeConfig minimumNumberOfInstancesTimeRangeConfig; private String template; + + /** + * Ordered list of fallback candidates tried, in order, when the primary + * location/machine of this configuration cannot be provisioned due to a capacity-related + * error. {@code null}/empty means no fallback (behaviour unchanged from before this feature). + * + * @see FallbackCandidate + */ + @Nullable + private List fallbackCandidates; + // Optional not possible due to serialization requirement @Nullable private WindowsConfiguration windowsConfiguration; @@ -383,12 +395,169 @@ public void appendLabel(String key, String value) { } public ComputeEngineInstance provision() throws IOException { + List attempts = buildProvisioningAttempts(); + boolean fallbackEnabled = attempts.size() > 1; + + if (fallbackEnabled) { + log.info(String.format( + "Provisioning label=%s using %d ordered candidate(s) for config [%s]", + labels, attempts.size(), description)); + } + + RetryableProvisioningException lastRetryable = null; + for (int i = 0; i < attempts.size(); i++) { + ProvisioningAttempt attempt = attempts.get(i); + if (fallbackEnabled) { + log.info(String.format( + "Candidate %d/%d: zone=%s machineType=%s%s (config [%s], label=%s)", + i + 1, + attempts.size(), + shortName(attempt.zone()), + shortName(attempt.machineType()), + notNullOrEmpty(attempt.template()) ? 
" template=" + shortName(attempt.template()) : "", + description, + labels)); + } + try { + ComputeEngineInstance node = provisionAttempt(attempt, fallbackEnabled); + if (node != null && fallbackEnabled) { + log.info(String.format( + "Provisioned instance=%s zone=%s machineType=%s (candidate %d/%d, label=%s)", + node.getNodeName(), + shortName(attempt.zone()), + shortName(attempt.machineType()), + i + 1, + attempts.size(), + labels)); + } + return node; + } catch (RetryableProvisioningException rpe) { + lastRetryable = rpe; + boolean moreToTry = i < attempts.size() - 1; + log.warning(String.format( + "Candidate %d/%d failed: %s. Retryable=true.%s", + i + 1, + attempts.size(), + rpe.getMessage(), + moreToTry ? " Retrying next fallback candidate." : " No more fallback candidates.")); + } + } + + log.warning(String.format( + "All %d candidate(s) failed to provision for label=%s (config [%s]).", + attempts.size(), labels, description)); + if (lastRetryable != null) { + throw new IOException( + "Exhausted all fallback candidates for config [" + description + "]; last error: " + + lastRetryable.getMessage(), + lastRetryable); + } + return null; + } + + /** + * Builds the ordered list of provisioning attempts: the primary configuration first, followed + * by each configured {@link FallbackCandidate} (with blank fields inherited from this + * configuration). Candidates with a blank zone are skipped with a warning. The list is capped + * at {@link FallbackCandidate#MAX_FALLBACK_CANDIDATES} entries to bound provisioner thread time. 
+ */ + private List buildProvisioningAttempts() { + List attempts = new ArrayList<>(); + attempts.add(new ProvisioningAttempt(zone, machineType, template, null)); + if (fallbackCandidates != null) { + int limit = Math.min(fallbackCandidates.size(), FallbackCandidate.MAX_FALLBACK_CANDIDATES); + if (fallbackCandidates.size() > FallbackCandidate.MAX_FALLBACK_CANDIDATES) { + log.warning(String.format( + "Config [%s] has %d fallback candidates but maximum is %d; extras will be ignored.", + description, fallbackCandidates.size(), FallbackCandidate.MAX_FALLBACK_CANDIDATES)); + } + for (int i = 0; i < limit; i++) { + FallbackCandidate candidate = fallbackCandidates.get(i); + if (candidate.getZone() == null || candidate.getZone().trim().isEmpty()) { + log.warning(String.format( + "Skipping fallback candidate %d for config [%s]: zone is blank.", i + 1, description)); + continue; + } + attempts.add(new ProvisioningAttempt( + firstNonEmpty(candidate.getZone(), zone), + firstNonEmpty(candidate.getMachineType(), machineType), + firstNonEmpty(candidate.getTemplate(), template), + Strings.emptyToNull(candidate.getSubnetwork()))); + } + } + return attempts; + } + + /** + * Provisions a single attempt. + * + *

When {@code fallbackEnabled} is {@code false} (no fallback candidates configured) this + * behaves exactly as before: it submits the insert and returns immediately, leaving the + * launcher to wait on the operation. When {@code fallbackEnabled} is {@code true} it waits for + * the insert operation to reach {@code DONE} so that capacity errors — which GCE reports + * asynchronously on the zone operation — surface here and can drive the fallback decision. + * + * @throws RetryableProvisioningException if this attempt failed with a capacity-related error + * and the caller should try the next fallback candidate. + * @throws IOException for non-retryable failures (the whole provision should abort). + */ + private ComputeEngineInstance provisionAttempt(ProvisioningAttempt attempt, boolean fallbackEnabled) + throws IOException, RetryableProvisioningException { + Instance instance = instance(attempt.zone(), attempt.machineType(), attempt.template(), attempt.subnetwork()); + + Operation operation; try { - Instance instance = instance(); // TODO: JENKINS-55285 - Operation operation = - cloud.getClient().insertInstance(cloud.getProjectId(), Optional.ofNullable(template), instance); - log.info("Sent insert request for instance configuration [" + description + "]"); + operation = cloud.getClient() + .insertInstance(cloud.getProjectId(), Optional.ofNullable(attempt.template()), instance); + } catch (IOException ioe) { + // Some capacity shortages are rejected synchronously. Only treat clearly capacity-related + // synchronous failures as retryable; everything else aborts. 
+ if (fallbackEnabled && ProvisioningErrorClassifier.isRetryable(ioe.getMessage())) { + bestEffortTerminate(instance.getName(), nameFromSelfLink(attempt.zone())); + throw new RetryableProvisioningException("insert request rejected: " + ioe.getMessage(), ioe); + } + throw ioe; + } + log.info("Sent insert request for instance [" + instance.getName() + "] (config [" + description + "])"); + + if (fallbackEnabled) { + Operation.Error opError = null; + try { + Operation completed = cloud.getClient() + .waitForOperationCompletion(cloud.getProjectId(), operation, getLaunchTimeoutMillis()); + opError = completed.getError(); + } catch (OperationException oe) { + opError = oe.getError(); + } catch (InterruptedException ie) { + Thread.currentThread().interrupt(); + throw new IOException( + "Interrupted while waiting for insert operation of instance [" + instance.getName() + "]", ie); + } + + if (ProvisioningErrorClassifier.hasErrors(opError)) { + String summary = ProvisioningErrorClassifier.errorSummary(opError); + boolean retryable = + ProvisioningErrorClassifier.isRetryable(ProvisioningErrorClassifier.firstErrorCode(opError)); + log.warning(String.format( + "Insert operation failed for instance [%s]: %s. Retryable=%b.", + instance.getName(), summary, retryable)); + // Best-effort cleanup so a partially-created VM does not linger as an orphan. + bestEffortTerminate(instance.getName(), nameFromSelfLink(attempt.zone())); + if (retryable) { + throw new RetryableProvisioningException(summary); + } + throw new IOException( + "Non-retryable provisioning error for instance [" + instance.getName() + "]: " + summary); + } + } + + return buildNode(instance, operation); + } + + /** Builds the {@link ComputeEngineInstance} node for a successfully-inserted instance. 
*/ + private ComputeEngineInstance buildNode(Instance instance, Operation operation) throws IOException { + try { String targetRemoteFs = this.remoteFs; ComputeEngineComputerLauncher launcher; if (this.windowsConfiguration != null) { @@ -433,6 +602,63 @@ public ComputeEngineInstance provision() throws IOException { } } + /** + * Best-effort asynchronous delete of an instance that failed to provision, so that a + * partially-created VM does not linger as an orphan before the next fallback candidate is + * tried. Failures here are logged at FINE and otherwise ignored; the periodic + * {@link CleanLostNodesWork} sweep is the backstop. + */ + private void bestEffortTerminate(String instanceName, String zone) { + try { + cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, instanceName); + log.info("Requested cleanup (delete) of failed instance [" + instanceName + "] in zone " + zone); + } catch (IOException e) { + log.log( + Level.FINE, + "Best-effort cleanup of failed instance [" + instanceName + "] did not complete: " + + e.getMessage()); + } + } + + private static String firstNonEmpty(String preferred, String fallback) { + return notNullOrEmpty(preferred) ? preferred : fallback; + } + + /** + * Null-safe short name for logging. Returns the trailing segment of a GCE self-link, or + * {@code "(from template)"} when the value is blank (e.g. a template-based configuration leaves + * machineType unset). Never throws, so logging cannot break provisioning. + */ + private static String shortName(String selfLinkOrName) { + if (!notNullOrEmpty(selfLinkOrName)) { + return "(from template)"; + } + try { + return nameFromSelfLink(selfLinkOrName); + } catch (RuntimeException e) { + return selfLinkOrName; + } + } + + /** Effective zone/machine/template/subnetwork for a single provisioning attempt. 
*/ + private record ProvisioningAttempt(String zone, String machineType, String template, String subnetwork) {} + + /** + * Internal signal that a provisioning attempt failed with a capacity-related error and the + * next configured fallback candidate should be tried. + */ + private static final class RetryableProvisioningException extends Exception { + private static final long serialVersionUID = 1L; + + RetryableProvisioningException(String message) { + super(message); + } + + RetryableProvisioningException(String message, Throwable cause) { + super(message, cause); + } + } + /** Initializes transient properties */ protected Object readResolve() { labelSet = Label.parse(labels); @@ -451,10 +677,22 @@ protected Object readResolve() { } public Instance instance() throws IOException { + return instance(zone, machineType, template, null); + } + + /** + * Builds the {@link Instance} insert request for a specific attempt. The effective zone, + * machine type, instance template and subnetwork may differ from the configured values when a + * {@link FallbackCandidate} is being tried, so all zone/region-scoped resources are resolved + * against the effective values here. 
+ */ + Instance instance( + String effectiveZone, String effectiveMachineType, String effectiveTemplate, String effectiveSubnetwork) + throws IOException { Instance instance = new Instance(); instance.setName(uniqueName()); instance.setDescription(description); - instance.setZone(nameFromSelfLink(zone)); + instance.setZone(nameFromSelfLink(effectiveZone)); instance.setMetadata(newMetadata()); if (windowsConfiguration == null) { @@ -482,9 +720,9 @@ public Instance instance() throws IOException { effectiveGoogleLabels.put( CleanLostNodesWork.NODE_IN_USE_LABEL_KEY, CleanLostNodesWork.getLastRefreshLabelVal()); - if (StringUtils.isNotEmpty(template)) { - InstanceTemplate instanceTemplate = - cloud.getClient().getTemplate(nameFromSelfLink(cloud.getProjectId()), nameFromSelfLink(template)); + if (StringUtils.isNotEmpty(effectiveTemplate)) { + InstanceTemplate instanceTemplate = cloud.getClient() + .getTemplate(nameFromSelfLink(cloud.getProjectId()), nameFromSelfLink(effectiveTemplate)); /* Since we have to set the metadata to include the autogenerated SSH keypair, we need to ensure we include metadata properties which might be set in the template. 
*/ if (instanceTemplate.getProperties() != null @@ -505,12 +743,12 @@ public Instance instance() throws IOException { } else { configureStartupScript(instance); instance.setLabels(effectiveGoogleLabels); - instance.setMachineType(stripSelfLinkPrefix(machineType)); + instance.setMachineType(stripSelfLinkPrefix(effectiveMachineType)); instance.setTags(tags()); instance.setScheduling(scheduling()); - instance.setDisks(disks()); + instance.setDisks(disks(effectiveZone)); instance.setGuestAccelerators(accelerators()); - instance.setNetworkInterfaces(networkInterfaces()); + instance.setNetworkInterfaces(networkInterfaces(effectiveSubnetwork)); instance.setServiceAccounts(serviceAccounts()); // optional @@ -710,15 +948,18 @@ Scheduling scheduling() { return scheduling; } - /** Builds the list of disks for the instance: the boot disk followed by any additional - * disks parsed from the {@link #diskMapping} field. */ - private List disks() { + /** + * Builds the list of disks for the instance. The boot disk type is zone-scoped, so when a + * {@link FallbackCandidate} targets a different zone the self-link is re-pointed to the + * effective zone. + */ + private List disks(String effectiveZone) { AttachedDisk boot = new AttachedDisk(); boot.setBoot(true); boot.setAutoDelete(bootDiskAutoDelete); boot.setInitializeParams(new AttachedDiskInitializeParams() .setDiskSizeGb(bootDiskSizeGb) - .setDiskType(bootDiskType) + .setDiskType(rezoneSelfLink(bootDiskType, effectiveZone)) .setSourceImage(bootDiskSourceImageName)); List disks = new ArrayList<>(); @@ -743,6 +984,20 @@ private List disks() { return disks; } + /** + * Re-points the {@code /zones//} segment of a GCE self-link to {@code effectiveZone}. Used + * so zone-scoped resources (e.g. the boot disk type) resolve in the zone actually being tried + * during fallback. Returns the input unchanged when either argument is blank or the link has no + * zone segment. 
+ */ + private static String rezoneSelfLink(String selfLink, String effectiveZone) { + if (Strings.isNullOrEmpty(selfLink) || Strings.isNullOrEmpty(effectiveZone)) { + return selfLink; + } + String targetZone = nameFromSelfLink(effectiveZone); + return selfLink.replaceAll("/zones/[^/]+/", "/zones/" + targetZone + "/"); + } + private List accelerators() { if (acceleratorConfiguration != null && notNullOrEmpty(acceleratorConfiguration.getGpuCount()) @@ -756,14 +1011,19 @@ && notNullOrEmpty(acceleratorConfiguration.getGpuType())) { return null; } - private List networkInterfaces() { + private List networkInterfaces(String effectiveSubnetwork) { List networkInterfaces = new ArrayList<>(); NetworkInterface networkInterface = networkInterfaceIpStackMode.getNetworkInterface(); + // A fallback candidate may override the subnetwork (required for cross-region fallback); + // otherwise use the configured network's subnetwork. + String subnetwork = + notNullOrEmpty(effectiveSubnetwork) ? effectiveSubnetwork : networkConfiguration.getSubnetwork(); + // Don't include subnetwork name if using default - if (!networkConfiguration.getSubnetwork().equals("default")) { - networkInterface.setSubnetwork(stripSelfLinkPrefix(networkConfiguration.getSubnetwork())); + if (!subnetwork.equals("default")) { + networkInterface.setSubnetwork(stripSelfLinkPrefix(subnetwork)); } networkInterfaces.add(networkInterface); @@ -1353,6 +1613,7 @@ public InstanceConfiguration build() { instanceConfiguration.setMinimumNumberOfInstancesTimeRangeConfig( this.minimumNumberOfInstancesTimeRangeConfig); instanceConfiguration.setTemplate(this.template); + instanceConfiguration.setFallbackCandidates(this.fallbackCandidates); instanceConfiguration.setCreateSnapshot(this.createSnapshot); instanceConfiguration.setDiskMapping(this.diskMapping); instanceConfiguration.setTerminateIdleDuringShutdown(this.terminateIdleDuringShutdown); diff --git 
a/src/main/resources/com/google/jenkins/plugins/computeengine/InstanceConfiguration/config.jelly b/src/main/resources/com/google/jenkins/plugins/computeengine/InstanceConfiguration/config.jelly index d4cc7d92..819a3506 100644 --- a/src/main/resources/com/google/jenkins/plugins/computeengine/InstanceConfiguration/config.jelly +++ b/src/main/resources/com/google/jenkins/plugins/computeengine/InstanceConfiguration/config.jelly @@ -155,6 +155,16 @@ + + + +

+ +
+ + + From b3e78816070fc6bdf3fb13d688b318abf38713e7 Mon Sep 17 00:00:00 2001 From: skunnath Date: Thu, 11 Jun 2026 10:07:50 -0700 Subject: [PATCH 4/4] Add unit tests for fallback provisioning and error classification - InstanceConfigurationFallbackTest: fallback ordering, non-retryable abort, all-exhausted, OperationException handling, legacy no-fallback - ProvisioningErrorClassifierTest: retryable codes, quota exclusion, unknown-code policy, case insensitivity, null safety - ConfigAsCodeTest: CasC round-trip for fallbackCandidates field Co-authored-by: Cursor --- .../computeengine/ConfigAsCodeTest.java | 16 ++ .../InstanceConfigurationFallbackTest.java | 248 ++++++++++++++++++ .../ProvisioningErrorClassifierTest.java | 107 ++++++++ .../computeengine/configuration-as-code.yml | 7 + 4 files changed, 378 insertions(+) create mode 100644 src/test/java/com/google/jenkins/plugins/computeengine/InstanceConfigurationFallbackTest.java create mode 100644 src/test/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifierTest.java diff --git a/src/test/java/com/google/jenkins/plugins/computeengine/ConfigAsCodeTest.java b/src/test/java/com/google/jenkins/plugins/computeengine/ConfigAsCodeTest.java index a8c89c0d..1b6d5cef 100644 --- a/src/test/java/com/google/jenkins/plugins/computeengine/ConfigAsCodeTest.java +++ b/src/test/java/com/google/jenkins/plugins/computeengine/ConfigAsCodeTest.java @@ -82,6 +82,22 @@ public void shouldCreateCloudInstanceFromCode() { assertEquals(true, timeRangeConfig.getFriday()); assertEquals(false, timeRangeConfig.getSaturday()); assertEquals(false, timeRangeConfig.getSunday()); + assertNotNull("fallbackCandidates should not be null", configuration.getFallbackCandidates()); + assertEquals( + "Wrong fallbackCandidates size", + 2, + configuration.getFallbackCandidates().size()); + assertEquals("us-west1-b", configuration.getFallbackCandidates().get(0).getZone()); + assertEquals( + "n4d-standard-32", 
configuration.getFallbackCandidates().get(0).getMachineType()); + assertEquals( + "us-central1-a", configuration.getFallbackCandidates().get(1).getZone()); + assertEquals( + "n2d-standard-32", configuration.getFallbackCandidates().get(1).getMachineType()); + assertEquals("us-central1", configuration.getFallbackCandidates().get(1).getRegion()); + assertEquals( + "gce-jenkins-central", + configuration.getFallbackCandidates().get(1).getSubnetwork()); } @Test diff --git a/src/test/java/com/google/jenkins/plugins/computeengine/InstanceConfigurationFallbackTest.java b/src/test/java/com/google/jenkins/plugins/computeengine/InstanceConfigurationFallbackTest.java new file mode 100644 index 00000000..3d70d480 --- /dev/null +++ b/src/test/java/com/google/jenkins/plugins/computeengine/InstanceConfigurationFallbackTest.java @@ -0,0 +1,248 @@ +/* + * Copyright 2026 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.jenkins.plugins.computeengine; + +import static com.google.jenkins.plugins.computeengine.InstanceConfigurationTest.instanceConfigurationBuilder; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThrows; +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.ArgumentMatchers.anyString; +import static org.mockito.ArgumentMatchers.eq; +import static org.mockito.Mockito.never; +import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +import com.google.api.services.compute.model.Instance; +import com.google.api.services.compute.model.Operation; +import com.google.cloud.graphite.platforms.plugin.client.ComputeClient; +import com.google.cloud.graphite.platforms.plugin.client.ComputeClient.OperationException; +import java.io.IOException; +import java.util.List; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.jvnet.hudson.test.JenkinsRule; +import org.mockito.ArgumentCaptor; +import org.mockito.Mock; +import org.mockito.junit.MockitoJUnitRunner; + +/** Unit tests for the ordered zone/machine-type fallback logic in {@link InstanceConfiguration#provision()}. 
*/ +@RunWith(MockitoJUnitRunner.class) +public class InstanceConfigurationFallbackTest { + + private static final String PROJECT_ID = "test-project"; + + private static final String PRIMARY_ZONE = "us-west1-a"; + private static final String PRIMARY_MACHINE_TYPE = "n1-standard-1"; + private static final String F1_ZONE = "us-west1-b"; + private static final String F1_MACHINE_TYPE = "n4d-standard-32"; + private static final String F2_ZONE = "us-central1-a"; + private static final String F2_MACHINE_TYPE = "n2d-standard-32"; + + private static final String CAPACITY_CODE = "ZONE_RESOURCE_POOL_EXHAUSTED"; + + @Mock + public ComputeEngineCloud cloud; + + @Mock + public ComputeClient computeClient; + + @Rule + public JenkinsRule r = new JenkinsRule(); + + @Before + public void init() { + when(cloud.getProjectId()).thenReturn(PROJECT_ID); + when(cloud.getClient()).thenReturn(computeClient); + when(cloud.getCloudName()).thenReturn("test"); + } + + private InstanceConfiguration configWithFallbacks(FallbackCandidate... 
candidates) { + return instanceConfigurationBuilder() + .zone(PRIMARY_ZONE) + .machineType(PRIMARY_MACHINE_TYPE) + .fallbackCandidates(List.of(candidates)) + .cloud(cloud) + .build(); + } + + private static FallbackCandidate candidate(String zone, String machineType) { + return new FallbackCandidate(zone, machineType); + } + + private static Operation op(String name, String zone) { + return new Operation().setName(name).setZone(zone); + } + + private static Operation opWithError(String name, String zone, String code) { + return op(name, zone) + .setError(new Operation.Error() + .setErrors(List.of( + new Operation.Error.Errors().setCode(code).setMessage("simulated " + code)))); + } + + @Test + public void firstCandidateSucceeds_noFallbackUsed() throws Exception { + InstanceConfiguration config = configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE)); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenReturn(op("op-primary", PRIMARY_ZONE)); // no error + + ComputeEngineInstance node = config.provision(); + + assertNotNull(node); + assertEquals(PRIMARY_ZONE, node.getZone()); + verify(computeClient, times(1)).insertInstance(anyString(), any(), any(Instance.class)); + verify(computeClient, never()).terminateInstanceAsync(anyString(), anyString(), anyString()); + } + + @Test + public void capacityFailureFallsBackToNextCandidate() throws Exception { + InstanceConfiguration config = configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE)); + + ArgumentCaptor instanceCaptor = ArgumentCaptor.forClass(Instance.class); + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE), op("op-fallback", F1_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenReturn( + 
opWithError("op-primary", PRIMARY_ZONE, CAPACITY_CODE), // primary exhausted + op("op-fallback", F1_ZONE)); // fallback succeeds + + ComputeEngineInstance node = config.provision(); + + assertNotNull(node); + // The winning node should be in the fallback candidate's zone. + assertEquals(F1_ZONE, node.getZone()); + + // Two insert attempts, in order: primary then fallback. + verify(computeClient, times(2)).insertInstance(anyString(), any(), instanceCaptor.capture()); + List attempted = instanceCaptor.getAllValues(); + assertEquals(PRIMARY_ZONE, attempted.get(0).getZone()); + assertEquals(PRIMARY_MACHINE_TYPE, attempted.get(0).getMachineType()); + assertEquals(F1_ZONE, attempted.get(1).getZone()); + assertEquals(F1_MACHINE_TYPE, attempted.get(1).getMachineType()); + + // The failed primary VM must be cleaned up exactly once, in the primary zone. + verify(computeClient, times(1)).terminateInstanceAsync(eq(PROJECT_ID), eq(PRIMARY_ZONE), anyString()); + } + + @Test + public void walksMultipleCandidatesInOrder() throws Exception { + InstanceConfiguration config = + configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE), candidate(F2_ZONE, F2_MACHINE_TYPE)); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE), op("op-f1", F1_ZONE), op("op-f2", F2_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenReturn( + opWithError("op-primary", PRIMARY_ZONE, CAPACITY_CODE), + opWithError("op-f1", F1_ZONE, CAPACITY_CODE), + op("op-f2", F2_ZONE)); + + ComputeEngineInstance node = config.provision(); + + assertNotNull(node); + assertEquals(F2_ZONE, node.getZone()); + verify(computeClient, times(3)).insertInstance(anyString(), any(), any(Instance.class)); + verify(computeClient, times(2)).terminateInstanceAsync(anyString(), anyString(), anyString()); + } + + @Test + public void nonRetryableErrorAbortsImmediately() throws Exception { + 
InstanceConfiguration config = configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE)); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE), op("op-fallback", F1_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenReturn(opWithError("op-primary", PRIMARY_ZONE, "QUOTA_EXCEEDED")); + + IOException ex = assertThrows(IOException.class, config::provision); + assertEquals(true, ex.getMessage().contains("Non-retryable")); + + // Must NOT try the fallback candidate on a non-retryable error. + verify(computeClient, times(1)).insertInstance(anyString(), any(), any(Instance.class)); + // Still cleans up the failed VM. + verify(computeClient, times(1)).terminateInstanceAsync(eq(PROJECT_ID), eq(PRIMARY_ZONE), anyString()); + } + + @Test + public void allCandidatesExhaustedThrows() throws Exception { + InstanceConfiguration config = configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE)); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE), op("op-fallback", F1_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenReturn( + opWithError("op-primary", PRIMARY_ZONE, CAPACITY_CODE), + opWithError("op-fallback", F1_ZONE, CAPACITY_CODE)); + + IOException ex = assertThrows(IOException.class, config::provision); + assertEquals(true, ex.getMessage().contains("Exhausted all fallback candidates")); + + verify(computeClient, times(2)).insertInstance(anyString(), any(), any(Instance.class)); + verify(computeClient, times(2)).terminateInstanceAsync(anyString(), anyString(), anyString()); + } + + @Test + public void operationExceptionIsTreatedAsFailure() throws Exception { + InstanceConfiguration config = configWithFallbacks(candidate(F1_ZONE, F1_MACHINE_TYPE)); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), 
any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE), op("op-fallback", F1_ZONE)); + when(computeClient.waitForOperationCompletion(eq(PROJECT_ID), any(Operation.class), anyLong())) + .thenThrow(new OperationException(new Operation.Error() + .setErrors(List.of(new Operation.Error.Errors() + .setCode(CAPACITY_CODE) + .setMessage("thrown"))))) + .thenReturn(op("op-fallback", F1_ZONE)); + + ComputeEngineInstance node = config.provision(); + + assertNotNull(node); + assertEquals(F1_ZONE, node.getZone()); + verify(computeClient, times(2)).insertInstance(anyString(), any(), any(Instance.class)); + } + + @Test + public void noFallbackConfigured_behaviorUnchanged() throws Exception { + // No fallback candidates: provision() must NOT wait on the operation and must return the + // node immediately, leaving the launcher to handle the operation (legacy behavior). + InstanceConfiguration config = instanceConfigurationBuilder() + .zone(PRIMARY_ZONE) + .machineType(PRIMARY_MACHINE_TYPE) + .cloud(cloud) + .build(); + + when(computeClient.insertInstance(eq(PROJECT_ID), any(), any(Instance.class))) + .thenReturn(op("op-primary", PRIMARY_ZONE)); + + ComputeEngineInstance node = config.provision(); + + assertNotNull(node); + assertEquals(PRIMARY_ZONE, node.getZone()); + verify(computeClient, times(1)).insertInstance(anyString(), any(), any(Instance.class)); + // Legacy path does not pre-wait on the operation inside provision(). 
+ verify(computeClient, never()).waitForOperationCompletion(anyString(), any(Operation.class), anyLong()); + verify(computeClient, never()).terminateInstanceAsync(anyString(), anyString(), anyString()); + } +} diff --git a/src/test/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifierTest.java b/src/test/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifierTest.java new file mode 100644 index 00000000..9cdfba6b --- /dev/null +++ b/src/test/java/com/google/jenkins/plugins/computeengine/ProvisioningErrorClassifierTest.java @@ -0,0 +1,107 @@ +/* + * Copyright 2026 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.google.jenkins.plugins.computeengine; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; + +import com.google.api.services.compute.model.Operation; +import java.util.List; +import org.junit.Test; + +public class ProvisioningErrorClassifierTest { + + private static Operation.Error error(String code, String message) { + return new Operation.Error() + .setErrors(List.of(new Operation.Error.Errors().setCode(code).setMessage(message))); + } + + @Test + public void capacityCodesAreRetryable() { + assertTrue(ProvisioningErrorClassifier.isRetryable("ZONE_RESOURCE_POOL_EXHAUSTED")); + assertTrue(ProvisioningErrorClassifier.isRetryable("ZONE_RESOURCE_POOL_EXHAUSTED_WITH_DETAILS")); + assertTrue(ProvisioningErrorClassifier.isRetryable("RESOURCE_POOL_EXHAUSTED")); + assertTrue(ProvisioningErrorClassifier.isRetryable("STOCKOUT")); + assertTrue(ProvisioningErrorClassifier.isRetryable("RESOURCE_NOT_READY")); + } + + @Test + public void stockoutMessageIsRetryable() { + assertTrue( + ProvisioningErrorClassifier.isRetryable( + "The zone 'projects/p/zones/us-west1-a' does not have enough resources available to fulfill the request.")); + } + + @Test + public void caseInsensitive() { + assertTrue(ProvisioningErrorClassifier.isRetryable("zone_resource_pool_exhausted")); + assertTrue(ProvisioningErrorClassifier.isRetryable("Stockout")); + assertTrue(ProvisioningErrorClassifier.isRetryable("resource_not_ready")); + } + + @Test + public void quotaIsNotRetryable() { + assertFalse(ProvisioningErrorClassifier.isRetryable("QUOTA_EXCEEDED")); + assertFalse(ProvisioningErrorClassifier.isRetryable("CPUS_QUOTA_EXCEEDED")); + } + + @Test + public void nonCapacityErrorsAreNotRetryable() { + assertFalse(ProvisioningErrorClassifier.isRetryable("PERMISSION_DENIED")); + assertFalse(ProvisioningErrorClassifier.isRetryable("INVALID_FIELD_VALUE")); + 
assertFalse(ProvisioningErrorClassifier.isRetryable("RESOURCE_NOT_FOUND")); + assertFalse(ProvisioningErrorClassifier.isRetryable("UNSUPPORTED_OPERATION")); + } + + @Test + public void unknownErrorCodesAreNotRetryable() { + assertFalse(ProvisioningErrorClassifier.isRetryable("SOME_FUTURE_ERROR_CODE")); + assertFalse(ProvisioningErrorClassifier.isRetryable("UNEXPECTED_FAILURE")); + assertFalse(ProvisioningErrorClassifier.isRetryable("")); + } + + @Test + public void nullIsNotRetryable() { + assertFalse(ProvisioningErrorClassifier.isRetryable(null)); + } + + @Test + public void hasErrorsHandlesNullAndEmpty() { + assertFalse(ProvisioningErrorClassifier.hasErrors(null)); + assertFalse(ProvisioningErrorClassifier.hasErrors(new Operation.Error())); + assertTrue(ProvisioningErrorClassifier.hasErrors(error("ZONE_RESOURCE_POOL_EXHAUSTED", "boom"))); + } + + @Test + public void firstErrorCodeExtractsCode() { + assertEquals( + "ZONE_RESOURCE_POOL_EXHAUSTED", + ProvisioningErrorClassifier.firstErrorCode(error("ZONE_RESOURCE_POOL_EXHAUSTED", "boom"))); + assertNull(ProvisioningErrorClassifier.firstErrorCode(null)); + assertNull(ProvisioningErrorClassifier.firstErrorCode(new Operation.Error())); + } + + @Test + public void errorSummaryIncludesCodeAndMessage() { + String summary = ProvisioningErrorClassifier.errorSummary(error("ZONE_RESOURCE_POOL_EXHAUSTED", "no capacity")); + assertTrue(summary.contains("ZONE_RESOURCE_POOL_EXHAUSTED")); + assertTrue(summary.contains("no capacity")); + assertEquals("unknown error", ProvisioningErrorClassifier.errorSummary(null)); + } +} diff --git a/src/test/resources/com/google/jenkins/plugins/computeengine/configuration-as-code.yml b/src/test/resources/com/google/jenkins/plugins/computeengine/configuration-as-code.yml index ca8b53e0..c9ccb0af 100644 --- a/src/test/resources/com/google/jenkins/plugins/computeengine/configuration-as-code.yml +++ b/src/test/resources/com/google/jenkins/plugins/computeengine/configuration-as-code.yml @@ -67,6 
+67,13 @@ jenkins: enableSecureBoot: true enableVtpm: true enableIntegrityMonitoring: false + fallbackCandidates: + - zone: "us-west1-b" + machineType: "n4d-standard-32" + - zone: "us-central1-a" + machineType: "n2d-standard-32" + region: "us-central1" + subnetwork: "gce-jenkins-central" credentials: system: domainCredentials: