Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

package com.google.jenkins.plugins.computeengine;

import com.google.api.client.googleapis.json.GoogleJsonResponseException;
import com.google.api.services.compute.model.AccessConfig;
import com.google.api.services.compute.model.Instance;
import com.google.api.services.compute.model.NetworkInterface;
Expand All @@ -40,6 +41,7 @@
import java.io.PrintStream;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.net.SocketTimeoutException;
import java.util.Base64;
import java.util.Optional;
import java.util.logging.Level;
Expand Down Expand Up @@ -154,14 +156,16 @@ public void launch(SlaveComputer slaveComputer, TaskListener listener) {
}
if (opError != null) {
LOGGER.info(String.format(
"Launch failed while waiting for operation %s to complete. Operation error was %s",
"Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance.",
insertOperationId, opError.getErrors().get(0).getMessage()));
terminateNode(computer, listener);
return;
}
} catch (InterruptedException e) {
LOGGER.info(String.format(
"Launch failed while waiting for operation %s to complete. Operation error was %s",
"Launch failed while waiting for operation %s to complete. Operation error was %s. Terminating instance",
insertOperationId, opError.getErrors().get(0).getMessage()));
terminateNode(computer, listener);
return;
}

Expand Down Expand Up @@ -214,19 +218,26 @@ public void launch(SlaveComputer slaveComputer, TaskListener listener) {
launch(computer, listener);
} catch (IOException ioe) {
ioe.printStackTrace(listener.error(ioe.getMessage()));
node = (ComputeEngineInstance) slaveComputer.getNode();
if (node != null) {
try {
node.terminate();
} catch (Exception e) {
listener.error(String.format("Failed to terminate node %s", node.getDisplayName()));
}
}
terminateNode(slaveComputer, listener);
} catch (InterruptedException ie) {

}
}

private static void terminateNode(SlaveComputer slaveComputer, TaskListener listener) {
ComputeEngineInstance node = (ComputeEngineInstance) slaveComputer.getNode();
if (node != null) {
Comment thread
Artmorse marked this conversation as resolved.
try {
node.terminate();
} catch (Exception e) {
listener.error(String.format("Failed to terminate node %s", node.getDisplayName()));
}
} else {
LOGGER.fine(
String.format("Tried to terminate unknown node from computer %s", slaveComputer.getDisplayName()));
}
}

private boolean testCommand(
ComputeEngineComputer computer,
Connection conn,
Expand Down Expand Up @@ -343,6 +354,10 @@ protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener l
+ ")");
}
Instance instance = computer.refreshInstance();
// the instance will be null when the node is terminated
if (instance == null) {
return null;
}

String host = "";

Expand Down Expand Up @@ -410,10 +425,25 @@ protected Connection connectToSsh(ComputeEngineComputer computer, TaskListener l
SSH_TIMEOUT_MILLIS);
logInfo(computer, listener, "Connected via SSH.");
return conn;
} catch (IOException e) {
} catch (GoogleJsonResponseException e) {
if (e.getStatusCode() == 404) {
log(
LOGGER,
Level.SEVERE,
listener,
String.format("Instance %s not found. Terminating instance.", computer.getName()));
terminateNode(computer, listener);
}
} catch (SocketTimeoutException e) {
// keep retrying until SSH comes up
logInfo(computer, listener, "Failed to connect via ssh: " + e.getMessage());
logInfo(computer, listener, "Waiting for SSH to come up. Sleeping 5.");
logInfo(computer, listener, String.format("Failed to connect via ssh: %s", e.getMessage()));
logInfo(
computer,
listener,
String.format("Waiting for SSH to come up. Sleeping %d.", SSH_SLEEP_MILLIS / 1000));
Thread.sleep(SSH_SLEEP_MILLIS);
} catch (IOException e) {
Comment thread
Artmorse marked this conversation as resolved.
logWarning(computer, listener, String.format("An error occured: %s", e.getMessage()));
Thread.sleep(SSH_SLEEP_MILLIS);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,11 @@

package com.google.jenkins.plugins.computeengine;

import static com.google.jenkins.plugins.computeengine.ComputeEngineCloud.CLOUD_ID_LABEL_KEY;

import com.google.cloud.graphite.platforms.plugin.client.ComputeClient.OperationException;
import com.google.common.base.Strings;
import com.google.common.collect.ImmutableMap;
import com.google.jenkins.plugins.computeengine.ssh.GoogleKeyCredential;
import edu.umd.cs.findbugs.annotations.Nullable;
import hudson.Extension;
Expand All @@ -30,6 +33,7 @@
import hudson.slaves.RetentionStrategy;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Optional;
import java.util.logging.Level;
import java.util.logging.Logger;
Expand Down Expand Up @@ -130,9 +134,16 @@ protected void _terminate(TaskListener listener) throws IOException, Interrupted
.createSnapshotSync(cloud.getProjectId(), this.zone, this.getNodeName(), createSnapshotTimeout);
}

// If the instance is running, attempt to terminate it. This is an async call and we
Map<String, String> filterLabel = ImmutableMap.of(CLOUD_ID_LABEL_KEY, cloud.getInstanceId());
var instanceExistsInCloud =
cloud.getClient().listInstancesWithLabel(cloud.getProjectId(), filterLabel).stream()
.anyMatch(instance -> instance.getName().equals(name));

// If the instance exists in the cloud, attempt to terminate it. This is an async call and we
// return immediately, hoping for the best.
cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name);
if (instanceExistsInCloud) {

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this needed? Is it just to avoid an unnecessary error message in the logs in case the instance does not exist in the cloud?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This PR is based on another branch (see #470), but I don't think this condition is useful. Even if we verify that the instance exists in the cloud, it could still be removed before terminateInstanceAsync runs, so the check cannot guarantee that the call succeeds. I will remove that part.

@Theoderich, please tell us if we're missing any cases.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done in f6fc88c.

@Theoderich Theoderich Dec 11, 2024

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is part of the fix for #471 and should not be removed.
The Problem:
When Jenkins tries to start a node and the quotas in the target cloud do not allow it, the instance is not started in the cloud, but Jenkins thinks that a node was started. This leaves a zombie node in Jenkins.

When trying to delete this zombie node, Jenkins calls cloud.getClient().terminateInstanceAsync, which throws an exception because the instance does not exist in the cloud. The deletion is aborted and the zombie node stays, forever undeletable.

The Solution:
I added a check for whether the node actually exists in the cloud before calling cloud.getClient().terminateInstanceAsync. If the node does not exist as an instance in the cloud, there is no need to terminate it and the deletion can proceed.

Why not just catch the exception thrown by cloud.getClient().terminateInstanceAsync?
It is not clear from the exception why the terminate operation failed. If the instance exists in the cloud and the termination failed for some other reason, we do not want to delete the node in Jenkins.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, thanks for the detailed explanation.
I've reverted the changes.

cloud.getClient().terminateInstanceAsync(cloud.getProjectId(), zone, name);
}
} catch (CloudNotFoundException cnfe) {
listener.error(cnfe.getMessage());
} catch (OperationException oe) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,9 @@ private Optional<Connection> bootstrap(
logInfo(computer, listener, "Authenticating as " + node.getSshUser());
try {
bootstrapConn = connectToSsh(computer, listener);
if (bootstrapConn == null) {
break;
}
isAuthenticated = bootstrapConn.authenticateWithPublicKey(
node.getSshUser(),
Secret.toString(keyCred.getPrivateKey()).toCharArray(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ private Optional<Connection> bootstrap(ComputeEngineComputer computer, TaskListe
logInfo(computer, listener, "Authenticating as " + node.getSshUser());
try {
bootstrapConn = connectToSsh(computer, listener);
if (bootstrapConn == null) {
break;
}
isAuthenticated = authenticateSSH(node.getSshUser(), windowsConfig, bootstrapConn, listener);
} catch (IOException e) {
logException(computer, listener, "Exception trying to authenticate", e);
Expand Down