Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 1 addition & 5 deletions tests/e2e/npd_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -208,18 +208,14 @@ func tearDownPrometheusOperator() {
}

func (s *E2ESuite) TestNodeProblemDetector(c *C) {
if s.simEnable {
skipTest(c, "Skipping for non amd gpu testbed")
}

_, err := s.dClient.DeviceConfigs(s.ns).Get(s.cfgName, metav1.GetOptions{})
assert.Errorf(c, err, fmt.Sprintf("expected no config to be present. but config %v exists", s.cfgName))

exporterEnable := true
driverEnable := false
devCfg := s.getDeviceConfig(c)
devCfg.Spec.MetricsExporter.Enable = &exporterEnable
devCfg.Spec.MetricsExporter.Image = exporterImage
devCfg.Spec.MetricsExporter.Image = exporterMockImage
devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
devCfg.Spec.MetricsExporter.Port = 5000
devCfg.Spec.Driver.Enable = &driverEnable
Expand Down
23 changes: 11 additions & 12 deletions tests/e2e/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ func (s *E2ESuite) populateDeviceConfig(c *C) *v1alpha1.DeviceConfig {
devCfg.Spec.RemediationWorkflow.Enable = &remediationEnable
devCfg.Spec.RemediationWorkflow.TesterImage = agfhcTestRunnerImage
devCfg.Spec.MetricsExporter.Enable = &remediationEnable
devCfg.Spec.MetricsExporter.Image = exporterImage
devCfg.Spec.MetricsExporter.Image = exporterMockImage
devCfg.Spec.MetricsExporter.ImagePullPolicy = "Always"
devCfg.Spec.MetricsExporter.Port = 5000
devCfg.Spec.CommonConfig.UtilsContainer.Image = utilsContainerImage
Expand Down Expand Up @@ -285,10 +285,6 @@ func (s *E2ESuite) TestAutoNodeRemediationWithPhysicalAction(c *C) {
}

func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) {
if s.simEnable {
skipTest(c, "Skipping for non amd gpu testbed")
}

logger.Infof("Starting Auto Node Remediation abort workflow test")

nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
Expand Down Expand Up @@ -351,20 +347,17 @@ func (s *E2ESuite) TestAutoNodeRemediationAbortWorkflow(c *C) {

logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration")
s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath)
s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse)

//verify workflow is aborted and deleted
logger.Infof("Verifying remediation workflow is aborted and deleted on the node %s", nodeName)
assert.Eventually(c, func() bool {
return s.checkWorkflowExistence(c, nodeName, false)
}, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")
}, 2*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")
}

func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) {
logger.Infof("Starting Auto Node Remediation recovery policy test")
if s.simEnable {
skipTest(c, "Skipping for non amd gpu testbed")
}

nodes, err := s.clientSet.CoreV1().Nodes().List(context.Background(), metav1.ListOptions{
LabelSelector: "feature.node.kubernetes.io/amd-gpu=true",
})
Expand Down Expand Up @@ -428,6 +421,11 @@ func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) {
return s.checkWorkflowExistence(c, nodeName, true)
}, 2*time.Minute, 10*time.Second, "Remediation workflow was started despite max retries reached")

defer func() {
s.untaintNode(nodeName)
s.clearRemediationWorkflowStatusMetaData(devCfg.Namespace, c)
}()

//verify workflow is suspended waiting for physical action
logger.Infof("Verifying remediation workflow is suspended on the node %s", nodeName)
assert.Eventually(c, func() bool {
Expand All @@ -441,13 +439,14 @@ func (s *E2ESuite) TestAutoNodeRemediationRecoveryPolicy(c *C) {

logger.Infof("Reverting Node Problem Detector (NPD) thresholds to original configuration")
s.updateConfigForNPD(c, npdInband2RASErrorConfigPath, npdInbandRASConfigPath)
s.verifyNodeCondition(c, conditionInternalError, corev1.ConditionFalse)

//verify workflow is aborted and deleted
logger.Infof("Verifying remediation workflow is aborted and deleted on the node %s", nodeName)
assert.Eventually(c, func() bool {
return s.checkWorkflowExistence(c, nodeName, false)
}, 1*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")
s.untaintNode(nodeName)
}, 2*time.Minute, 10*time.Second, "Remediation workflow was not aborted and deleted")

}

func (s *E2ESuite) verifyDeviceConfigErrorStatus(devCfg *v1alpha1.DeviceConfig, c *C, errStr string) {
Expand Down
Loading