-
Notifications
You must be signed in to change notification settings - Fork 38
feat: Add switch leak detection and powering off switches on detection of l… #566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
4f36a52
530def3
89503c8
af3faf1
cb37314
78632be
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -136,6 +136,27 @@ func (c *grpcClient) GetLeakingMachineIds(ctx context.Context) ([]string, error) | |
| return ids, nil | ||
| } | ||
|
|
||
| func (c *grpcClient) GetLeakingSwitchIds(ctx context.Context) ([]string, error) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Core Is it acceptable to power off all switches that have the leak alert, regardless of their power state? If not, we need a Core field (or a post-filter via another RPC) before Flow can match the machine behavior. |
||
| ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) | ||
| defer cancel() | ||
|
|
||
| alert := "hardware-health.tray-leak-detection" | ||
| searchConfig := pb.SwitchSearchFilter{ | ||
| OnlyWithHealthAlert: &alert, | ||
| } | ||
|
|
||
| switchIDs, err := c.gclient.FindSwitchIds(ctx, &searchConfig) | ||
| if err != nil { | ||
| return nil, err | ||
| } | ||
|
|
||
| ids := make([]string, 0, len(switchIDs.GetIds())) | ||
| for _, switchID := range switchIDs.GetIds() { | ||
| ids = append(ids, switchID.GetId()) | ||
| } | ||
| return ids, nil | ||
| } | ||
|
|
||
| // Version returns the version string of nico-core-api, mainly as a "ping" | ||
| func (c *grpcClient) Version(ctx context.Context) (string, error) { | ||
| ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) | ||
|
|
||
| Original file line number | Diff line number | Diff line change | ||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
|
|
@@ -3976,6 +3976,7 @@ message ManagedHostNetworkConfigResponse { | |||||||||||||||||
|
|
||||||||||||||||||
| // Route imports and tagging details for exports | ||||||||||||||||||
| // used by FNN configs. | ||||||||||||||||||
| // Deprecated: use FlatInterfaceConfig.routing_profile for per-VPC routing. | ||||||||||||||||||
| // NOTE: This will replace internet_l3_vni and common_internal_route_target but could allow | ||||||||||||||||||
| // common_internal_route_target to be renamed/repurposed as a site-level RT. | ||||||||||||||||||
| // to become a site-level common route target. | ||||||||||||||||||
|
|
@@ -4200,6 +4201,10 @@ message FlatInterfaceConfig { | |||||||||||||||||
| // IPv6 configuration for dual-stack FNN interfaces. | ||||||||||||||||||
| optional FlatInterfaceIpv6Config ipv6_interface_config = 19; | ||||||||||||||||||
|
|
||||||||||||||||||
| // Route imports and tagging details for exports used by FNN configs. | ||||||||||||||||||
| // This is scoped to the VPC that owns this interface. | ||||||||||||||||||
| optional RoutingProfile routing_profile = 20; | ||||||||||||||||||
|
Comment on lines
+4204
to
+4206
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Define precedence when both routing profile fields are present. Line 4206 adds the per-interface `routing_profile` field. Suggested comment-level contract clarification:
- // Route imports and tagging details for exports used by FNN configs.
- // This is scoped to the VPC that owns this interface.
+ // Route imports and tagging details for exports used by FNN configs.
+ // This is scoped to the VPC that owns this interface.
+ // Precedence: when set, this field overrides ManagedHostNetworkConfigResponse.routing_profile.
+ // Fallback: use ManagedHostNetworkConfigResponse.routing_profile only when this field is unset.
optional RoutingProfile routing_profile = 20;
As per coding guidelines: 📝 Committable suggestion
Suggested change
🤖 Prompt for AI Agents |
||||||||||||||||||
|
|
||||||||||||||||||
| // The details of the network security group associated with | ||||||||||||||||||
| // either the instance or its parent VPC. | ||||||||||||||||||
| // Currently, source would either be INSTANCE or VPC. | ||||||||||||||||||
|
|
||||||||||||||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,17 +48,37 @@ func runLeakDetectionOne( | |
| for _, machineID := range leakingMachineIds { | ||
| log.Info().Msgf("Leaking machine ID: %s, submitting force power-off task", machineID) | ||
|
|
||
| if err := submitPowerOffTask(ctx, taskMgr, machineID); err != nil { | ||
| err := submitPowerOffTask(ctx, taskMgr, machineID, devicetypes.ComponentTypeCompute) | ||
| if err != nil { | ||
| log.Error().Err(err).Str("machine_id", machineID). | ||
| Msg("Failed to submit power-off task for leaking machine") | ||
| } | ||
| } | ||
|
|
||
| leakingSwitchIds, err := nicoClient.GetLeakingSwitchIds(ctx) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If Is it as expectation? |
||
| if err != nil { | ||
| log.Error().Err(err).Msg("Unable to retrieve leaking switch IDs from NICo") | ||
| return | ||
| } | ||
|
|
||
| log.Info().Msgf("Found %d leaking switch IDs", len(leakingSwitchIds)) | ||
|
|
||
| for _, switchID := range leakingSwitchIds { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: We can add e.g.
|
||
| log.Info().Msgf("Leaking switch ID: %s, submitting force power-off task", switchID) | ||
|
|
||
| err := submitPowerOffTask(ctx, taskMgr, switchID, devicetypes.ComponentTypeNVSwitch) | ||
| if err != nil { | ||
| log.Error().Err(err).Str("switch_id", switchID). | ||
| Msg("Failed to submit power-off task for leaking switch") | ||
| } | ||
|
coderabbitai[bot] marked this conversation as resolved.
|
||
| } | ||
| } | ||
|
|
||
| func submitPowerOffTask( | ||
| ctx context.Context, | ||
| taskMgr taskmanager.Manager, | ||
| machineID string, | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now that this helper serves both compute and NVSwitch, the parameter name `machineID` is misleading. |
||
| componentType devicetypes.ComponentType, | ||
| ) error { | ||
| info := &operations.PowerControlTaskInfo{ | ||
| Operation: operations.PowerOperationForcePowerOff, | ||
|
|
@@ -80,7 +100,7 @@ func submitPowerOffTask( | |
| Components: []operation.ComponentTarget{ | ||
| { | ||
| External: &operation.ExternalRef{ | ||
| Type: devicetypes.ComponentTypeCompute, | ||
| Type: componentType, | ||
| ID: machineID, | ||
| }, | ||
| }, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -65,7 +65,7 @@ func TestSubmitPowerOffTask_Success(t *testing.T) { | |
| mgr := &mockManager{} | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tests were updated for
Please extend the existing tests. |
||
| machineID := "machine-abc-123" | ||
|
|
||
| err := submitPowerOffTask(ctx, mgr, machineID) | ||
| err := submitPowerOffTask(ctx, mgr, machineID, devicetypes.ComponentTypeCompute) | ||
| require.NoError(t, err) | ||
| require.Len(t, mgr.requests, 1) | ||
|
|
||
|
|
@@ -92,7 +92,7 @@ func TestSubmitPowerOffTask_NoTasksCreated(t *testing.T) { | |
| ctx := context.Background() | ||
| mgr := &mockManager{returnNoTaskID: true} | ||
|
|
||
| err := submitPowerOffTask(ctx, mgr, "machine-xyz") | ||
| err := submitPowerOffTask(ctx, mgr, "machine-xyz", devicetypes.ComponentTypeCompute) | ||
| require.Error(t, err) | ||
| assert.Contains(t, err.Error(), "failed to create any power-off tasks") | ||
| } | ||
|
|
@@ -101,7 +101,7 @@ func TestSubmitPowerOffTask_SubmitError(t *testing.T) { | |
| ctx := context.Background() | ||
| mgr := &mockManager{submitErr: errors.New("submit failed")} | ||
|
|
||
| err := submitPowerOffTask(ctx, mgr, "machine-xyz") | ||
| err := submitPowerOffTask(ctx, mgr, "machine-xyz", devicetypes.ComponentTypeCompute) | ||
| require.Error(t, err) | ||
| assert.Contains(t, err.Error(), "submit failed") | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can add some comments on
`GetLeakingSwitchIds` (same style as `GetLeakingMachineIds`): what NICo API is called, what IDs are returned (Core `SwitchId`), and that callers use them as the Flow `external_id` for `ComponentTypeNVSwitch`.