Skip to content
2,998 changes: 1,506 additions & 1,492 deletions flow/internal/nicoapi/gen/nico.pb.go

Large diffs are not rendered by default.

21 changes: 21 additions & 0 deletions flow/internal/nicoapi/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,27 @@ func (c *grpcClient) GetLeakingMachineIds(ctx context.Context) ([]string, error)
return ids, nil
}

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can add some comments on GetLeakingSwitchIds (same style as GetLeakingMachineIds): what NICo API is called, what IDs are returned (Core SwitchId), and that callers use them as Flow external_id for ComponentTypeNVSwitch.

func (c *grpcClient) GetLeakingSwitchIds(ctx context.Context) ([]string, error) {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Core SwitchSearchFilter (forge.proto) has only_with_health_alert but no only_with_power_state, unlike MachineSearchConfig. So we can't mirror the machine "on" filter without a Core proto/API change.

Is it acceptable to power off all switches with the leak alert regardless of power state? If not, we need a Core field (or post-filter via another RPC) before Flow can match machine behavior.

ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
defer cancel()

alert := "hardware-health.tray-leak-detection"
searchConfig := pb.SwitchSearchFilter{
OnlyWithHealthAlert: &alert,
}

switchIDs, err := c.gclient.FindSwitchIds(ctx, &searchConfig)
if err != nil {
return nil, err
}

ids := make([]string, 0, len(switchIDs.GetIds()))
for _, switchID := range switchIDs.GetIds() {
ids = append(ids, switchID.GetId())
}
return ids, nil
}

// Version returns the version string of nico-core-api, mainly as a "ping"
func (c *grpcClient) Version(ctx context.Context) (string, error) {
ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
Expand Down
5 changes: 5 additions & 0 deletions flow/internal/nicoapi/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ type mockClient struct {
machineInterfaces map[string]MachineInterface
expectedSwitches map[string]ExpectedSwitchInfo // keyed by BMC MAC
leakingMachineIds []string
leakingSwitchIds []string
firmwareUpdateTimeWindowErr error // If set, SetFirmwareUpdateTimeWindow will return this error
adminPowerControlErr error // If set, AdminPowerControl will return this error
desiredFirmwareVersions []*pb.DesiredFirmwareVersionEntry
Expand Down Expand Up @@ -62,6 +63,10 @@ func (c *mockClient) GetLeakingMachineIds(ctx context.Context) ([]string, error)
return c.leakingMachineIds, nil
}

func (c *mockClient) GetLeakingSwitchIds(ctx context.Context) ([]string, error) {
return c.leakingSwitchIds, nil
}

func (c *mockClient) SetLeakingMachineIds(ids []string) {
c.leakingMachineIds = ids
}
Expand Down
1 change: 1 addition & 0 deletions flow/internal/nicoapi/mod.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ type Client interface {
Version(ctx context.Context) (string, error)
GetMachines(ctx context.Context) ([]MachineDetail, error)
GetLeakingMachineIds(ctx context.Context) ([]string, error)
GetLeakingSwitchIds(ctx context.Context) ([]string, error)
GetPowerStates(ctx context.Context, machineIds []string) (ret []MachinePowerState, err error)
SetFirmwareUpdateTimeWindow(ctx context.Context, machineIds []string, startTime, endTime time.Time) error
// FindInterfaces returns all machine interfaces known by nico-core-api, keyed by MAC address
Expand Down
5 changes: 5 additions & 0 deletions flow/internal/nicoapi/nicoproto/nico.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3976,6 +3976,7 @@ message ManagedHostNetworkConfigResponse {

// Route imports and tagging details for exports
// used by FNN configs.
// Deprecated: use FlatInterfaceConfig.routing_profile for per-VPC routing.
// NOTE: This will replace internet_l3_vni and common_internal_route_target but could allow
// common_internal_route_target to be renamed/repurposed as a site-level RT.
// to become a site-level common route target.
Expand Down Expand Up @@ -4200,6 +4201,10 @@ message FlatInterfaceConfig {
// IPv6 configuration for dual-stack FNN interfaces.
optional FlatInterfaceIpv6Config ipv6_interface_config = 19;

// Route imports and tagging details for exports used by FNN configs.
// This is scoped to the VPC that owns this interface.
optional RoutingProfile routing_profile = 20;
Comment on lines +4204 to +4206
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major | ⚡ Quick win

Define precedence when both routing profile fields are present.

Line 4206 adds per-interface routing_profile, while the legacy response-level field at Line 3985 still exists. The contract should explicitly state precedence/fallback behavior to avoid divergent client behavior when both are populated.

Suggested comment-level contract clarification
-  // Route imports and tagging details for exports used by FNN configs.
-  // This is scoped to the VPC that owns this interface.
+  // Route imports and tagging details for exports used by FNN configs.
+  // This is scoped to the VPC that owns this interface.
+  // Precedence: when set, this field overrides ManagedHostNetworkConfigResponse.routing_profile.
+  // Fallback: use ManagedHostNetworkConfigResponse.routing_profile only when this field is unset.
   optional RoutingProfile routing_profile = 20;

As per coding guidelines: **/*.proto: Review the Protobuf definitions, point out issues relative to compatibility, and expressiveness.

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Route imports and tagging details for exports used by FNN configs.
// This is scoped to the VPC that owns this interface.
optional RoutingProfile routing_profile = 20;
// Route imports and tagging details for exports used by FNN configs.
// This is scoped to the VPC that owns this interface.
// Precedence: when set, this field overrides ManagedHostNetworkConfigResponse.routing_profile.
// Fallback: use ManagedHostNetworkConfigResponse.routing_profile only when this field is unset.
optional RoutingProfile routing_profile = 20;
🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@flow/internal/nicoapi/nicoproto/nico.proto` around lines 4204 - 4206, The
proto lacks a clear precedence rule when both the per-interface RoutingProfile
(routing_profile on the interface message) and the legacy response-level
routing_profile are set; update the comments to state that the interface-level
routing_profile takes precedence and is used for routing decisions, and that if
it is unset the implementation should fall back to the legacy response-level
routing_profile; apply this clarification to the doc comment on the interface's
routing_profile field and to the legacy response-level routing_profile comment
(and optionally mark the legacy field as deprecated with the deprecation option
if intended).


// The details of the network security group associated with
// either the instance or its parent VPC.
// Currently, source would either be INSTANCE or VPC.
Expand Down
24 changes: 22 additions & 2 deletions flow/internal/scheduler/jobs/leakdetection/leakdetection.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,17 +48,37 @@ func runLeakDetectionOne(
for _, machineID := range leakingMachineIds {
log.Info().Msgf("Leaking machine ID: %s, submitting force power-off task", machineID)

if err := submitPowerOffTask(ctx, taskMgr, machineID); err != nil {
err := submitPowerOffTask(ctx, taskMgr, machineID, devicetypes.ComponentTypeCompute)
if err != nil {
log.Error().Err(err).Str("machine_id", machineID).
Msg("Failed to submit power-off task for leaking machine")
}
}

leakingSwitchIds, err := nicoClient.GetLeakingSwitchIds(ctx)
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If GetLeakingSwitchIds fails after machines were processed, we return without attempting switches. That's reasonable, but machine remediation already ran.

Is it as expectation?

if err != nil {
log.Error().Err(err).Msg("Unable to retrieve leaking switch IDs from NICo")
return
}

log.Info().Msgf("Found %d leaking switch IDs", len(leakingSwitchIds))

for _, switchID := range leakingSwitchIds {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit:
We have TestRunLeakDetectionOne_SubmitsTaskPerMachine but nothing for switches.

We can add e.g. TestRunLeakDetectionOne_SubmitsTaskPerSwitch:

  • nicoClient.SetLeakingSwitchIds([]string{"switch-1", "switch-2"}) (needs setter on Client — see mock comment)
  • run runLeakDetectionOne
  • assert 2 requests with External.Type == ComponentTypeNVSwitch and External.ID matching the switch IDs

log.Info().Msgf("Leaking switch ID: %s, submitting force power-off task", switchID)

err := submitPowerOffTask(ctx, taskMgr, switchID, devicetypes.ComponentTypeNVSwitch)
if err != nil {
log.Error().Err(err).Str("switch_id", switchID).
Msg("Failed to submit power-off task for leaking switch")
}
Comment thread
coderabbitai[bot] marked this conversation as resolved.
}
}

func submitPowerOffTask(
ctx context.Context,
taskMgr taskmanager.Manager,
machineID string,
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now that this helper serves both compute and NVSwitch, the parameter name machineID is misleading. Consider renaming to externalComponentID or componentExternalID.

componentType devicetypes.ComponentType,
) error {
info := &operations.PowerControlTaskInfo{
Operation: operations.PowerOperationForcePowerOff,
Expand All @@ -80,7 +100,7 @@ func submitPowerOffTask(
Components: []operation.ComponentTarget{
{
External: &operation.ExternalRef{
Type: devicetypes.ComponentTypeCompute,
Type: componentType,
ID: machineID,
},
},
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func TestSubmitPowerOffTask_Success(t *testing.T) {
mgr := &mockManager{}
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests were updated for ComponentTypeCompute on submitPowerOffTask, but there is no coverage for:

  • ComponentTypeNVSwitch in TestSubmitPowerOffTask_Success
  • runLeakDetectionOne with leaking switches

Please extend the existing tests.

machineID := "machine-abc-123"

err := submitPowerOffTask(ctx, mgr, machineID)
err := submitPowerOffTask(ctx, mgr, machineID, devicetypes.ComponentTypeCompute)
require.NoError(t, err)
require.Len(t, mgr.requests, 1)

Expand All @@ -92,7 +92,7 @@ func TestSubmitPowerOffTask_NoTasksCreated(t *testing.T) {
ctx := context.Background()
mgr := &mockManager{returnNoTaskID: true}

err := submitPowerOffTask(ctx, mgr, "machine-xyz")
err := submitPowerOffTask(ctx, mgr, "machine-xyz", devicetypes.ComponentTypeCompute)
require.Error(t, err)
assert.Contains(t, err.Error(), "failed to create any power-off tasks")
}
Expand All @@ -101,7 +101,7 @@ func TestSubmitPowerOffTask_SubmitError(t *testing.T) {
ctx := context.Background()
mgr := &mockManager{submitErr: errors.New("submit failed")}

err := submitPowerOffTask(ctx, mgr, "machine-xyz")
err := submitPowerOffTask(ctx, mgr, "machine-xyz", devicetypes.ComponentTypeCompute)
require.Error(t, err)
assert.Contains(t, err.Error(), "submit failed")
}
Expand Down
Loading