-
Notifications
You must be signed in to change notification settings - Fork 149
qbft: clarify round-changes #2835
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: stage
Are you sure you want to change the base?
Changes from all commits
f977043
bcd0593
5ff4c09
8db85bb
82c4b6b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -11,7 +11,9 @@ import ( | |
| spectypes "github.com/ssvlabs/ssv-spec/types" | ||
| "go.uber.org/zap" | ||
|
|
||
| "github.com/ssvlabs/ssv/observability" | ||
| "github.com/ssvlabs/ssv/observability/log/fields" | ||
| "github.com/ssvlabs/ssv/protocol/v2/qbft" | ||
| ssvtypes "github.com/ssvlabs/ssv/protocol/v2/types" | ||
| ) | ||
|
|
||
|
|
@@ -22,9 +24,8 @@ func (i *Instance) uponRoundChange( | |
| logger *zap.Logger, | ||
| msg *specqbft.ProcessingMessage, | ||
| ) error { | ||
| prevRound := i.State.Round | ||
| logger = logger.With( | ||
| zap.Uint64("qbft_instance_round", uint64(prevRound)), | ||
| zap.Uint64("qbft_instance_round", uint64(i.State.Round)), | ||
| zap.Uint64("qbft_instance_height", uint64(i.State.Height)), | ||
| ) | ||
|
|
||
|
|
@@ -53,6 +54,19 @@ func (i *Instance) uponRoundChange( | |
| } | ||
|
|
||
| if justifiedRoundChangeMsg != nil { | ||
| i.metrics.EndStage(ctx, i.State.Round) | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we consider only advancing the stage after the broadcast actually succeeds?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Keeping consistent with the "best-effort broadcast" framing this PR uses throughout: the protocol stage advances when we move on (we've already built the new message and the state machine has progressed); broadcast failures are tracked independently. Gating stage advancement on a successful broadcast would create a different inconsistency — on broadcast failure the instance would still be in the new round but the dashboard would show the old stage. |
||
| i.metrics.StartStage(stageProposal) | ||
|
|
||
| // hasReceivedProposalJustificationForLeadingRound accepts future-round quorums, but CreateProposal | ||
| // uses i.State.Round as the proposal round and MessagesForRound(i.State.Round) selects the | ||
| // round-change justifications - so bump to the justified round here before building the proposal. | ||
| // | ||
| // In practice this branch never observes a future-round quorum: on natural message flow, the | ||
| // partial-quorum branch (f+1 < 2f+1) fires on an earlier uponRoundChange call and advances | ||
| // State.Round before this branch can see a full quorum, so bumpToRound here is always a no-op. | ||
| // Kept as defense-in-depth in case the message-processing flow changes. | ||
| i.bumpToRound(justifiedRoundChangeMsg.QBFTMessage.Round) | ||
|
|
||
| roundChangeJustificationSignedMessages, _ := justifiedRoundChangeMsg.QBFTMessage.GetRoundChangeJustifications() // no need to check error, check on isValidRoundChange | ||
|
|
||
| roundChangeJustification := make([]*specqbft.ProcessingMessage, 0) | ||
|
|
@@ -66,68 +80,60 @@ func (i *Instance) uponRoundChange( | |
|
|
||
| proposal, err := i.CreateProposal( | ||
| valueToPropose, | ||
| i.State.RoundChangeContainer.MessagesForRound(prevRound), // TODO - might be optimized to include only necessary quorum | ||
| i.State.RoundChangeContainer.MessagesForRound(i.State.Round), // TODO - might be optimized to include only necessary quorum | ||
|
julienh-ssv marked this conversation as resolved.
|
||
| roundChangeJustification, | ||
| ) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to create proposal: %w", err) | ||
| } | ||
|
|
||
| valueToProposeRoot, err := specqbft.HashDataRoot(valueToPropose) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to hash value-to-propose: %w", err) | ||
| } | ||
| valueToProposeRoot := qbft.HashDataRoot(valueToPropose) | ||
| logger = logger.With(zap.String("qbft_value_to_propose_root", hex.EncodeToString(valueToProposeRoot[:]))) | ||
|
|
||
| i.metrics.EndStage(ctx, prevRound) | ||
| i.metrics.StartStage(stageProposal) | ||
|
|
||
| i.metrics.RecordRoundChange(ctx, prevRound, reasonJustified) | ||
|
|
||
| logger.Debug("🔄 got justified round change, leader broadcasting proposal message", | ||
| zap.Any("round_change_signers", allSigners(i.State.RoundChangeContainer.MessagesForRound(prevRound))), | ||
| zap.Any("round_change_signers", allSigners(i.State.RoundChangeContainer.MessagesForRound(i.State.Round))), | ||
| ) | ||
|
|
||
| if err := i.Broadcast(proposal); err != nil { | ||
| return fmt.Errorf("failed to broadcast proposal message: %w", err) | ||
| } | ||
| } else if partialQuorum, rcs := i.hasReceivedPartialQuorum(); partialQuorum { | ||
| newRound := minRound(rcs) | ||
| if newRound <= prevRound { | ||
| if newRound <= i.State.Round { | ||
| // No need to advance round, we've already changed it. | ||
| return nil | ||
| } | ||
|
|
||
| i.metrics.EndStage(ctx, prevRound) | ||
| i.metrics.StartStage(stageRoundChange) | ||
|
|
||
| i.metrics.RecordRoundChange(ctx, prevRound, reasonPartialQuorum) | ||
|
|
||
| err := i.uponChangeRoundPartialQuorum(logger, newRound) | ||
| if err != nil { | ||
| if err := i.uponChangeRoundPartialQuorum(ctx, logger, newRound); err != nil { | ||
| return err | ||
| } | ||
| } | ||
| return nil | ||
| } | ||
|
|
||
| func (i *Instance) uponChangeRoundPartialQuorum(logger *zap.Logger, newRound specqbft.Round) error { | ||
| func (i *Instance) uponChangeRoundPartialQuorum(ctx context.Context, logger *zap.Logger, newRound specqbft.Round) error { | ||
| ctx, span := tracer.Start(ctx, observability.InstrumentName(observabilityNamespace, "qbft.instance.change_round_partial_quorum")) | ||
| defer span.End() | ||
|
|
||
| prevRound := i.State.Round | ||
|
|
||
| i.metrics.EndStage(ctx, prevRound) | ||
| i.metrics.StartStage(stageRoundChange) | ||
| i.metrics.RecordRoundChange(ctx, prevRound, reasonPartialQuorum) | ||
|
|
||
| // Always move on to the next round. The round-change message broadcast is a best-effort thing, the QBFT | ||
| // cluster as a whole can progress further even if our round-change message cannot be created/broadcast | ||
| // for whatever reason. | ||
| i.bumpToRound(newRound) | ||
| i.State.ProposalAcceptedForCurrentRound = nil | ||
|
|
||
| i.roundTimer.TimeoutForRound(newRound) | ||
| startValueRoot := qbft.HashDataRoot(i.StartValue) | ||
| logger = logger.With(zap.String("qbft_start_value_root", hex.EncodeToString(startValueRoot[:]))) | ||
|
|
||
| roundChange, err := i.CreateRoundChange(newRound) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to create round change message: %w", err) | ||
| } | ||
|
|
||
| startValueRoot, err := specqbft.HashDataRoot(i.StartValue) | ||
| if err != nil { | ||
| return fmt.Errorf("failed to hash instance start value: %w", err) | ||
| } | ||
| logger = logger.With(zap.String("qbft_start_value_root", hex.EncodeToString(startValueRoot[:]))) | ||
|
|
||
| logger.Debug("📢 broadcasting round change message (got partial quorum)", | ||
| zap.Uint64("qbft_new_round", uint64(newRound)), | ||
| zap.Any("round_change_signers", roundChange.OperatorIDs), | ||
|
|
@@ -282,10 +288,7 @@ func (i *Instance) validRoundChangeForDataIgnoreSignature( | |
| // Addition to formal spec | ||
| // We add this extra tests on the msg itself to filter round change msgs with invalid justifications, before they are inserted into msg containers | ||
| if msg.QBFTMessage.RoundChangePrepared() { | ||
| r, err := specqbft.HashDataRoot(fullData) | ||
| if err != nil { | ||
| return fmt.Errorf("could not hash input data: %w", err) | ||
| } | ||
| r := qbft.HashDataRoot(fullData) | ||
|
|
||
| // validate prepare message justifications | ||
| prepareSignedMsgs, _ := msg.QBFTMessage.GetRoundChangeJustifications() // no need to check error, checked on msg.QBFTMessage.Validate() | ||
|
|
@@ -384,10 +387,7 @@ func (i *Instance) getRoundChangeData() (specqbft.Round, [32]byte, []byte, []*sp | |
| return specqbft.NoRound, [32]byte{}, nil, nil, fmt.Errorf("could not get round change justification: %w", err) | ||
| } | ||
|
|
||
| r, err := specqbft.HashDataRoot(i.State.LastPreparedValue) | ||
| if err != nil { | ||
| return specqbft.NoRound, [32]byte{}, nil, nil, fmt.Errorf("could not hash input data: %w", err) | ||
| } | ||
| r := qbft.HashDataRoot(i.State.LastPreparedValue) | ||
|
|
||
| return i.State.LastPreparedRound, r, i.State.LastPreparedValue, justifications, nil | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think about keeping
reasonJustified? Once the leader-path round-bump issue (open thread onround_change.go) is addressed and a real round change happens in the justified path, there will be no labeled metric for it. Operators currently can distinguish 'round changed because of justified RC quorum' from 'timeout' / 'partial-quorum' — losing that makes it harder to diagnose why a cluster keeps round-changing.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fair concern. After digging in, this metric was effectively double-counting in pre-PR code: by the time the leader-on-justified-quorum branch fired,
State.Roundhad already been advanced (and recorded as a round change) viareasonTimeoutorreasonPartialQuorum. The reasoning:RoundChangeContaineris populated only insideuponRoundChange:34— single call site (grep-verified).uponRoundChange, partial-quorum is checked afterAddFirstMsgForSignerAndRoundon every call.processMsgF.Run()serializes processing.So as RCs stream in, the (f+1)-th always triggers
uponChangeRoundPartialQuorum→ bumpsState.Round— before the (2f+1)-th arrives and the leader branch fires. By the time the leader branch executes,bumpToRound(newRound)is always a no-op (State.Round == newRoundalready). No real round change happens in this branch.Soft confirmation: if this future-round leader case were a real scenario in normal flow, ssv-spec tests would have a scenario for it — they don't. Leaving the metric removed.
(Julien's
bumpToRoundfix in 7ddb771 is still worth keeping for defense-in-depth against future refactors, but the metric would just be a perpetual zero.)