diff --git a/.config/nextest.toml b/.config/nextest.toml index 7caa4a35e28..db99a2e3f95 100644 --- a/.config/nextest.toml +++ b/.config/nextest.toml @@ -17,7 +17,7 @@ filter = "package(solana-zk-elgamal-proof-program-tests) & test(/^test_batched_r threads-required = "num-cpus" [[profile.ci.overrides]] -filter = "package(solana-turbine) | package(solana-gossip) | package(solana-perf)" +filter = "package(solana-turbine) | package(solana-gossip) | package(solana-perf) | package(solana-quic-datagram)" retries = 0 [[profile.ci.overrides]] diff --git a/Cargo.lock b/Cargo.lock index 605ef93fe84..2d2aa104b6d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -436,10 +436,13 @@ dependencies = [ "agave-votor", "agave-votor-messages", "bitvec", + "bytes", + "clap 4.6.1", "crossbeam-channel", "itertools 0.14.0", "lazy-lru", "log", + "num_cpus", "parking_lot 0.12.3", "qualifier_attr", "rand 0.9.4", @@ -464,6 +467,7 @@ dependencies = [ "solana-net-utils", "solana-perf", "solana-pubkey 4.2.0", + "solana-quic-datagram", "solana-rpc", "solana-runtime", "solana-sdk-ids", @@ -472,6 +476,7 @@ dependencies = [ "solana-signer-store", "solana-streamer", "solana-time-utils", + "solana-tls-utils", "solana-transaction", "solana-transaction-error", "solana-vote", @@ -479,6 +484,7 @@ dependencies = [ "tempfile", "test-case", "thiserror 2.0.18", + "tokio", "tokio-util 0.7.18", "wincode", ] @@ -639,21 +645,6 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" -[[package]] -name = "anstream" -version = "0.6.18" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" -dependencies = [ - "anstyle", - "anstyle-parse 0.2.6", - "anstyle-query", - "anstyle-wincon", - "colorchoice", - "is_terminal_polyfill", - "utf8parse", -] - [[package]] name = "anstream" version = "1.0.0" @@ -661,7 +652,7 @@ source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "824a212faf96e9acacdbd09febd34438f8f711fb84e09a8916013cd7815ca28d" dependencies = [ "anstyle", - "anstyle-parse 1.0.0", + "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", @@ -675,15 +666,6 @@ version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "940b3a0ca603d1eade50a4846a2afffd5ef57a9feac2c0e2ec2e14f9ead76000" -[[package]] -name = "anstyle-parse" -version = "0.2.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b2d16507662817a6a20a9ea92df6652ee4f94f914589377d69f3b21bc5798a9" -dependencies = [ - "utf8parse", -] - [[package]] name = "anstyle-parse" version = "1.0.0" @@ -1804,9 +1786,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.31" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "027bb0d98429ae334a8698531da7077bdf906419543a35a55c2cb1b66437d767" +checksum = "1ddb117e43bbf7dacf0a4190fef4d345b9bad68dfc649cb349e7d17d28428e51" dependencies = [ "clap_builder", "clap_derive", @@ -1814,21 +1796,21 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.31" +version = "4.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5589e0cba072e0f3d23791efac0fd8627b49c829c196a492e88168e6a669d863" +checksum = "714a53001bf66416adb0e2ef5ac857140e7dc3a0c48fb28b2f10762fc4b5069f" dependencies = [ - "anstream 0.6.18", + "anstream", "anstyle", - "clap_lex 0.7.4", + "clap_lex 1.1.0", "strsim 0.11.1", ] [[package]] name = "clap_derive" -version = "4.5.28" +version = "4.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf4ced95c6f4a675af3da73304b9ac4ed991640c36374e4b46795c49e17cf1ed" +checksum = "f2ce8604710f6733aa641a2b3731eaa1e8b3d9973d5e3565da11800813f997a9" dependencies = [ "heck", "proc-macro2", @@ -1847,9 +1829,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.4" +version = 
"1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f46ad14479a25103f283c0f10005961cf086d8dc42205bb44c46ac563475dca6" +checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" [[package]] name = "cmov" @@ -2051,7 +2033,7 @@ dependencies = [ "anes", "cast", "ciborium", - "clap 4.5.31", + "clap 4.6.1", "criterion-plot", "itertools 0.13.0", "num-traits", @@ -2753,7 +2735,7 @@ version = "0.11.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0621c04f2196ac3f488dd583365b9c09be011a4ab8b9f37248ffcc8f6198b56a" dependencies = [ - "anstream 1.0.0", + "anstream", "anstyle", "env_filter", "jiff", @@ -7854,6 +7836,7 @@ dependencies = [ "solana-poh-config", "solana-program-binaries", "solana-pubkey 4.2.0", + "solana-quic-datagram", "solana-rayon-threadlimit", "solana-rent", "solana-rpc", @@ -9599,6 +9582,30 @@ dependencies = [ "tokio-util 0.7.18", ] +[[package]] +name = "solana-quic-datagram" +version = "4.2.0-alpha.0" +dependencies = [ + "arc-swap", + "arrayvec", + "bytes", + "crossbeam-channel", + "dashmap", + "futures 0.3.32", + "log", + "quinn", + "quinn-proto", + "rustls", + "solana-keypair", + "solana-metrics", + "solana-net-utils", + "solana-pubkey 4.2.0", + "solana-quic-datagram", + "solana-tls-utils", + "thiserror 2.0.18", + "tokio", +] + [[package]] name = "solana-rayon-threadlimit" version = "4.2.0-alpha.0" @@ -10514,7 +10521,7 @@ dependencies = [ "assert_matches", "bytes", "chrono", - "clap 4.5.31", + "clap 4.6.1", "crossbeam-channel", "futures 0.3.32", "histogram", diff --git a/Cargo.toml b/Cargo.toml index a1744628870..57f7e6c4849 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -72,6 +72,7 @@ members = [ "programs/zk-token-proof", "pubsub-client", "quic-client", + "quic-datagram", "random", "rayon-threadlimit", "rbpf-cli", @@ -308,6 +309,7 @@ protobuf-src = "1.1.0" protosol = "=8.0.0" qualifier_attr = { version = "0.2.2", default-features = false } quinn = "0.11.9" +quinn-proto = 
"0.11.14" rand = "0.9.4" rand_chacha = "0.9.0" rayon = "1.12.0" @@ -446,6 +448,7 @@ solana-program-test = { path = "program-test", version = "=4.2.0-alpha.0", featu solana-pubkey = { version = "4.2.0", default-features = false } solana-pubsub-client = { path = "pubsub-client", version = "=4.2.0-alpha.0" } solana-quic-client = { path = "quic-client", version = "=4.2.0-alpha.0", features = ["agave-unstable-api"] } +solana-quic-datagram = { path = "quic-datagram", version = "=4.2.0-alpha.0", features = ["agave-unstable-api"] } solana-rayon-threadlimit = { path = "rayon-threadlimit", version = "=4.2.0-alpha.0", features = ["agave-unstable-api"] } solana-remote-wallet = { path = "remote-wallet", version = "=4.2.0-alpha.0", default-features = false, features = ["agave-unstable-api"] } solana-rent = "4.2.0" diff --git a/core/Cargo.toml b/core/Cargo.toml index bf171f15265..b22e8d93d25 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -129,6 +129,7 @@ solana-perf = { workspace = true } solana-poh = { workspace = true } solana-poh-config = { workspace = true } solana-pubkey = { workspace = true } +solana-quic-datagram = { workspace = true } solana-rayon-threadlimit = { workspace = true } solana-rpc = { workspace = true } solana-rpc-client-api = { workspace = true } diff --git a/core/src/admin_rpc_post_init.rs b/core/src/admin_rpc_post_init.rs index 79259fc625c..e520fd96634 100644 --- a/core/src/admin_rpc_post_init.rs +++ b/core/src/admin_rpc_post_init.rs @@ -34,6 +34,8 @@ pub enum KeyUpdaterType { Bls, /// BLS all-to-all connection cache key updater BlsConnectionCache, + /// Votor QUIC datagram endpoint key updater + VotorDatagram, } /// Responsible for managing the updaters for identity key change diff --git a/core/src/bls_sigverify/bls_cert_sigverify.rs b/core/src/bls_sigverify/bls_cert_sigverify.rs index 2dc83daf07c..55b3a1b9f0f 100644 --- a/core/src/bls_sigverify/bls_cert_sigverify.rs +++ b/core/src/bls_sigverify/bls_cert_sigverify.rs @@ -1,5 +1,9 @@ use { - 
super::{bls_sigverifier::BAN_TIMEOUT, errors::SigVerifyCertError, stats::SigVerifyCertStats}, + super::{ + bls_sigverifier::{BAN_TIMEOUT, SigVerifierBanlist}, + errors::SigVerifyCertError, + stats::SigVerifyCertStats, + }, crate::bls_sigverify::{bls_sigverifier::NUM_SLOTS_FOR_VERIFY, utils::send_certs_to_pool}, agave_bls_cert_verify::cert_verify::Error as BlsCertVerifyError, agave_votor_messages::{ @@ -16,7 +20,6 @@ use { solana_measure::measure::Measure, solana_pubkey::Pubkey, solana_runtime::bank::Bank, - solana_streamer::nonblocking::simple_qos::SimpleQosBanlist, std::{collections::HashSet, num::NonZeroU64}, thiserror::Error, }; @@ -53,7 +56,7 @@ pub(super) fn verify_and_send_certificates( certs: Vec, root_bank: &Bank, channel_to_pool: &Sender>, - banlist: &SimpleQosBanlist, + banlist: &dyn SigVerifierBanlist, thread_pool: &ThreadPool, ) -> Result { for cert in certs.iter().map(|cert_payload| &cert_payload.cert) { @@ -95,7 +98,7 @@ fn verify_certs( root_bank: &Bank, verified_certs_set: &mut HashSet, stats: &mut SigVerifyCertStats, - banlist: &SimpleQosBanlist, + banlist: &dyn SigVerifierBanlist, thread_pool: &ThreadPool, ) -> Vec { let verified = thread_pool.install(|| { diff --git a/core/src/bls_sigverify/bls_sigverifier.rs b/core/src/bls_sigverify/bls_sigverifier.rs index 129c8dd8980..81b2d8a51b8 100644 --- a/core/src/bls_sigverify/bls_sigverifier.rs +++ b/core/src/bls_sigverify/bls_sigverifier.rs @@ -27,6 +27,7 @@ use { solana_ledger::leader_schedule_cache::LeaderScheduleCache, solana_measure::measure_us, solana_pubkey::Pubkey, + solana_quic_datagram::{Banlist, endpoint::Datagram}, solana_runtime::{bank::Bank, bank_forks::SharableBanks}, solana_streamer::{nonblocking::simple_qos::SimpleQosBanlist, packet::PacketBatch}, std::{ @@ -50,9 +51,25 @@ pub(super) const NUM_SLOTS_FOR_VERIFY: Slot = 90_000; /// We ban the sender for 2 days which roughly corresponds to an epoch pub(super) const BAN_TIMEOUT: Duration = Duration::from_hours(48); +pub(crate) trait 
SigVerifierBanlist: Send + Sync { + fn ban(&self, pubkey: Pubkey, timeout: Duration) -> bool; +} + +impl SigVerifierBanlist for SimpleQosBanlist { + fn ban(&self, pubkey: Pubkey, timeout: Duration) -> bool { + SimpleQosBanlist::ban(self, pubkey, timeout) + } +} + +impl SigVerifierBanlist for Banlist { + fn ban(&self, pubkey: Pubkey, timeout: Duration) -> bool { + Banlist::ban(self, pubkey, timeout) + } +} + pub(crate) struct SigVerifierContext { pub(crate) migration_status: Arc, - pub(crate) banlist: Arc, + pub(crate) banlist: Arc, pub(crate) sharable_banks: SharableBanks, pub(crate) cluster_info: Arc, pub(crate) leader_schedule: Arc, @@ -60,8 +77,13 @@ pub(crate) struct SigVerifierContext { pub(crate) generated_cert_types: Arc, } +pub(crate) enum SigVerifierInputReceiver { + PacketBatches(Receiver), + Datagrams(Receiver), +} + pub(crate) struct SigVerifierChannels { - pub(crate) packet_receiver: Receiver, + pub(crate) input_receiver: SigVerifierInputReceiver, pub(crate) channel_to_repair: VerifiedVoterSlotsSender, pub(crate) channel_to_reward: Sender, pub(crate) channel_to_pool: Sender>, @@ -84,7 +106,7 @@ pub(crate) fn spawn_service( struct SigVerifier { migration_status: Arc, - banlist: Arc, + banlist: Arc, channels: SigVerifierChannels, /// Container to look up root banks from. 
sharable_banks: SharableBanks, @@ -100,6 +122,11 @@ struct SigVerifier { generated_cert_types: Arc, } +struct SigverifyItem { + remote_pubkey: Option, + bytes: Vec, +} + impl SigVerifier { fn new(context: SigVerifierContext, channels: SigVerifierChannels) -> Self { let SigVerifierContext { @@ -135,15 +162,16 @@ impl SigVerifier { fn run(mut self, exit: Arc) { while !exit.load(Ordering::Relaxed) { const SOFT_RECEIVE_CAP: usize = 5000; - let Ok(batches) = recv_batches(&self.channels.packet_receiver, SOFT_RECEIVE_CAP) else { - error!("packet_receiver disconnected: Exiting."); + let Ok(input) = recv_input(&self.channels.input_receiver, SOFT_RECEIVE_CAP) else { + error!("sigverify input receiver disconnected: Exiting."); break; }; - if batches.is_empty() || self.migration_status.is_pre_feature_activation() { + if input.is_empty() || self.migration_status.is_pre_feature_activation() { continue; } + let items = self.input_to_sigverify_items(input); - let (verify_res, verify_time_us) = measure_us!(self.verify_and_send_batches(batches)); + let (verify_res, verify_time_us) = measure_us!(self.verify_and_send_items(items)); self.stats .verify_and_send_batch_us .add_sample(verify_time_us); @@ -156,12 +184,18 @@ impl SigVerifier { self.stats.do_report(self.sharable_banks.root().slot()); } + #[cfg(test)] fn verify_and_send_batches(&mut self, batches: Vec) -> Result<(), SigVerifyError> { + let items = self.packet_batches_to_sigverify_items(batches); + self.verify_and_send_items(items) + } + + fn verify_and_send_items(&mut self, items: Vec) -> Result<(), SigVerifyError> { let root_bank = self.sharable_banks.root(); self.maybe_prune_caches(root_bank.slot()); let ((certs_to_verify, votes_to_verify), extract_msgs_us) = - measure_us!(self.extract_and_filter_msgs(batches, &root_bank)); + measure_us!(self.extract_and_filter_sigverify_items(items, &root_bank)); self.stats .extract_filter_msgs_us .add_sample(extract_msgs_us); @@ -173,7 +207,7 @@ impl SigVerifier { &root_bank, 
&self.cluster_info, &self.leader_schedule, - &self.banlist, + self.banlist.as_ref(), &self.thread_pool, &self.channels, ) @@ -184,7 +218,7 @@ impl SigVerifier { certs_to_verify, &root_bank, &self.channels.channel_to_pool, - &self.banlist, + self.banlist.as_ref(), &self.thread_pool, ) }, @@ -205,14 +239,11 @@ impl SigVerifier { } } - fn extract_and_filter_msgs( + fn packet_batches_to_sigverify_items( &mut self, batches: Vec, - root_bank: &Bank, - ) -> (Vec, Vec) { - let root_slot = root_bank.slot(); - let mut certs = Vec::new(); - let mut votes = Vec::new(); + ) -> Vec { + let mut items = Vec::new(); let mut num_pkts = 0u64; for packet in batches.iter().flatten() { num_pkts = num_pkts.saturating_add(1); @@ -220,11 +251,65 @@ impl SigVerifier { self.stats.num_discarded_pkts += 1; continue; } - let Ok(msg) = packet.deserialize_slice::(..) else { + let Some(bytes) = packet.data(..) else { + self.stats.num_malformed_pkts += 1; + continue; + }; + items.push(SigverifyItem { + remote_pubkey: packet.meta().remote_pubkey(), + bytes: bytes.to_vec(), + }); + } + self.stats.num_pkts.add_sample(num_pkts); + items + } + + fn datagrams_to_sigverify_items(&mut self, datagrams: Vec) -> Vec { + let mut items = Vec::with_capacity(datagrams.len()); + let mut num_pkts = 0u64; + for Datagram { + peer_pubkey, + message, + .. 
+ } in datagrams + { + num_pkts = num_pkts.saturating_add(1); + items.push(SigverifyItem { + remote_pubkey: Some(peer_pubkey), + bytes: message.to_vec(), + }); + } + self.stats.num_pkts.add_sample(num_pkts); + items + } + + fn input_to_sigverify_items(&mut self, input: SigVerifierInput) -> Vec { + match input { + SigVerifierInput::PacketBatches(batches) => { + self.packet_batches_to_sigverify_items(batches) + } + SigVerifierInput::Datagrams(datagrams) => self.datagrams_to_sigverify_items(datagrams), + } + } + + fn extract_and_filter_sigverify_items( + &mut self, + items: Vec, + root_bank: &Bank, + ) -> (Vec, Vec) { + let root_slot = root_bank.slot(); + let mut certs = Vec::new(); + let mut votes = Vec::new(); + for SigverifyItem { + remote_pubkey, + bytes, + } in items + { + let Ok(msg) = wincode::deserialize_exact::(&bytes) else { self.stats.num_malformed_pkts += 1; continue; }; - let Some(remote_pubkey) = packet.meta().remote_pubkey() else { + let Some(remote_pubkey) = remote_pubkey else { debug_assert!(false, "BLS packet missing remote pubkey"); self.stats.num_malformed_pkts += 1; continue; @@ -261,7 +346,6 @@ impl SigVerifier { } } } - self.stats.num_pkts.add_sample(num_pkts); (certs, votes) } @@ -294,15 +378,40 @@ impl SigVerifier { } } -/// Receives a `Vec` from the `receiver` while adhering to the `soft_receive_cap` limit. 
+enum SigVerifierInput { + PacketBatches(Vec), + Datagrams(Vec), +} + +impl SigVerifierInput { + fn is_empty(&self) -> bool { + match self { + Self::PacketBatches(batches) => batches.is_empty(), + Self::Datagrams(datagrams) => datagrams.is_empty(), + } + } +} + +fn recv_input( + receiver: &SigVerifierInputReceiver, + soft_receive_cap: usize, +) -> Result { + match receiver { + SigVerifierInputReceiver::PacketBatches(receiver) => { + recv_items(receiver, soft_receive_cap).map(SigVerifierInput::PacketBatches) + } + SigVerifierInputReceiver::Datagrams(receiver) => { + recv_items(receiver, soft_receive_cap).map(SigVerifierInput::Datagrams) + } + } +} + +/// Receives items from `receiver` while adhering to the `soft_receive_cap` limit. /// /// Returns `Err(())` if the channel disconnected. -fn recv_batches( - receiver: &Receiver, - soft_receive_cap: usize, -) -> Result, ()> { - let batch = match receiver.recv_timeout(Duration::from_secs(1)) { - Ok(b) => b, +fn recv_items(receiver: &Receiver, soft_receive_cap: usize) -> Result, ()> { + let first = match receiver.recv_timeout(Duration::from_secs(1)) { + Ok(item) => item, Err(e) => match e { RecvTimeoutError::Timeout => { return Ok(vec![]); @@ -312,20 +421,20 @@ fn recv_batches( } }, }; - let mut batches = Vec::with_capacity(soft_receive_cap); - batches.push(batch); - while batches.len() < soft_receive_cap { + let mut items = Vec::with_capacity(soft_receive_cap); + items.push(first); + while items.len() < soft_receive_cap { match receiver.try_recv() { - Ok(b) => { - batches.push(b); + Ok(item) => { + items.push(item); } Err(e) => match e { - TryRecvError::Empty => return Ok(batches), + TryRecvError::Empty => return Ok(items), TryRecvError::Disconnected => return Err(()), }, } } - Ok(batches) + Ok(items) } #[cfg(test)] @@ -435,7 +544,7 @@ mod tests { generated_cert_types: generated_cert_types.clone(), }, SigVerifierChannels { - packet_receiver, + input_receiver: SigVerifierInputReceiver::PacketBatches(packet_receiver), 
channel_to_repair, channel_to_reward, channel_to_pool, @@ -1311,7 +1420,7 @@ mod tests { generated_cert_types: Arc::new(GeneratedCertTypes::default()), }, SigVerifierChannels { - packet_receiver, + input_receiver: SigVerifierInputReceiver::PacketBatches(packet_receiver), channel_to_repair: votes_for_repair_sender, channel_to_reward: reward_votes_sender, channel_to_pool: message_sender, diff --git a/core/src/bls_sigverify/bls_vote_sigverify.rs b/core/src/bls_sigverify/bls_vote_sigverify.rs index af298633217..a9a2323cac2 100644 --- a/core/src/bls_sigverify/bls_vote_sigverify.rs +++ b/core/src/bls_sigverify/bls_vote_sigverify.rs @@ -5,7 +5,9 @@ use { crate::{ block_creation_loop::rewards::msg_types::AddVoteMessage, bls_sigverify::{ - bls_sigverifier::{BAN_TIMEOUT, NUM_SLOTS_FOR_VERIFY, SigVerifierChannels}, + bls_sigverifier::{ + BAN_TIMEOUT, NUM_SLOTS_FOR_VERIFY, SigVerifierBanlist, SigVerifierChannels, + }, utils::{ send_votes_to_metrics, send_votes_to_pool, send_votes_to_repair, send_votes_to_rewards, @@ -32,7 +34,6 @@ use { solana_measure::{measure::Measure, measure_us}, solana_pubkey::Pubkey, solana_runtime::bank::Bank, - solana_streamer::nonblocking::simple_qos::SimpleQosBanlist, std::{collections::HashMap, sync::Arc}, }; @@ -76,7 +77,7 @@ pub(super) fn verify_and_send_votes( root_bank: &Bank, cluster_info: &ClusterInfo, leader_schedule: &LeaderScheduleCache, - banlist: &SimpleQosBanlist, + banlist: &dyn SigVerifierBanlist, thread_pool: &ThreadPool, channels: &SigVerifierChannels, ) -> Result { @@ -181,7 +182,7 @@ fn verify_votes( root_bank: &Bank, votes_to_verify: Vec, stats: &mut SigVerifyVoteStats, - banlist: &SimpleQosBanlist, + banlist: &dyn SigVerifierBanlist, thread_pool: &ThreadPool, ) -> Vec { // Filter votes too far in the future. 
diff --git a/core/src/tvu.rs b/core/src/tvu.rs index dd32fcd3bf0..f2cb30bc5d7 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -6,7 +6,9 @@ use { admin_rpc_post_init::{KeyUpdaterType, KeyUpdaters}, banking_trace::BankingTracer, block_creation_loop::{ReplayHighestFrozen, rewards::msg_types::AddVoteMessage}, - bls_sigverify::bls_sigverifier::{self, SigVerifierChannels, SigVerifierContext}, + bls_sigverify::bls_sigverifier::{ + self, SigVerifierChannels, SigVerifierContext, SigVerifierInputReceiver, + }, cluster_info_vote_listener::{ DuplicateConfirmedSlotsReceiver, GossipVerifiedVoteHashReceiver, VerifiedVoterSlotsReceiver, VerifiedVoterSlotsSender, VoteTracker, @@ -57,6 +59,7 @@ use { }, solana_poh::{poh_controller::PohController, poh_recorder::PohRecorder}, solana_pubkey::Pubkey, + solana_quic_datagram::{Banlist, StakedNodesAllowlist, endpoint::Datagram}, solana_rpc::{ max_slots::MaxSlots, optimistically_confirmed_bank_tracker::BankNotificationSenderConfig, rpc_subscriptions::RpcSubscriptions, slot_status_notifier::SlotStatusNotifier, @@ -117,7 +120,7 @@ pub struct Tvu { warm_quic_cache_service: Option, drop_bank_service: DropBankService, duplicate_shred_listener: DuplicateShredListener, - bls_sigverify_threads: Option<(JoinHandle<()>, JoinHandle<()>)>, + bls_sigverify_threads: Vec>, votor: Votor, commitment_service: AggregateCommitmentService, } @@ -163,6 +166,21 @@ impl Default for TvuConfig { } } +pub enum AlpenglowTransport { + QuicStream { + bls_connection_cache: Arc, + cancel: CancellationToken, + staked_nodes: Arc>, + key_notifiers: Arc>, + }, + QuicDatagram { + egress: tokio::sync::mpsc::Sender, + ingress: Receiver, + banlist: Arc>, + allowlist: Option>, + }, +} + /// Shared state from validator necessary to instantiate votor and related services pub struct AlpenglowInitializationState { // Shared with block creation loop @@ -179,13 +197,7 @@ pub struct AlpenglowInitializationState { pub votor_event_sender: VotorEventSender, pub votor_event_receiver: 
VotorEventReceiver, - // For BLS streamer setup - pub cancel: CancellationToken, - pub staked_nodes: Arc>, - pub key_notifiers: Arc>, - - // For BLS voting service - pub bls_connection_cache: Arc, + pub alpenglow_transport: AlpenglowTransport, pub voting_service_test_override: Option, } @@ -264,10 +276,7 @@ impl Tvu { bank_forks_controller_receiver, votor_event_sender, votor_event_receiver, - cancel, - staked_nodes, - key_notifiers, - bls_connection_cache, + alpenglow_transport, voting_service_test_override, highest_finalized, } = votor_init; @@ -279,72 +288,127 @@ impl Tvu { bounded(MAX_IN_FLIGHT_CONSENSUS_EVENTS); let generated_cert_types = Arc::new(GeneratedCertTypes::default()); - // The BLS socket is currently only available on Testnet and Development clusters. - // Closer to release we will enable this for all clusters. - let bls_sigverify_threads = if let Some(bls_socket) = bls_socket { - let (bls_packet_sender, bls_packet_receiver) = bounded(MAX_ALPENGLOW_PACKET_NUM); + enum BlsVotingTransport { + QuicStream(Arc), + QuicDatagram { + egress: tokio::sync::mpsc::Sender, + allowlist: Option>, + }, + } - let ( - SpawnServerResult { - endpoints: _, - thread: bls_streamer_t, - key_updater: bls_key_updater, - }, - banlist, - ) = { - let quic_server_params = QuicStreamerConfig { - num_threads: NonZeroUsize::new(4.min(num_cpus::get())).unwrap(), - ..Default::default() - }; - let qos_config = SimpleQosConfig { - max_streams_per_second: 30, - // Cap by # of active validators (some overhead for epoch boundaries) - max_staked_connections: MAX_ALPENGLOW_VOTE_ACCOUNTS * 2, - // Two staked connection per validator to account for hotspares - max_connections_per_peer: 2, + let (bls_sigverify_threads, bls_voting_transport) = match alpenglow_transport { + AlpenglowTransport::QuicStream { + bls_connection_cache, + cancel, + staked_nodes, + key_notifiers, + } => { + let threads = if let Some(bls_socket) = bls_socket { + let (bls_packet_sender, bls_packet_receiver) = + 
bounded(MAX_ALPENGLOW_PACKET_NUM); + + let ( + SpawnServerResult { + endpoints: _, + thread: bls_streamer_t, + key_updater: bls_key_updater, + }, + banlist, + ) = { + let quic_server_params = QuicStreamerConfig { + num_threads: NonZeroUsize::new(4.min(num_cpus::get())).unwrap(), + ..Default::default() + }; + let qos_config = SimpleQosConfig { + max_streams_per_second: 30, + // Cap by # of active validators (some overhead for epoch boundaries) + max_staked_connections: MAX_ALPENGLOW_VOTE_ACCOUNTS * 2, + // Two staked connection per validator to account for hotspares + max_connections_per_peer: 2, + }; + spawn_simple_qos_server( + "solQuicBLS", + "quic_streamer_bls", + vec![bls_socket.into()], + &cluster_info.keypair(), + bls_packet_sender, + staked_nodes, + quic_server_params, + qos_config, + cancel, + ) + .unwrap() + }; + + let sharable_banks = bank_forks.read().unwrap().sharable_banks(); + let bls_sigverifier_t = bls_sigverifier::spawn_service( + exit.clone(), + SigVerifierContext { + migration_status: migration_status.clone(), + banlist, + sharable_banks, + cluster_info: cluster_info.clone(), + leader_schedule: leader_schedule_cache.clone(), + num_threads: tvu_config.bls_sigverify_threads.get(), + generated_cert_types: generated_cert_types.clone(), + }, + SigVerifierChannels { + input_receiver: SigVerifierInputReceiver::PacketBatches( + bls_packet_receiver, + ), + channel_to_repair: verified_voter_slots_sender, + channel_to_reward: reward_votes_sender, + channel_to_pool: consensus_message_sender.clone(), + channel_to_metrics: consensus_metrics_sender.clone(), + }, + ); + + key_notifiers + .write() + .unwrap() + .add(KeyUpdaterType::Bls, bls_key_updater); + + vec![bls_streamer_t, bls_sigverifier_t] + } else { + Vec::new() }; - spawn_simple_qos_server( - "solQuicBLS", - "quic_streamer_bls", - vec![bls_socket.into()], - &cluster_info.keypair(), - bls_packet_sender, - staked_nodes, - quic_server_params, - qos_config, - cancel, + ( + threads, + 
BlsVotingTransport::QuicStream(bls_connection_cache), ) - .unwrap() - }; - - // sigverifier - let sharable_banks = bank_forks.read().unwrap().sharable_banks(); - let bls_sigverifier_t = bls_sigverifier::spawn_service( - exit.clone(), - SigVerifierContext { - migration_status: migration_status.clone(), - banlist, - sharable_banks, - cluster_info: cluster_info.clone(), - leader_schedule: leader_schedule_cache.clone(), - num_threads: tvu_config.bls_sigverify_threads.get(), - generated_cert_types: generated_cert_types.clone(), - }, - SigVerifierChannels { - packet_receiver: bls_packet_receiver, - channel_to_repair: verified_voter_slots_sender, - channel_to_reward: reward_votes_sender, - channel_to_pool: consensus_message_sender.clone(), - channel_to_metrics: consensus_metrics_sender.clone(), - }, - ); - - let mut key_notifiers = key_notifiers.write().unwrap(); - key_notifiers.add(KeyUpdaterType::Bls, bls_key_updater); - - Some((bls_streamer_t, bls_sigverifier_t)) - } else { - None + } + AlpenglowTransport::QuicDatagram { + egress, + ingress, + banlist, + allowlist, + } => { + let _ = bls_socket; + let sharable_banks = bank_forks.read().unwrap().sharable_banks(); + let bls_sigverifier_t = bls_sigverifier::spawn_service( + exit.clone(), + SigVerifierContext { + migration_status: migration_status.clone(), + banlist, + sharable_banks, + cluster_info: cluster_info.clone(), + leader_schedule: leader_schedule_cache.clone(), + num_threads: tvu_config.bls_sigverify_threads.get(), + generated_cert_types: generated_cert_types.clone(), + }, + SigVerifierChannels { + input_receiver: SigVerifierInputReceiver::Datagrams(ingress), + channel_to_repair: verified_voter_slots_sender, + channel_to_reward: reward_votes_sender, + channel_to_pool: consensus_message_sender.clone(), + channel_to_metrics: consensus_metrics_sender.clone(), + }, + ); + ( + vec![bls_sigverifier_t], + BlsVotingTransport::QuicDatagram { egress, allowlist }, + ) + } }; let (fetch_sender, fetch_receiver) = 
EvictingSender::new_bounded(SHRED_FETCH_CHANNEL_SIZE); @@ -591,14 +655,27 @@ impl Tvu { vote_connection_cache.clone(), ); - let bls_voting_service = BLSVotingService::new( - bls_receiver, - cluster_info.clone(), - vote_history_storage, - bls_connection_cache, - bank_forks.clone(), - voting_service_test_override, - ); + let bls_voting_service = match bls_voting_transport { + BlsVotingTransport::QuicStream(bls_connection_cache) => BLSVotingService::new( + bls_receiver, + cluster_info.clone(), + vote_history_storage, + bls_connection_cache, + bank_forks.clone(), + voting_service_test_override, + ), + BlsVotingTransport::QuicDatagram { egress, allowlist } => { + BLSVotingService::new_datagram( + bls_receiver, + cluster_info.clone(), + vote_history_storage, + egress, + allowlist, + bank_forks.clone(), + voting_service_test_override, + ) + } + }; let warm_quic_cache_service = create_cache_warmer_if_needed( None, @@ -671,9 +748,8 @@ impl Tvu { } self.drop_bank_service.join()?; self.duplicate_shred_listener.join()?; - if let Some((streamer, sigverifier)) = self.bls_sigverify_threads { - streamer.join()?; - sigverifier.join()?; + for thread in self.bls_sigverify_threads { + thread.join()?; } self.votor.join()?; self.commitment_service.join()?; @@ -891,10 +967,12 @@ pub mod tests { highest_parent_ready, votor_event_sender, votor_event_receiver, - cancel, - staked_nodes, - key_notifiers, - bls_connection_cache: Arc::new(bls_connection_cache), + alpenglow_transport: AlpenglowTransport::QuicStream { + bls_connection_cache: Arc::new(bls_connection_cache), + cancel, + staked_nodes, + key_notifiers, + }, voting_service_test_override: None, highest_finalized: Arc::new(RwLock::new(None)), bank_forks_controller, diff --git a/core/src/validator.rs b/core/src/validator.rs index 0a79970b8f4..a1e71e9bd0c 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -27,7 +27,7 @@ use { SystemMonitorService, SystemMonitorStatsReportConfig, verify_net_stats_access, }, tpu::{Tpu, 
TpuSockets}, - tvu::{AlpenglowInitializationState, Tvu, TvuConfig, TvuSockets}, + tvu::{AlpenglowInitializationState, AlpenglowTransport, Tvu, TvuConfig, TvuSockets}, }, agave_snapshots::{ SnapshotInterval, snapshot_archive_info::SnapshotArchiveInfoGetter as _, @@ -316,6 +316,13 @@ pub struct ValidatorLogConfig { pub logrotate_flag: Arc, } +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +pub enum VotorTransportProtocol { + QuicStream, + #[default] + QuicDatagram, +} + pub struct ValidatorConfig { /// Log messages go to `stderr` if `None` pub log_config: Option, @@ -393,6 +400,7 @@ pub struct ValidatorConfig { pub tvu_shred_sigverify_threads: NonZeroUsize, pub tvu_bls_sigverify_threads: NonZeroUsize, pub delay_leader_block_for_pending_fork: bool, + pub votor_transport_protocol: VotorTransportProtocol, pub voting_service_test_override: Option, pub repair_handler_type: RepairHandlerType, // Thread niceness adjustment for snapshot packager service @@ -478,6 +486,7 @@ impl ValidatorConfig { tvu_shred_sigverify_threads: NonZeroUsize::new(2).expect("2 is non-zero"), tvu_bls_sigverify_threads: NonZeroUsize::new(2).expect("2 is non-zero"), delay_leader_block_for_pending_fork: false, + votor_transport_protocol: VotorTransportProtocol::default(), voting_service_test_override: None, repair_handler_type: RepairHandlerType::default(), snapshot_packager_niceness_adj: 0, @@ -1187,30 +1196,7 @@ impl Validator { )) }; - let bls_connection_cache = Arc::new(ConnectionCache::new_with_client_options( - "connection_cache_bls_quic", - // BLS consensus messaging is extremely low throughput (5 PPS). Even during standstill operations - // we wouldn't expect more than a 100 PPS. 1 connection is enough. - 1, /* connection_pool_size */ - Some(node.sockets.quic_alpenglow_client), - Some(( - &identity_keypair, - node.info - .alpenglow() - .ok_or_else(|| { - ValidatorError::Other(String::from( - "Invalid QUIC address for Alpenglow BLS", - )) - })? 
- .ip(), - )), - Some((&staked_nodes, &identity_keypair.pubkey())), - )); let key_notifiers = Arc::new(RwLock::new(KeyUpdaters::default())); - key_notifiers.write().unwrap().add( - KeyUpdaterType::BlsConnectionCache, - bls_connection_cache.clone(), - ); // test-validator crate may start the validator in a tokio runtime // context which forces us to use the same runtime because a nested @@ -1227,6 +1213,97 @@ impl Validator { .unwrap() }); + let alpenglow_transport = match config.votor_transport_protocol { + VotorTransportProtocol::QuicStream => { + let bls_connection_cache = Arc::new(ConnectionCache::new_with_client_options( + "connection_cache_bls_quic", + // BLS consensus messaging is extremely low throughput (5 PPS). Even during + // standstill operations we would not expect more than 100 PPS. One connection + // is enough. + 1, /* connection_pool_size */ + Some(node.sockets.quic_alpenglow_client), + Some(( + &identity_keypair, + node.info + .alpenglow() + .ok_or_else(|| { + ValidatorError::Other(String::from( + "Invalid QUIC address for Alpenglow BLS", + )) + })? 
+ .ip(), + )), + Some((&staked_nodes, &identity_keypair.pubkey())), + )); + key_notifiers.write().unwrap().add( + KeyUpdaterType::BlsConnectionCache, + bls_connection_cache.clone(), + ); + AlpenglowTransport::QuicStream { + bls_connection_cache, + cancel: cancel.clone(), + staked_nodes: staked_nodes.clone(), + key_notifiers: key_notifiers.clone(), + } + } + VotorTransportProtocol::QuicDatagram => { + let alpenglow_socket = if matches!( + genesis_config.cluster_type, + ClusterType::Testnet | ClusterType::Development, + ) { + node.sockets.alpenglow.take() + } else { + None + }; + + let (egress, ingress, banlist, allowlist, _alpenglow_endpoint_task) = + if let Some(socket) = alpenglow_socket { + let votor_rt_handle = tpu_client_next_runtime + .as_ref() + .map(TokioRuntime::handle) + .unwrap_or_else(|| current_runtime_handle.as_ref().unwrap()); + let (ingress_tx, ingress_rx) = + crossbeam_channel::bounded(crate::tvu::MAX_ALPENGLOW_PACKET_NUM); + let banlist = Arc::new(solana_quic_datagram::Banlist::::default()); + let allowlist = + agave_votor::datagram_endpoint::build_allowlist(&bank_forks); + let solana_quic_datagram::QuicDatagramEndpoint { + endpoint: _quic_endpoint, + egress, + key_updater, + task, + } = agave_votor::datagram_endpoint::spawn( + votor_rt_handle, + &identity_keypair, + socket, + ingress_tx, + allowlist.clone(), + banlist.clone(), + ) + .map_err(|e| ValidatorError::Other(format!("alpenglow endpoint: {e:?}")))?; + key_notifiers + .write() + .unwrap() + .add(KeyUpdaterType::VotorDatagram, key_updater); + (egress, ingress_rx, banlist, Some(allowlist), Some(task)) + } else { + let (egress, _) = tokio::sync::mpsc::channel(1); + let (_, ingress) = crossbeam_channel::bounded::< + solana_quic_datagram::endpoint::Datagram, + >(1); + let banlist = Arc::new(solana_quic_datagram::Banlist::::default()); + (egress, ingress, banlist, None, None) + }; + + AlpenglowTransport::QuicDatagram { + egress, + ingress, + banlist, + allowlist, + } + } + }; + let 
rpc_override_health_check = Arc::new(AtomicBool::new(config.rpc_config.disable_health_check)); let ( @@ -1647,10 +1724,7 @@ impl Validator { bank_forks_controller_receiver, votor_event_sender: votor_event_sender.clone(), votor_event_receiver, - cancel: cancel.clone(), - staked_nodes: staked_nodes.clone(), - key_notifiers: key_notifiers.clone(), - bls_connection_cache, + alpenglow_transport, voting_service_test_override: config.voting_service_test_override.clone(), highest_finalized, }, diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index ebd76009044..7750d317319 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -82,6 +82,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { tvu_shred_sigverify_threads: config.tvu_shred_sigverify_threads, tvu_bls_sigverify_threads: config.tvu_bls_sigverify_threads, delay_leader_block_for_pending_fork: config.delay_leader_block_for_pending_fork, + votor_transport_protocol: config.votor_transport_protocol, voting_service_test_override: config.voting_service_test_override.clone(), repair_handler_type: config.repair_handler_type.clone(), snapshot_packager_niceness_adj: config.snapshot_packager_niceness_adj, diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 2b82b8a0391..2ce00b40cba 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -1,13 +1,15 @@ #![allow(clippy::arithmetic_side_effects)] use { agave_snapshots::{ - SnapshotArchiveKind, SnapshotInterval, paths as snapshot_paths, - snapshot_archive_info::SnapshotArchiveInfoGetter, snapshot_config::SnapshotConfig, + paths as snapshot_paths, snapshot_archive_info::SnapshotArchiveInfoGetter, + snapshot_config::SnapshotConfig, SnapshotArchiveKind, SnapshotInterval, + }, + agave_votor::voting_service::{ + AdditionalListener, AlpenglowPortOverride, VotingServiceOverride, }, - 
agave_votor::voting_service::{AlpenglowPortOverride, VotingServiceOverride}, agave_votor_messages::migration::MIGRATION_SLOT_OFFSET, assert_matches::assert_matches, - crossbeam_channel::{Receiver, unbounded}, + crossbeam_channel::{unbounded, Receiver}, gag::BufferRedirect, itertools::Itertools, log::*, @@ -16,12 +18,12 @@ use { solana_account::AccountSharedData, solana_accounts_db::utils::create_accounts_run_and_snapshot_dirs, solana_client_traits::AsyncClient, - solana_clock::{DEFAULT_SLOTS_PER_EPOCH, DEFAULT_TICKS_PER_SLOT, MAX_PROCESSING_AGE, Slot}, + solana_clock::{Slot, DEFAULT_SLOTS_PER_EPOCH, DEFAULT_TICKS_PER_SLOT, MAX_PROCESSING_AGE}, solana_cluster_type::ClusterType, solana_commitment_config::CommitmentConfig, solana_core::{ consensus::{ - SWITCH_FORK_THRESHOLD, Tower, VOTE_THRESHOLD_DEPTH, tower_storage::FileTowerStorage, + tower_storage::FileTowerStorage, Tower, SWITCH_FORK_THRESHOLD, VOTE_THRESHOLD_DEPTH, }, optimistic_confirmation_verifier::OptimisticConfirmationVerifier, repair::{ @@ -37,18 +39,18 @@ use { solana_gossip::{crds_data::MAX_VOTES, gossip_service::discover_validators}, solana_hard_forks::HardForks, solana_hash::Hash, - solana_keypair::{Keypair, keypair_from_seed}, + solana_keypair::{keypair_from_seed, Keypair}, solana_leader_schedule::{FixedSchedule, LeaderSchedule, SlotLeader}, solana_ledger::{ ancestor_iterator::AncestorIterator, bank_forks_utils, - blockstore::{Blockstore, PurgeType, entries_to_test_shreds}, + blockstore::{entries_to_test_shreds, Blockstore, PurgeType}, blockstore_options::{AccessType, BlockstoreOptions}, blockstore_processor::{self, ProcessOptions}, leader_schedule_cache::LeaderScheduleCache, shred::{ - DATA_SHREDS_PER_FEC_BLOCK, ProcessShredsStats, ReedSolomonCache, Shred, Shredder, filter::{TurbineMode, TurbineModeKind}, + ProcessShredsStats, ReedSolomonCache, Shred, Shredder, DATA_SHREDS_PER_FEC_BLOCK, }, use_snapshot_archives_at_startup::UseSnapshotArchivesAtStartup, }, @@ -56,19 +58,19 @@ use { cluster::{Cluster, 
ClusterValidatorInfo, QuicTpuClient}, cluster_tests, integration_tests::{ - AG_DEBUG_LOG_FILTER, DEFAULT_NODE_STAKE, RUST_LOG_FILTER, SnapshotValidatorConfig, - ValidatorKeys, ValidatorTestConfig, copy_blocks, create_custom_leader_schedule, + copy_blocks, create_custom_leader_schedule, create_custom_leader_schedule_with_random_keys, farf_dir, generate_account_paths, last_root_in_tower, last_vote_in_tower, ms_for_n_slots, open_blockstore, purge_slots_with_count, remove_tower, remove_tower_if_exists, restore_tower, run_cluster_partition, run_kill_partition_switch_threshold, save_tower, setup_snapshot_validator_config, test_faulty_node, wait_for_duplicate_proof, - wait_for_last_vote_in_tower_to_land_in_ledger, + wait_for_last_vote_in_tower_to_land_in_ledger, SnapshotValidatorConfig, ValidatorKeys, + ValidatorTestConfig, AG_DEBUG_LOG_FILTER, DEFAULT_NODE_STAKE, RUST_LOG_FILTER, }, - local_cluster::{ClusterConfig, DEFAULT_MINT_LAMPORTS, LocalCluster}, + local_cluster::{ClusterConfig, LocalCluster, DEFAULT_MINT_LAMPORTS}, validator_configs::*, }, - solana_net_utils::{SocketAddrSpace, sockets::bind_to_localhost_unique}, + solana_net_utils::{sockets::bind_to_localhost_unique, SocketAddrSpace}, solana_poh_config::PohConfig, solana_pubkey::Pubkey, solana_pubsub_client::pubsub_client::PubsubClient, @@ -87,8 +89,8 @@ use { solana_system_transaction as system_transaction, solana_transaction_error::TransportError, solana_turbine::broadcast_stage::{ - BroadcastStageType, broadcast_duplicates_run::{BroadcastDuplicatesConfig, ClusterPartition}, + BroadcastStageType, }, solana_vote::{vote_parser, vote_transaction}, solana_vote_interface::state::TowerSync, @@ -101,10 +103,10 @@ use { num::{NonZeroU64, NonZeroUsize}, path::Path, sync::{ - Arc, Mutex, atomic::{AtomicBool, AtomicUsize, Ordering}, + Arc, Mutex, }, - thread::{Builder, JoinHandle, sleep}, + thread::{sleep, Builder, JoinHandle}, time::{Duration, Instant}, }, strum::{EnumCount, IntoEnumIterator}, @@ -5990,7 +5992,10 @@ fn 
test_alpenglow_imbalanced_stakes_catchup() { let mut validator_config = ValidatorConfig::default_for_test(); validator_config.fixed_leader_schedule = Some(leader_schedule); validator_config.voting_service_test_override = Some(VotingServiceOverride { - additional_listeners: vec![vote_listener_addr.local_addr().unwrap()], + additional_listeners: vec![AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: vote_listener_addr.local_addr().unwrap(), + }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); @@ -6177,7 +6182,10 @@ fn test_alpenglow_migration( let vote_listener_addr = vote_listener_socket.try_clone().unwrap(); let mut validator_config = ValidatorConfig::default_for_test(); validator_config.voting_service_test_override = Some(VotingServiceOverride { - additional_listeners: vec![vote_listener_addr.local_addr().unwrap()], + additional_listeners: vec![AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: vote_listener_addr.local_addr().unwrap(), + }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); diff --git a/quic-datagram/Cargo.toml b/quic-datagram/Cargo.toml new file mode 100644 index 00000000000..f515cbfde3c --- /dev/null +++ b/quic-datagram/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "solana-quic-datagram" +description = "QUIC datagram endpoint with staked-only allowlist for Solana consensus traffic" +documentation = "https://docs.rs/solana-quic-datagram" +version = { workspace = true } +authors = { workspace = true } +repository = { workspace = true } +homepage = { workspace = true } +license = { workspace = true } +edition = { workspace = true } +publish = false + +[lib] + +[features] +# Exposes allowlist::AllowAll and other test-only helpers. Production binaries +# must NOT enable this feature. 
+dev-context-only-utils = [] +agave-unstable-api = [] + +[dependencies] +arc-swap = { workspace = true } +arrayvec = { workspace = true } +bytes = { workspace = true } +crossbeam-channel = { workspace = true } +dashmap = { workspace = true } +futures = { workspace = true } +log = { workspace = true } +quinn = { workspace = true } +quinn-proto = { workspace = true } +rustls = { workspace = true } +solana-keypair = { workspace = true } +solana-metrics = { workspace = true } +solana-net-utils = { workspace = true } +solana-pubkey = { workspace = true } +solana-tls-utils = { workspace = true, features = ["agave-unstable-api"] } +thiserror = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros", "sync", "time"] } + +[dev-dependencies] +solana-keypair = { workspace = true } +solana-net-utils = { workspace = true } +solana-quic-datagram = { path = ".", features = ["dev-context-only-utils", "agave-unstable-api"] } +tokio = { workspace = true, features = ["full"] } + +[lints] +workspace = true diff --git a/quic-datagram/README.md b/quic-datagram/README.md new file mode 100644 index 00000000000..5b12e293d0d --- /dev/null +++ b/quic-datagram/README.md @@ -0,0 +1,206 @@ +# solana-quic-datagram + +A QUIC-datagram transport designed for Solana's votor consensus traffic. +Single [`quinn::Endpoint`] per node bound to one UDP socket, playing both +client and server roles. Peer identity is the ed25519 pubkey embedded in +a self-signed TLS cert; connections are gated by an `Allowlist` trait. +Bidirectional and unidirectional streams are disabled at the transport +level - only QUIC datagrams flow. + +## Why this architecture + +- **No HoL blocking.** Stream-based transports retransmit every message + on the same connection when a packet is lost. A vote that didn't arrive + within ~one slot is worthless. So for consensus messages + losing one shouldn't delay the transmission of the next. 
+- **No per-message stream open/close.** Votor sends small (≤1200 byte) + payloads at ~4/slot/peer. A stream per message is pure overhead. +- **Fire-and-forget semantics match production.** `VotingService`'s + `broadcast_consensus_message` already uses `try_send` with + drop-on-full; datagrams reflect that on the wire. +- **No congestion control.** Votor traffic is not flexible: it has + fixed bandwidth demands and cannot slow down in response to + congestion. +- **ACKs throttled.** `AckFrequencyConfig` is tuned to minimize the + number of ACK packets that we have to carry on the wire. + +## Architecture + +One tokio task - `endpoint::EndpointLoop::run` - multiplexes every +event source via a single `tokio::select!`: + +- server-accept (`endpoint.accept()`) +- client egress (`egress_rx.recv()`) +- banlist evictions (`banlist_evictions.recv()`) +- identity rotation (`identity_rx.changed()`) +- banlist-prune timer (hourly) +- metrics-report timer (every 2 s) + +Heavy per-event work (TLS handshake, dial, etc.) is spawned onto its own +short-lived task so a slow handshake never blocks dispatch. + + +## Connection table + +`Arc<DashMap<Pubkey, ConnectionTableEntry>>`, where: + +```text +enum ConnectionTableEntry { + Dialing(IdGeneration), // placeholder; one dial task in flight + Established(Connection), // live connection for fan-out +} +``` + +This is needed to avoid having more than one connection to a given peer. +The server-accept pipeline (TLS, identity validation, allowlist check) lives +entirely in the spawned per-incoming task and only touches the table at +the end via `insert_connection`. There's no useful in-flight server-side +state worth modeling in the table. + +## Lex-pubkey tiebreaker + +For every peer pair, the node with the **lower** pubkey is the +dialer; the higher pubkey listens. The rule is enforced on both sides: + +- Server closes any inbound from a peer with `pubkey >= local` with + `WRONG_DIRECTION`.
+- Client drops egress to peers with `pubkey <= local` when no cached + connection exists (counted as `egress_dropped_higher_pubkey`); it + waits for the peer to dial in, then `send_over_inbound` finds the + inbound and uses it for the higher→lower direction. + +This partitions connections cleanly: the lex-higher side only ever holds +inbound (server-accepted) entries; the lex-lower side only +ever holds outbound (we-dialed) ones. That partition is load-bearing +for the `PEER_MOVED` logic below. + +### No buffering during `Dialing` + +When no connection exists to a peer, the first packet to be sent is +buffered while dialing is going on. This is critical for standstill to +resolve in a timely manner. A second egress that arrives while a dial +is in flight is **dropped on the floor** (counted in +`egress_dropped_dial_in_progress`). Not buffered, not parked. Rationale: + +- Votor sends ≤4 pkt/slot/peer (~50 ms apart). A dial-in-flight window + loses ~a few packets; the upper layer keeps producing fresh ones. +- Buffering would require a per-peer wakeup primitive (the per-peer + `Notify` we explicitly avoided) plus a queue, both with their own + capacity-bound questions. +- Old votes aren't worth delivering by the time a fresh dial completes. +- Standstill is covered by buffering of the first packet. + +### Why no `Backoff` state + +A peer that's permanently down will be hammered with one dial per egress to +that pubkey (votor-paced, ~5 attempts/s per peer worst case). No +exponential backoff. Trade-off: simpler state machine, marginally more +wasted dials against dead peers. Acceptable because the failure cost +(one quinn `connect` + handshake-timeout) is bounded and the volume is +votor-paced. + +## Handover + +A new successful handshake from a pubkey already in `Established` +closes the prior connection with `HANDOVER` and installs the new one - +the validator hot-spare handover path.
The displaced side observes +`HANDOVER` in its read loop and soft-bans the evicting peer so it does +not feed bogus votes while waiting for the gossip notification to kill +itself. + +## PEER_MOVED (address-aware eviction) + +On the lex-lower (dialer) side, the cached `Established`'s +`remote_address()` is exactly the addr we dialed. If a later egress +arrives with a different addr for the same pubkey (e.g., gossip +published a new addr for the peer), the slot is atomically swapped to +`Dialing`, the displaced connection is closed with `PEER_MOVED`, and a +fresh dial spawns at the new addr. + +The lex-higher side **does not** do this check. Its cached entries are +inbound; their `remote_address()` is the peer's NAT-mapped source addr, +which can legitimately differ from the gossip-published one the caller +hands us. The higher side trusts whatever inbound connection it +accepted, addr argument ignored on cache hit. + +## Security posture + +1. **QUIC RETRY** (mandatory). Every inbound is bounced through a RETRY. + Blocks address-spoofed handshake floods - an attacker without a working + return path completes nothing and does not burn our CPU. +2. **Per-subnet rate limit.** Post-RETRY (source addr is now validated), + bucket per /24 (v4). Catches the basic attack profile - baby cams and + residential routers flooding connection attempts. Loopback is + exempt. Same mechanism as with streamer. No IPv6 to keep this easy. +3. **Bidirectional TLS ID validation.** Peer's cert must yield a + recoverable Solana ed25519 pubkey; client checks the attested pubkey + matches the targeted one, server checks that client owns a staked + identity. +4. **Stake check** (`Allowlist` trait, default impl + `StakedNodesAllowlist`). Peer must be in the current staked-set + snapshot. +5. **External banlist** (`DatagramBanlist`). Application-level bans + (e.g., BLS sigverify failures) are funneled through here; the + evictor task reaps every cached connection for the banned pubkey. +6. 
**Per-connection RX token bucket.** Bursts above + `BURST_DATAGRAMS_PER_SECOND_PER_PEER` are silently dropped (the + bucket itself is the throttle). The connection stays alive; the + peer is **not** banned (consensus traffic legitimately bursts). +7. **MAX_PEERS = 2000.** Defensive cap on the connection table. + + +## Identity rotation + +`KeyUpdater::update_key(new_keypair)` publishes a new +`IdentitySnapshot` through a `tokio::sync::watch`. The control loop +observes the change and: + +1. Rebuilds server + client TLS configs. +2. Swaps them into the quinn endpoint. +3. Evicts every cached connection with `IDENTITY_ROTATED` close. +4. Adopts the new pubkey for the lex-tiebreak rule going forward. + +`Dialing` placeholders are dropped too. In-flight dial tasks were +dispatched with the pre-rotation generation tag; when they finish their +handshake and try to install the connection, the generation check +inside `insert_connection` trips, returns `InsertOutcome::Stale`, and +the new connection is closed with `IDENTITY_ROTATED`. No TLS-level +failure is needed - the gen counter is the single source of truth for +"this dial belongs to the previous identity." + +## Payload size + +A datagram payload is bounded by quinn's negotiated `max_datagram_size` +(transport-config-driven, currently a fixed 1280). Payloads above that +bound fail at `send_datagram` with `SendDatagramError::TooLarge` and the +crate does no fragmentation - callers are expected to keep messages +under the cap. + +## Egress / ingress channels + +- **Egress** (caller → endpoint, `EGRESS_CHANNEL_CAP = 16384`). + Bounded mpsc. Callers must `try_send` and accept drop-on-full - + matches `VotingService::broadcast_consensus_message`. Sized to absorb + a full slot's burst with headroom. +- **Ingress** (endpoint → caller, caller-supplied + `crossbeam_channel::Sender`). Drop-on-full counted in + `datagram_ingress_dropped_channel_full`. 
+ +### Epoch boundary handling + +At the end of an epoch, we may experience churn in the allowlist. +Some peers will no longer be allowed, while some new ones will be +admitted. We do not proactively break connections to peers which are +no longer in the allowlist; instead they are evicted when we are +inserting a new connection while out of room in the connection table. +This keeps the logic simpler than the explicit notify during epoch change. + +## What this crate is not + +- Not a general-purpose datagram transport. The lex-tiebreaker, the + drop-on-`Dialing` behavior, the lack of backoff, and the + identity-rotation evict-all all assume a votor-style workload (small + peer set, predictable cadence, no retransmits). With some effort this + could be adapted for e.g. turbine. +- Not a buffered transport. There is no per-peer queue. Bytes that hit + a full channel or a `Dialing` placeholder are gone. diff --git a/quic-datagram/src/allowlist.rs b/quic-datagram/src/allowlist.rs new file mode 100644 index 00000000000..566765b9c0e --- /dev/null +++ b/quic-datagram/src/allowlist.rs @@ -0,0 +1,70 @@ +//! Allowlist policy: decides whether the local endpoint will accept +//! a TLS-attested peer pubkey. + +use { + arc_swap::ArcSwap, + solana_pubkey::Pubkey, + std::{collections::HashMap, sync::Arc}, +}; + +/// Called once per inbound handshake (after the peer cert is parsed) and once +/// per outbound dial-success. A `false` answer closes the connection with the +/// `NOT_ADMITTED` error code. +/// +/// Implementations must be cheap - this is on the hot path of every new +/// connection. +pub trait Allowlist: Send + Sync + 'static { + fn allow(&self, peer: &Pubkey) -> bool; +} + +/// Snapshot of the currently-allowed peer set with their epoch stake. +/// Producers (typically the staked-validators cache) call +/// [`StakedNodesAllowlist::swap`] to publish a new generation; consumers +/// see it on the next `allow` call. 
+/// +/// Stake values are available for future use (e.g. weighted eviction). +/// Peers inserted for test-only purposes (e.g. `extra_admit`) carry stake 0. +#[derive(Default)] +pub struct StakedNodesAllowlist { + inner: ArcSwap>, +} + +impl StakedNodesAllowlist { + pub fn new(initial: HashMap) -> Self { + Self { + inner: ArcSwap::new(Arc::new(initial)), + } + } + + /// Publish a new allowlist. Atomic; no readers block. + pub fn swap(&self, next: HashMap) { + self.inner.store(Arc::new(next)); + } + + /// Number of allowed peers in the current generation. + pub fn len(&self) -> usize { + self.inner.load().len() + } + + pub fn is_empty(&self) -> bool { + self.inner.load().is_empty() + } +} + +impl Allowlist for StakedNodesAllowlist { + fn allow(&self, peer: &Pubkey) -> bool { + self.inner.load().contains_key(peer) + } +} + +/// Allow every peer. **Test/bench use only** - gated behind +/// `dev-context-only-utils` so production binaries cannot accidentally use it. +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub struct AllowAll; + +#[cfg(any(test, feature = "dev-context-only-utils"))] +impl Allowlist for AllowAll { + fn allow(&self, _: &Pubkey) -> bool { + true + } +} diff --git a/quic-datagram/src/client.rs b/quic-datagram/src/client.rs new file mode 100644 index 00000000000..7a830f374c6 --- /dev/null +++ b/quic-datagram/src/client.rs @@ -0,0 +1,131 @@ +//! Client-side connection lifecycle. 
+ +use { + crate::{ + Banlist, + allowlist::Allowlist, + close_codes, + connection_table::{ConnectionTable, IdGeneration, InsertOutcome}, + endpoint::Datagram, + error::Error, + read_loop::read_datagram_loop, + stats::{QuicDatagramStats, add, record_error}, + }, + bytes::Bytes, + crossbeam_channel::Sender, + log::error, + quinn::Endpoint, + solana_pubkey::Pubkey, + solana_tls_utils::{get_remote_pubkey, socket_addr_to_quic_server_name}, + std::{net::SocketAddr, sync::Arc}, +}; + +/// A client-side connection's full lifecycle: dial, validate identity, +/// install in the table, run the read loop, reap on exit. Built by the +/// control loop and dispatched via [`Self::spawn`]. +pub(crate) struct ClientConnection { + pub(crate) endpoint: Endpoint, + pub(crate) peer: Pubkey, + pub(crate) addr: SocketAddr, + pub(crate) id_generation: IdGeneration, + /// The egress that triggered this dial. Carried into the dial task + /// and sent the moment `insert_connection` succeeds, before the + /// read loop starts. Cold-start traffic (standstill) needs this to + /// reach the peer at all. + pub(crate) trigger: Bytes, + pub(crate) ingress: Sender, + pub(crate) allowlist: Arc, + pub(crate) banlist: Arc>, + pub(crate) table: Arc, + pub(crate) stats: Arc, +} + +impl ClientConnection { + /// Spawn a tokio task that drives this connection to completion. + /// Returns immediately; the task logs and records any error before + /// exiting. 
+ pub(crate) fn spawn(self) { + let peer = self.peer; + let addr = self.addr; + let stats = self.stats.clone(); + tokio::spawn(async move { + if let Err(e) = self.run().await { + error!("Connection attempt to ({peer}, {addr}) failed: {e:?}"); + record_error(&e, &stats); + } + }); + } + + async fn run(self) -> Result<(), Error> { + // Clone the needed data for cleanup + let table_ref_for_cleanup = self.table.clone(); + let peer = self.peer; + let id_generation = self.id_generation; + + let result: Result<(), Error> = async move { + let server_name = socket_addr_to_quic_server_name(self.addr); + let connection = self.endpoint.connect(self.addr, &server_name)?.await?; + + // Server identity must match the pubkey the caller targeted. + let attested = + get_remote_pubkey(&connection).ok_or(Error::InvalidIdentity(self.addr))?; + if attested != self.peer { + close_codes::INVALID_IDENTITY.close(&connection); + return Err(Error::InvalidIdentity(self.addr)); + } + + // On cap-induced Rejected, the table will try to evict a + // now-unadmitted peer before giving up - epoch-boundary churn + // can momentarily push the staked-set union past `MAX_PEERS`. + match self.table.insert_connection_or_evict( + self.peer, + connection.clone(), + id_generation, + |pk| self.allowlist.allow(pk), + &self.stats, + ) { + InsertOutcome::Rejected => { + close_codes::TABLE_FULL.close(&connection); + return Err(Error::TableFull); + } + InsertOutcome::Stale => { + close_codes::IDENTITY_ROTATED.close(&connection); + return Err(Error::IdentityRotated(self.peer)); + } + InsertOutcome::Inserted | InsertOutcome::Replaced => { + self.stats.record_connection_count(self.table.len()); + } + } + + // Send the trigger packet that started this dial. Quinn-level + // failures are recorded but not propagated - the connection + // itself is healthy and the read loop should still run. 
+ match connection.send_datagram(self.trigger) { + Ok(()) => add(&self.stats.datagrams_sent), + Err(e) => record_error(&Error::from(e), &self.stats), + } + + // Drive the read loop inline. + let stable_id = connection.stable_id(); + read_datagram_loop( + connection, + self.peer, + self.addr, + self.ingress, + self.banlist, + self.stats, + ) + .await; + // Reap our slot only if it still points at *this* connection + // (HANDOVER could have replaced it). + self.table.maybe_reap_connection(&self.peer, stable_id); + Ok(()) + } + .await; + + if result.is_err() { + table_ref_for_cleanup.clear_dialing_placeholder(&peer, id_generation); + } + result + } +} diff --git a/quic-datagram/src/close_codes.rs b/quic-datagram/src/close_codes.rs new file mode 100644 index 00000000000..7257e575ac9 --- /dev/null +++ b/quic-datagram/src/close_codes.rs @@ -0,0 +1,97 @@ +//! Application close codes and reason strings, paired. +//! +//! Each `Spec` bundles a wire close code (`VarInt`) with its reason string +//! so the two cannot drift apart at call sites. Use as +//! `close::HANDOVER.close(&conn)`. +//! +//! Numeric codes are part of the wire format — adding a new one is fine, +//! changing the value of an existing one is a breaking protocol change. + +use quinn::{Connection, VarInt}; + +pub(crate) struct Spec { + pub code: VarInt, + pub reason: &'static [u8], +} + +impl Spec { + /// Close `conn` with this spec's code/reason pair. + pub fn close(&self, conn: &Connection) { + conn.close(self.code, self.reason); + } +} + +/// Local endpoint is shutting down. Issued by [`crate::QuicDatagramEndpoint::close`] +/// to every live connection. +pub(crate) const SHUTDOWN: Spec = Spec { + code: VarInt::from_u32(0), + reason: b"SHUTDOWN", +}; + +/// Peer's TLS cert chain did not yield a recoverable Solana ed25519 pubkey +/// (wrong shape, multi-cert chain, parse failure). 
Issued from the server +/// accept path and from the client after dial when the attested pubkey does +/// not match the one the caller targeted. +pub(crate) const INVALID_IDENTITY: Spec = Spec { + code: VarInt::from_u32(2), + reason: b"INVALID_IDENTITY", +}; + +/// Peer's pubkey is not in the local [`crate::Allowlist`] allow-list +/// (typically: not in the current staked-nodes snapshot). +pub(crate) const NOT_ADMITTED: Spec = Spec { + code: VarInt::from_u32(3), + reason: b"NOT_ADMITTED", +}; + +/// Peer is in the local [`crate::Banlist`] — either via an external +/// trigger (e.g. BLS sigverify failure) or via the only internal trigger, +/// HANDOVER reception in the per-connection read loop. +pub(crate) const BANNED: Spec = Spec { + code: VarInt::from_u32(4), + reason: b"BANNED", +}; + +/// Connection table is at [`crate::MAX_PEERS`] and this is a fresh pubkey +/// — no slot available. +pub(crate) const TABLE_FULL: Spec = Spec { + code: VarInt::from_u32(5), + reason: b"TABLE_FULL", +}; + +/// Lex-pubkey tiebreaker violation: peer dialed us but its pubkey is greater +/// than or equal to ours. Per the rule the lower pubkey is the canonical +/// dialer; the higher pubkey listens. +pub(crate) const WRONG_DIRECTION: Spec = Spec { + code: VarInt::from_u32(9), + reason: b"WRONG_DIRECTION", +}; + +/// A new handshake from a pubkey already in our table replaced the prior +/// connection. The receiving side observes this close in its read loop, +/// soft-bans the evicting peer, and forwards a handover-event uplevel so +/// consensus can decide whether to shut the node down. +pub(crate) const HANDOVER: Spec = Spec { + code: VarInt::from_u32(10), + reason: b"HANDOVER", +}; + +/// Local endpoint's identity (TLS cert / pubkey) was rotated. All existing +/// connections are closed so peers re-handshake against the new cert. The +/// receiving side reaps the connection without banning — we are still the +/// same logical node, just with a fresh certificate. 
+pub(crate) const IDENTITY_ROTATED: Spec = Spec { + code: VarInt::from_u32(11), + reason: b"IDENTITY_ROTATED", +}; + +/// The caller (e.g. votor via a gossip refresh) supplied a different socket +/// address for this peer than the one our cached outbound connection was +/// opened to. We close the stale connection and re-connect at the new addr. +/// Only emitted by the lex-lower (dialer) side; lex-higher cached entries +/// are server-accepted and trusted regardless of caller's claimed addr +/// (NAT-mapped source addrs can legitimately differ from gossip ones). +pub(crate) const PEER_MOVED: Spec = Spec { + code: VarInt::from_u32(12), + reason: b"PEER_MOVED", +}; diff --git a/quic-datagram/src/connection_table.rs b/quic-datagram/src/connection_table.rs new file mode 100644 index 00000000000..f3e44ec39c0 --- /dev/null +++ b/quic-datagram/src/connection_table.rs @@ -0,0 +1,500 @@ +use { + crate::{ + MAX_PEERS, close_codes, + error::Error, + stats::{QuicDatagramStats, add, record_error}, + }, + bytes::Bytes, + dashmap::{DashMap, mapref::entry::Entry}, + log::info, + quinn::Connection, + solana_pubkey::Pubkey, + std::{ + net::SocketAddr, + sync::atomic::{AtomicU64, Ordering}, + }, +}; + +/// Per-peer connection table. Holds up to [`crate::MAX_PEERS`] entries. +/// +/// Each entry is a [`ConnectionTableEntry`] - either `Dialing` (placeholder +/// while a single in-flight dial task brings the connection up) or +/// `Established` (a live [`quinn::Connection`] for one peer). A new +/// handshake from a pubkey already in `Established` closes the old +/// connection with `HANDOVER` and installs the new one - we assume that +/// the remote validator is a hot-spare that was just brought up. +/// The displaced side observes `HANDOVER` in its +/// read loop, soft-bans the peer, and reports the event to the driving +/// service (e.g. votor). 
+/// +/// Backed by [`DashMap`] - multiple tasks (server-accept, client-egress, +/// per-connection read loops, banlist evictor) hammer the table concurrently. +/// +/// INVARIANT: no code path can hold a `Ref`/`RefMut` from the DashMap across +/// an `.await` - DashMap entry locks held across yield points will deadlock +/// the tokio runtime under contention. All DashMap access is encapsulated +/// inside this module. Every public method is a plain `fn` (never `async fn`) +/// and never returns a [`dashmap::mapref`] guard, so the "no `Ref`/`RefMut` +/// held across `.await`" invariant is enforced. +pub(crate) struct ConnectionTable { + inner: DashMap, + /// See [`IdGeneration`]. Bumped by [`Self::clear`]. + generation: AtomicU64, +} + +/// State of a peer's entry in the connection table. +pub(crate) enum ConnectionTableEntry { + /// A placeholder inserted by the dialer side before spawning a dial + /// task. Tagged with the [`IdGeneration`] live at insert time so a + /// stale (post-rotation) cleanup from an old dial task can't clobber + /// a fresh placeholder installed by a new dial task at a later + /// generation. + Dialing(IdGeneration), + /// Holds the live connection for fan-out. + Established(Connection), +} + +/// Outcome of attempting to insert a fresh incoming connection. +pub(crate) enum InsertOutcome { + /// Table had no prior entry (or held our own `Dialing` placeholder); + /// the new connection takes the slot. + Inserted, + /// Table already held an `Established` connection for this peer. The + /// old one was closed with `HANDOVER`; the new one took its place. + Replaced, + /// Table was at [`MAX_PEERS`] and this is a fresh pubkey. Caller must + /// close the connection with `TABLE_FULL`. + Rejected, + /// Identity rotated between the start of the dial / accept and this + /// insert; the connection is authenticated under our previous cert + /// and must not be installed. Caller closes with `IDENTITY_ROTATED`. 
+ Stale, +} + +/// Outcome of [`ConnectionTable::send_over_outbound_connection`]. +pub(crate) enum EgressDispatch { + /// A cached `Established` for the requested addr existed; a send was + /// attempted on it inside the method (the caller does nothing more). + Sent, + /// Another dial task is already in flight for this peer; the datagram + /// was dropped (counted in `egress_dropped_dial_in_progress`). + Dialing, + /// No usable connection. The slot now holds `Dialing` (placeholder + /// installed by this call, possibly after closing a stale + /// `Established` with `PEER_MOVED`). Caller is expected to spawn a + /// dial task and pass the returned [`IdGeneration`] to + /// [`ConnectionTable::insert_connection`] and + /// [`ConnectionTable::clear_dialing_placeholder`]. + SpawnDialTask { generation: IdGeneration }, +} + +/// Monotonic counter incremented on every identity rotation. In-flight +/// dial / accept tasks capture the generation at start and pass it into +/// [`ConnectionTable::insert_connection`]; a mismatch at insert time +/// means our identity changed mid-handshake and the resulting connection +/// is authenticated under our previous cert. We should close such +/// connections with `IDENTITY_ROTATED`. +pub(crate) type IdGeneration = u64; + +impl ConnectionTable { + /// Create a new empty connection table. + pub(crate) fn new() -> Self { + Self { + inner: DashMap::new(), + generation: AtomicU64::new(0), + } + } + + /// Current number of entries (both `Dialing` and `Established`). Sampled + /// by the control loop for the live-connection gauge and by the + /// establishment paths to update the peak high-water mark. + /// + /// MUST NOT be called while holding a DashMap entry guard - `len()` + /// iterates every shard and would deadlock on the pinned shard's lock. + pub(crate) fn len(&self) -> u64 { + self.inner.len() as u64 + } + + /// Snapshot the current [`IdGeneration`]. 
Server-accept and dial + /// paths read this at the start of their handshake and pass it back + /// into [`Self::insert_connection`] and (on the dial side) + /// [`Self::clear_dialing_placeholder`] so a rotation that happens + /// mid-operation can be detected. + pub(crate) fn current_generation(&self) -> IdGeneration { + // this happens rarely, SeqCst is ok + self.generation.load(Ordering::SeqCst) + } + + /// Register an `Established` connection for `peer`. Server-accept and + /// dial-completion paths both call this with the [`IdGeneration`] + /// they snapshotted at handshake start. Returns: + /// - `Inserted` on a fresh slot (or one previously holding a `Dialing` + /// placeholder; the placeholder's generation tag is not compared). + /// - `Replaced` if a prior `Established` was displaced (HANDOVER); the + /// old connection is closed inside this method. + /// - `Rejected` if the table is at [`MAX_PEERS`] and this is a fresh + /// pubkey; caller must close `conn` with `TABLE_FULL`. This should + /// never happen during normal operation. + /// - `Stale` if our identity rotated between the handshake start and + /// this call; the connection is authenticated under our old cert + /// and the caller must close it with `IDENTITY_ROTATED`. + pub(crate) fn insert_connection( + &self, + peer: Pubkey, + conn: Connection, + gen_at_start: IdGeneration, + stats: &QuicDatagramStats, + ) -> InsertOutcome { + if self.current_generation() != gen_at_start { + return InsertOutcome::Stale; + } + // CRITICAL: check `len()` BEFORE acquiring an `Entry`: an entry pins a + // DashMap shard lock, and `len()` visits every shard (self-deadlock). + // NOTE(review): the generation check above runs outside the entry lock; + // a rotation landing in between could install a stale conn - confirm. + // + // The MAX_PEERS check is racy under concurrent inserts to different 
+ let at_cap = self.inner.len() >= MAX_PEERS; + match self.inner.entry(peer) { + Entry::Vacant(slot) => { + if at_cap { + return InsertOutcome::Rejected; + } + slot.insert(ConnectionTableEntry::Established(conn)); + InsertOutcome::Inserted + } + Entry::Occupied(mut slot) => { + let old = + std::mem::replace(slot.get_mut(), ConnectionTableEntry::Established(conn)); + match old { + // Our own placeholder; just transition state - not a handover. + ConnectionTableEntry::Dialing(_) => InsertOutcome::Inserted, + // A prior live connection for this pubkey. Normal during + // handover. + ConnectionTableEntry::Established(old_conn) => { + // close the displaced connection with appropriate code. + close_codes::HANDOVER.close(&old_conn); + stats + .connection_replaced_handover + .fetch_add(1, Ordering::Relaxed); + InsertOutcome::Replaced + } + } + } + } + } + + /// Like [`Self::insert_connection`], but on cap-induced `Rejected` first + /// tries to free a slot by evicting an entry whose peer is no longer + /// admitted by `allow`, then retries the insert. Returns the final + /// outcome; `Rejected` only if the table is at cap AND no unadmitted + /// entry could be found to displace. + /// + /// This is the canonical entry point for handshake-complete inserts on + /// both the dial and accept paths. Direct `insert_connection` callers + /// are tests / contexts where eviction policy isn't applicable. + pub(crate) fn insert_connection_or_evict bool>( + &self, + peer: Pubkey, + conn: Connection, + gen_at_start: IdGeneration, + allow: F, + stats: &QuicDatagramStats, + ) -> InsertOutcome { + // `Connection` is Arc-backed; the clone here is just an Arc bump + // so we can retry on the eviction-success path. 
+ let outcome = self.insert_connection(peer, conn.clone(), gen_at_start, stats); + if matches!(outcome, InsertOutcome::Rejected) && self.evict_one_unadmitted(allow, stats) { + self.insert_connection(peer, conn, gen_at_start, stats) + } else { + outcome + } + } + + /// Transmit over the cached inbound connection for `peer` (lex-higher side). + /// Returns `true` iff a cached `Established` was found and the send was + /// attempted; `false` if the slot is empty. The lex-higher side never + /// installs `Dialing` placeholders, so that arm is impossible by + /// construction and trips a `debug_assert` if hit. Successful sends bump + /// `datagrams_sent`; send failures are recorded via `record_error`. + pub(crate) fn send_over_inbound_connection( + &self, + peer: &Pubkey, + bytes: &Bytes, + stats: &QuicDatagramStats, + ) -> bool { + let Some(entry) = self.inner.get(peer) else { + return false; + }; + match entry.value() { + ConnectionTableEntry::Established(conn) => { + match conn.send_datagram(bytes.clone()) { + Ok(()) => add(&stats.datagrams_sent), + Err(e) => record_error(&Error::from(e), stats), + } + true + } + ConnectionTableEntry::Dialing(_) => { + debug_assert!( + false, + "lex-higher side should never hold Dialing for lower pubkey" + ); + false + } + } + } + + /// Outbound connection (dialer) packet dispatch. + /// Encapsulates the 4-case decision over the table entry for `peer`: + /// * vacant -> initiate new connection, + /// * dial in progress -> drop, + /// * established -> send, + /// * established to wrong address: evict + ask for redial. 
+ pub(crate) fn send_over_outbound_connection( + &self, + peer: Pubkey, + addr: SocketAddr, + bytes: &Bytes, + stats: &QuicDatagramStats, + ) -> EgressDispatch { + let generation = self.current_generation(); + match self.inner.entry(peer) { + Entry::Vacant(slot) => { + slot.insert(ConnectionTableEntry::Dialing(generation)); + // Trigger packet is carried into the dial task and sent + // the moment the connection lands - the cold-start / + // standstill case needs it. Subsequent followers during + // `Dialing` are dropped (see the `Dialing` arm below). + EgressDispatch::SpawnDialTask { generation } + } + Entry::Occupied(mut slot) => match slot.get() { + ConnectionTableEntry::Dialing(_) => { + stats + .egress_dropped_dial_in_progress + .fetch_add(1, Ordering::Relaxed); + EgressDispatch::Dialing + } + ConnectionTableEntry::Established(conn) if conn.remote_address() == addr => { + match conn.send_datagram(bytes.clone()) { + Ok(()) => add(&stats.datagrams_sent), + Err(e) => record_error(&Error::from(e), stats), + } + EgressDispatch::Sent + } + ConnectionTableEntry::Established(_) => { + // Peer moved - swap the slot to `Dialing(generation)`... + let old = std::mem::replace( + slot.get_mut(), + ConnectionTableEntry::Dialing(generation), + ); + // ... and close the displaced connection with PEER_MOVED. + if let ConnectionTableEntry::Established(old_conn) = old { + close_codes::PEER_MOVED.close(&old_conn); + stats + .connection_evicted_peer_moved + .fetch_add(1, Ordering::Relaxed); + info!("peer {peer} moved; re-dialing at {addr}"); + } + // Trigger packet is carried into the new dial task and + // sent the moment the new connection lands at `addr`. + EgressDispatch::SpawnDialTask { generation } + } + }, + } + } + + /// Remove the slot for `peer` if and only if it currently holds an + /// `Established` with the given `stable_id`. No-op if the slot holds + /// a different connection (HANDOVER replacement landed) or a + /// `Dialing` placeholder. 
+ pub(crate) fn maybe_reap_connection(&self, peer: &Pubkey, stable_id: usize) { + if let Entry::Occupied(entry) = self.inner.entry(*peer) + && matches!(entry.get(), ConnectionTableEntry::Established(c) if c.stable_id() == stable_id) + { + entry.remove(); + } + } + + /// Scan the table for an entry whose peer is no longer in the allowlist + /// and evict it. Used at insert-cap time to make room for a + /// freshly-allowed peer (typically across an epoch boundary where the + /// staked-set churn temporarily pushes the union past `MAX_PEERS`). + /// Closes any displaced `Established` with `NOT_ADMITTED` and bumps + /// `connection_evicted_allowlist`. Returns `true` on a hit. Stops at + /// the first eviction - one slot is enough for the caller's retry. + pub(crate) fn evict_one_unadmitted bool>( + &self, + allow: F, + stats: &QuicDatagramStats, + ) -> bool { + // Collect a candidate key first (release the iterator's shard + // guards before mutating). `iter` holds shard read-locks; any + // call that wants a write lock on the same shard would deadlock + // if we mutated inside the loop. + let victim = self + .inner + .iter() + .find(|entry| !allow(entry.key())) + .map(|entry| *entry.key()); + let Some(peer) = victim else { + return false; + }; + if let Some((_, entry)) = self.inner.remove(&peer) { + if let ConnectionTableEntry::Established(conn) = entry { + close_codes::NOT_ADMITTED.close(&conn); + } + stats + .connection_evicted_allowlist + .fetch_add(1, Ordering::Relaxed); + true + } else { + // Slot vanished between the scan and the remove. Caller will + // re-attempt insert; if the cap is still hit they'll retry + // eviction. + false + } + } + + /// Remove the slot for `peer` iff it still holds a `Dialing` + /// placeholder tagged with the same generation. Called by the + /// dialer task on failure so the next egress can spawn a fresh + /// dial. 
Race-safe: silently does nothing if some other path + /// replaced the slot with an `Established`, or if a later + /// generation's dialer already swapped its own `Dialing` in (we + /// must not clobber a fresher placeholder). + pub(crate) fn clear_dialing_placeholder(&self, peer: &Pubkey, generation: IdGeneration) { + if let Entry::Occupied(slot) = self.inner.entry(*peer) + && matches!(slot.get(), ConnectionTableEntry::Dialing(g) if *g == generation) + { + slot.remove(); + } + } + + /// Bump the identity generation and wipe the table. Every + /// `Established` is closed with `IDENTITY_ROTATED` on the way out; + /// `Dialing` placeholders are dropped. Any in-flight dial / accept + /// whose handshake completes after the gen bump will hit `Stale` + /// in [`Self::insert_connection`] and bail without installing a + /// wrong-identity connection. Returns the count of entries + /// dropped (caller bumps the `connection_evicted_identity_rotated` + /// stat). + pub(crate) fn clear_for_id_change(&self) -> u64 { + // Bump generation FIRST so any concurrent dial that observes + // the new generation via `current_generation()` after this + // point bails on insert. + // Overkill on SeqCst since this is super rare + self.generation.fetch_add(1, Ordering::SeqCst); + let mut evicted: u64 = 0; + self.inner.retain(|_, entry| { + if let ConnectionTableEntry::Established(conn) = entry { + close_codes::IDENTITY_ROTATED.close(conn); + } + evicted = evicted.saturating_add(1); + false + }); + evicted + } +} + +#[cfg(test)] +mod tests { + use {super::*, std::collections::HashSet}; + + /// Build a `ConnectionTable` seeded with `n_admitted` plus `n_unadmitted` + /// `Dialing` placeholders. Returns (table, admitted_set, unadmitted_set) + /// so the test can pass the admitted set as the "allow" predicate. + /// + /// Using `Dialing` placeholders (not `Established`) lets us exercise the + /// scan / remove / stat-bump path at scale without standing up real QUIC + /// connections. 
The `Established` close path in `evict_one_unadmitted` is + /// a single `close_codes::NOT_ADMITTED.close(&conn)` call; not exercised + /// here, but it's the same one-liner used in every other eviction site. + fn seed_table( + n_admitted: usize, + n_unadmitted: usize, + ) -> (ConnectionTable, HashSet, HashSet) { + let table = ConnectionTable::new(); + let mut admitted = HashSet::with_capacity(n_admitted); + let mut unadmitted = HashSet::with_capacity(n_unadmitted); + for _ in 0..n_admitted { + let pk = Pubkey::new_unique(); + table.inner.insert(pk, ConnectionTableEntry::Dialing(0)); + admitted.insert(pk); + } + for _ in 0..n_unadmitted { + let pk = Pubkey::new_unique(); + table.inner.insert(pk, ConnectionTableEntry::Dialing(0)); + unadmitted.insert(pk); + } + (table, admitted, unadmitted) + } + + #[test] + fn evict_one_unadmitted_returns_false_on_empty() { + let table = ConnectionTable::new(); + let stats = QuicDatagramStats::default(); + assert!(!table.evict_one_unadmitted(|_| true, &stats)); + assert_eq!( + stats.connection_evicted_allowlist.load(Ordering::Relaxed), + 0 + ); + } + + #[test] + fn evict_one_unadmitted_returns_false_when_all_admitted() { + let (table, admitted, _) = seed_table(50, 0); + let stats = QuicDatagramStats::default(); + assert!(!table.evict_one_unadmitted(|pk| admitted.contains(pk), &stats)); + assert_eq!(table.inner.len(), 50); + assert_eq!( + stats.connection_evicted_allowlist.load(Ordering::Relaxed), + 0 + ); + } + + #[test] + fn evict_one_unadmitted_drains_only_unadmitted_at_scale() { + // 1000 admitted + 1000 unadmitted. Loop evict until it returns + // false; verify every unadmitted entry is gone, every admitted + // entry remains, and the stat counter matches the eviction count. 
+ const N: usize = 1000; + let (table, admitted, unadmitted) = seed_table(N, N); + assert_eq!(table.inner.len(), 2 * N); + + let stats = QuicDatagramStats::default(); + let mut evicted = 0usize; + while table.evict_one_unadmitted(|pk| admitted.contains(pk), &stats) { + evicted += 1; + assert!( + evicted <= N, + "evict_one_unadmitted returned true past the unadmitted set; must not touch \ + admitted entries" + ); + } + + assert_eq!(evicted, N, "must evict exactly the {N} unadmitted entries"); + assert_eq!( + table.inner.len(), + N, + "exactly the {N} admitted entries must remain" + ); + for pk in &admitted { + assert!( + table.inner.contains_key(pk), + "admitted peer {pk} was evicted", + ); + } + for pk in &unadmitted { + assert!( + !table.inner.contains_key(pk), + "unadmitted peer {pk} survived eviction", + ); + } + assert_eq!( + stats.connection_evicted_allowlist.load(Ordering::Relaxed), + N as u64, + "stat counter must match eviction count" + ); + } +} diff --git a/quic-datagram/src/endpoint.rs b/quic-datagram/src/endpoint.rs new file mode 100644 index 00000000000..8a9014634eb --- /dev/null +++ b/quic-datagram/src/endpoint.rs @@ -0,0 +1,689 @@ +//! QUIC datagram endpoint: public handle ([`QuicDatagramEndpoint`]) plus +//! its unified control loop ([`EndpointLoop`]). One tokio task handles +//! server accept, client egress, banlist eviction, identity rotation, +//! and metrics. Each event source is an arm of a single `tokio::select!`; +//! heavy per-event work is spawned onto its own task so a slow handshake +//! or dial does not block dispatch of the next event. 
+ +use { + crate::{ + Banlist, EGRESS_CHANNEL_CAP, + allowlist::Allowlist, + client::ClientConnection, + close_codes, + connection_table::{ConnectionTable, EgressDispatch}, + error::Error, + key_updater::{IdentitySnapshot, KeyUpdater}, + server::ServerConnection, + stats::{self, QuicDatagramStats, add}, + subnet_rate_limit::SubnetRateLimiter, + transport::{new_client_config, new_server_config}, + }, + bytes::Bytes, + crossbeam_channel::Sender, + log::{debug, info, warn}, + quinn::{Endpoint, EndpointConfig, Incoming, TokioRuntime}, + solana_keypair::{Keypair, Signer}, + solana_pubkey::Pubkey, + solana_tls_utils::new_dummy_x509_certificate, + std::{ + net::{SocketAddr, UdpSocket}, + sync::{Arc, atomic::Ordering}, + time::Duration, + }, + tokio::{ + sync::{mpsc, watch}, + task::JoinHandle, + time::MissedTickBehavior, + }, +}; + +const BANLIST_PRUNE_INTERVAL: Duration = Duration::from_secs(60 * 60); +const METRICS_INTERVAL: Duration = Duration::from_secs(2); + +/// Datagram envelope used on both directions of the endpoint. +#[derive(Debug)] +pub struct Datagram { + pub peer_pubkey: Pubkey, + pub peer_address: SocketAddr, + pub message: Bytes, +} + +/// Datagram-only QUIC endpoint bound to UDP socket. The single control +/// loop task is spawned by [`Self::new`]; await `task` after +/// [`Self::close`] to observe full drain. +pub struct QuicDatagramEndpoint { + pub endpoint: Endpoint, + pub egress: mpsc::Sender, + /// Handle for rotating the local identity (TLS cert / pubkey). Wraps a + /// `tokio::sync::watch` channel; the actual swap happens asynchronously + /// in the control loop. Implements `solana_tls_utils::NotifyKeyUpdate` + /// so it slots into the validator's `KeyUpdaters` registry. + pub key_updater: Arc, + pub task: JoinHandle<()>, +} + +impl QuicDatagramEndpoint { + /// Construct a datagram-only QUIC endpoint bound to `socket`. Spawns the + /// unified control loop on `runtime`. 
Received datagrams flow into + /// `ingress` via `try_send`; full ingress channel results in a drop + /// (counted in `datagram_ingress_dropped_channel_full`). + /// + /// `allowlist` is consulted once per new connection in either direction. + /// `banlist` is consulted on every send and at handshake. + #[allow(clippy::too_many_arguments)] + pub fn new( + runtime: &tokio::runtime::Handle, + keypair: &Keypair, + socket: UdpSocket, + alpn_protocol_id: &'static [u8], + ingress: Sender, + allowlist: Arc, + banlist: Arc>, + ) -> Result { + let local_pubkey = keypair.pubkey(); + let (cert, key) = new_dummy_x509_certificate(keypair); + let server_config = new_server_config(cert.clone(), key.clone_key(), alpn_protocol_id); + let client_config = new_client_config(cert, key, alpn_protocol_id); + + let mut endpoint = { + // Endpoint::new requires being inside the runtime context, else it + // panics on its first internal `tokio::spawn`. + let _guard = runtime.enter(); + Endpoint::new( + EndpointConfig::default(), + Some(server_config), + socket, + Arc::new(TokioRuntime), + ) + .map_err(Error::Endpoint)? + }; + endpoint.set_default_client_config(client_config); + + let table = Arc::new(ConnectionTable::new()); + let stats = Arc::::default(); + let (egress_tx, egress_rx) = mpsc::channel(EGRESS_CHANNEL_CAP); + let (key_updater, identity_rx) = KeyUpdater::new(); + let key_updater = Arc::new(key_updater); + + let control = EndpointLoop { + endpoint: endpoint.clone(), + local_pubkey, + egress_rx, + ingress, + allowlist, + banlist, + identity_rx, + connections: table, + stats, + alpn: alpn_protocol_id, + }; + let task = runtime.spawn(control.run()); + + Ok(Self { + endpoint, + egress: egress_tx, + key_updater, + task, + }) + } + + /// Initiate endpoint shutdown. The control loop exits on its next + /// `endpoint.accept()` resolution (returns `None` once closed); in-flight + /// connections are closed with `SHUTDOWN`. 
Callers should `await` + /// [`Self::task`] to ensure all connections are terminated gracefully. + pub fn close(&self) { + self.endpoint + .close(close_codes::SHUTDOWN.code, close_codes::SHUTDOWN.reason); + } +} + +struct EndpointLoop { + endpoint: Endpoint, + local_pubkey: Pubkey, + egress_rx: mpsc::Receiver, + ingress: Sender, + allowlist: Arc, + banlist: Arc>, + identity_rx: watch::Receiver>>, + connections: Arc, + stats: Arc, + /// Held so that an identity rotation can rebuild server + client TLS + /// configs without revisiting the caller. + alpn: &'static [u8], +} + +impl EndpointLoop { + async fn run(mut self) { + let subnet_limit = Arc::new(SubnetRateLimiter::new()); + + let mut prune = tokio::time::interval(BANLIST_PRUNE_INTERVAL); + prune.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let mut metrics = tokio::time::interval(METRICS_INTERVAL); + metrics.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // The loop exits when egress / accept channels close - running a + // half-broken endpoint after one of those has gone away is + // pointless. The `identity_rx` arm is the lone exception: when + // the `KeyUpdater`-side `watch::Sender` is dropped we disable the + // arm via `id_closed` and keep running without identity rotation. + // + // TODO: this tolerance exists only to paper over `local-cluster` + // passing a throwaway `Arc::new(RwLock::new(None))` for the + // `admin_rpc_service_post_init` parameter to `Validator::new`. + // That Arc drops the moment `Validator::new` returns, taking the + // whole `Arc → Arc → watch::Sender` + // chain with it. Fix on the caller side is very invasive, so + // was not applied. + let mut id_closed = false; + loop { + tokio::select! { + biased; + // If identity is changed we should reconnect immediately + // to maintain coherent state. Any existing backlog of packets + // is probably invalid/irrelevant. 
+ changed = self.identity_rx.changed(), if !id_closed => { + if changed.is_err() { + // See TODO at top of loop + warn!("identity rotation channel closed; endpoint will run without rotation support"); + id_closed = true; + continue; + } + let snap = self.identity_rx.borrow_and_update().clone(); + if let Some(snap) = snap { + self.apply_identity_change(snap); + } + } + // egress to existing peers more important than accepting new + maybe_datagram = self.egress_rx.recv() => { + let Some(datagram) = maybe_datagram else { break }; + self.handle_datagram(datagram); + } + maybe_incoming = self.endpoint.accept() => { + let Some(incoming) = maybe_incoming else { break }; + self.accept_connection(incoming, &subnet_limit); + } + // when idle we can take care of bookkeeping. If these are delayed + // it is usually not a problem. + _ = prune.tick() => self.banlist.prune(), + _ = metrics.tick() => stats::report(&self.stats, self.connections.len()), + } + } + } + + /// Rebuild TLS configs against the new identity, swap them into the + /// quinn endpoint, evict every cached connection so peers re-handshake, + /// and adopt the new pubkey for the lex-direction rule going forward. 
+ fn apply_identity_change(&mut self, snap: Arc) { + let server_config = new_server_config(snap.cert.clone(), snap.key.clone_key(), self.alpn); + let client_config = new_client_config(snap.cert.clone(), snap.key.clone_key(), self.alpn); + + self.local_pubkey = snap.pubkey; + self.endpoint.set_default_client_config(client_config); + self.endpoint.set_server_config(Some(server_config)); + + let evicted = self.connections.clear_for_id_change(); + self.stats + .connection_evicted_identity_rotated + .fetch_add(evicted, Ordering::Relaxed); + info!( + "identity rotated to {} ({} connection(s) evicted)", + snap.pubkey, evicted + ); + } + + fn handle_datagram(&self, datagram: Datagram) { + let Datagram { + peer_pubkey: peer, + peer_address: addr, + message: bytes, + } = datagram; + debug_assert_ne!(self.local_pubkey, peer, "egress to self is a caller bug"); + if self.banlist.is_banned(&peer) { + return; + } + // The lex rule partitions connection directions: the lex-higher side + // only ever holds inbound (server-accepted) entries; the lex-lower side + // only ever holds outbound (we-dialed) ones. + // + // - Higher side: trust whatever inbound conn we have. The peer's + // source addr can legitimately differ from the + // gossip-published addr (NAT mapping, gossip lag), so we ignore the + // addr argument on hit. + // + // - Lower side: cached `Established`'s remote addr is exactly the + // addr we dialed. If the caller now wants a different addr (e.g. + // gossip refreshed the peer's published addr), the peer has + // moved - evict and re-dial. A `Dialing` placeholder means another + // egress already kicked off a dial for this peer; drop this + // datagram rather than queueing. + if self.local_pubkey >= peer { + if self + .connections + .send_over_inbound_connection(&peer, &bytes, &self.stats) + { + return; + } + add(&self.stats.egress_dropped_higher_pubkey); + return; + } + + // Ask the connection table whether we should spawn a dial task; + // if so, retain the generation ID it hands back. 
+ let generation = + match self + .connections + .send_over_outbound_connection(peer, addr, &bytes, &self.stats) + { + EgressDispatch::Sent | EgressDispatch::Dialing => return, + EgressDispatch::SpawnDialTask { generation } => generation, + }; + + // Carry the trigger packet into the dial task; it'll be sent on + // the new connection the moment `insert_connection` succeeds. + // This is what lets a standstill broadcast (one cert every + // `DELTA_STANDSTILL`) actually reach a peer whose connection + // had died. Followers arriving during `Dialing` still drop on the + // floor (see the `Dialing` arm of `send_over_outbound_connection`). + ClientConnection { + endpoint: self.endpoint.clone(), + peer, + addr, + id_generation: generation, + trigger: bytes, + ingress: self.ingress.clone(), + allowlist: self.allowlist.clone(), + banlist: self.banlist.clone(), + table: self.connections.clone(), + stats: self.stats.clone(), + } + .spawn(); + } + + /// Performs the cheap checks on an incoming connection, then spawns the + /// state machine that runs the handshake and serves the connection. + fn accept_connection(&self, incoming: Incoming, subnet_limit: &Arc) { + let remote_addr = incoming.remote_address(); + if remote_addr.is_ipv6() || remote_addr.ip().is_multicast() { + incoming.ignore(); + return; + } + if !incoming.remote_address_validated() { + match incoming.retry() { + Ok(()) => add(&self.stats.handshake_retry_sent), + Err(e) => { + debug!("retry() failed for {remote_addr}"); + e.into_incoming().ignore(); + } + } + return; + } + // Post-RETRY: Gate per-subnet here, + // *before* spending CPU on the TLS handshake. An attacker with a + // large IP pool can complete RETRY on each address but is bounded + // by the per-/24 burst budget (100 attempts, refilling 1/min). 
+ if !subnet_limit.admit(remote_addr.ip()) { + add(&self.stats.handshake_rejected_overload); + incoming.refuse(); + return; + } + ServerConnection { + incoming, + local_pubkey: self.local_pubkey, + ingress: self.ingress.clone(), + allowlist: self.allowlist.clone(), + banlist: self.banlist.clone(), + table: self.connections.clone(), + stats: self.stats.clone(), + } + .spawn(); + } +} + +#[cfg(test)] +mod tests { + use { + crate::{ + allowlist::AllowAll, + testutils::{ + clone_keypair, drain_matching, keypair_below, make_runtime, recv_until, + send_until_received, spawn_node, spawn_node_with, + }, + }, + bytes::Bytes, + solana_keypair::Signer, + solana_tls_utils::NotifyKeyUpdate, + std::{sync::Arc, time::Duration}, + }; + + #[test] + /// When `local_pubkey >= peer` and no cached + /// connection exists, the egress datagram is dropped (counted as + /// `egress_dropped_higher_pubkey`). Once the lower-pubkey peer dials in, + /// the higher side's egress flows via send_over_inbound on the cached inbound. + fn higher_pubkey_drops_egress_until_lower_dials_in() { + let rt = make_runtime(); + // B has the higher pubkey, A the lower. + let b = spawn_node(&rt, Arc::new(AllowAll)); + let a = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&b.pubkey())); + + // B (higher) tries to send to A (lower) before A has dialed B. Per the + // lex rule, B's client drops the datagram silently - A must not receive. + let dropped = Bytes::from_static(b"should-be-dropped"); + rt.block_on(async { + b.endpoint + .egress + .send(crate::endpoint::Datagram { + peer_pubkey: a.pubkey(), + peer_address: a.addr, + message: dropped.clone(), + }) + .await + .unwrap(); + }); + let stray = a.ingress_rx.recv_timeout(Duration::from_millis(800)); + assert!( + stray.is_err(), + "lower-pubkey peer must not receive a higher-pubkey peer's egress without first \ + dialing in; got {stray:?}" + ); + + // A dials B (correct direction per the lex rule). B's server accepts + // and caches the inbound from A. 
Trigger-drop means we need to retry + // until the dial completes and a follower lands. + let from_a = Bytes::from_static(b"from-A"); + send_until_received( + &rt, + &a.endpoint, + b.pubkey(), + b.addr, + from_a.clone(), + &b.ingress_rx, + Duration::from_secs(5), + |d| (d.message == from_a).then_some(()), + "B did not receive A's dial", + ); + drain_matching(&b.ingress_rx, Duration::from_millis(200), |d| { + d.message == from_a + }); + + // B can now reach A via send_over_inbound on the cached inbound. No + // dial needed — single send is enough. + let from_b = Bytes::from_static(b"from-B-after-A-dialed"); + rt.block_on(async { + b.endpoint + .egress + .send(crate::endpoint::Datagram { + peer_pubkey: a.pubkey(), + peer_address: a.addr, + message: from_b.clone(), + }) + .await + .unwrap(); + }); + recv_until( + &a.ingress_rx, + Duration::from_secs(5), + |d| (d.message == from_b).then_some(()), + "A did not receive B's datagram after the inbound established a connection", + ); + } + + #[test] + fn rotation_evicts_connections_and_resends_under_new_identity() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll)); + + let k1 = keypair_below(&server.pubkey()); + let k1_pk = k1.pubkey(); + let client = spawn_node_with(&rt, Arc::new(AllowAll), k1); + + // Send under K1. Server should observe message attributed to K1. + let p1 = Bytes::from_static(b"under-K1"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + p1.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k1_pk && d.message == p1).then_some(()), + "server never received message attributed to K1", + ); + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p1 + }); + + // Rotate to K2. Pick K2 also below the server so dial direction stays. 
+ let k2 = keypair_below(&server.pubkey()); + let k2_pk = k2.pubkey(); + assert_ne!(k1_pk, k2_pk, "K1 and K2 must differ"); + client + .endpoint + .key_updater + .update_key(&k2) + .expect("identity rotation accepted"); + + // The control loop applies the rotation asynchronously: rebuild TLS + // configs, evict cached connections (server sees IDENTITY_ROTATED). + // Give it a beat. + std::thread::sleep(Duration::from_millis(500)); + + // Send under K2. Server's table no longer holds the K1 entry; a + // fresh handshake under K2 establishes a new connection, and the + // server observes the message attributed to K2. + let p2 = Bytes::from_static(b"under-K2"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + p2.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k2_pk && d.message == p2).then_some(()), + "server never received message attributed to K2 after rotation", + ); + + // Client side: the rotated client should NOT have soft-banned the + // server (rotation is a local event, not a HANDOVER-style takeover). + assert!( + !client.banlist.is_banned(&server.pubkey()), + "rotation must not soft-ban peers we close ourselves" + ); + } + + #[test] + fn outbound_addr_change_redials_new_addr() { + let rt = make_runtime(); + + // S1 establishes the lex order. Client will be lex-lower. + let s1 = spawn_node(&rt, Arc::new(AllowAll)); + let s_key = clone_keypair(&s1.keypair); + let s_pubkey = s1.pubkey(); + + let client_kp = keypair_below(&s_pubkey); + let client = spawn_node_with(&rt, Arc::new(AllowAll), client_kp); + + // Initial send: client dials S1 at its addr A1. 
    /// The lex-higher node must reuse its cached inbound connection for a
    /// peer even when the caller supplies a socket address that does not
    /// match it: the server replies to the client using a deliberately bogus
    /// address and the client must still receive the datagram.
    #[test]
    fn higher_side_ignores_caller_addr_on_hit() {
        // Inbound (server-accepted) connections live on the lex-higher side.
        // Their cached `remote_address` is the peer's NAT-mapped source addr,
        // which the caller (working from gossip) may not know. The higher
        // side must trust the cached conn regardless of caller's addr.
        let rt = make_runtime();

        let server = spawn_node(&rt, Arc::new(AllowAll)); // lex-higher
        let client_kp = keypair_below(&server.pubkey());
        let client = spawn_node_with(&rt, Arc::new(AllowAll), client_kp);
        let c_pubkey = client.pubkey();

        // Client dials in so the server caches an inbound conn for c_pubkey.
        let probe = Bytes::from_static(b"open");
        send_until_received(
            &rt,
            &client.endpoint,
            server.pubkey(),
            server.addr,
            probe.clone(),
            &server.ingress_rx,
            Duration::from_secs(5),
            |d| (d.message == probe).then_some(()),
            "server did not receive client's probe",
        );

        // Server now sends back to client. The server is lex-higher and
        // already holds an `Established` for c_pubkey (the inbound from
        // the client's dial), so `send_over_inbound` goes through the cached
        // connection without a dial — single send + recv works here.
        // We deliberately hand a bogus addr to verify the higher side
        // ignores it on cache hit.
        // 203.0.113.0/24 is the TEST-NET-3 documentation range, so this
        // address can never be the client's real socket address.
        let bogus_addr: std::net::SocketAddr = "203.0.113.99:65000".parse().unwrap();
        let pay = Bytes::from_static(b"reply");
        rt.block_on(async {
            server
                .endpoint
                .egress
                .send(crate::endpoint::Datagram {
                    peer_pubkey: c_pubkey,
                    peer_address: bogus_addr,
                    message: pay.clone(),
                })
                .await
                .unwrap();
        });
        recv_until(
            &client.ingress_rx,
            Duration::from_secs(5),
            |d| (d.message == pay).then_some(()),
            "client did not receive server's reply on the cached inbound conn",
        );
    }
+ let rt = make_runtime(); + let s1 = spawn_node(&rt, Arc::new(AllowAll)); + let s_key = clone_keypair(&s1.keypair); + let s_pubkey = s1.pubkey(); + let client = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&s_pubkey)); + + let s2 = spawn_node_with(&rt, Arc::new(AllowAll), clone_keypair(&s_key)); + let s3 = spawn_node_with(&rt, Arc::new(AllowAll), clone_keypair(&s_key)); + assert!( + s1.addr != s2.addr && s2.addr != s3.addr && s1.addr != s3.addr, + "the three servers must bind distinct addrs", + ); + + let p1 = Bytes::from_static(b"to-s1"); + let p2 = Bytes::from_static(b"to-s2"); + let p3 = Bytes::from_static(b"to-s3"); + + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + s1.addr, + p1.clone(), + &s1.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p1).then_some(()), + "s1 did not receive p1", + ); + drain_matching(&s1.ingress_rx, Duration::from_millis(200), |d| { + d.message == p1 + }); + + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + s2.addr, + p2.clone(), + &s2.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p2).then_some(()), + "s2 did not receive p2 - first address change failed", + ); + drain_matching(&s2.ingress_rx, Duration::from_millis(200), |d| { + d.message == p2 + }); + + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + s3.addr, + p3.clone(), + &s3.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p3).then_some(()), + "s3 did not receive p3 - chained address change failed", + ); + + // Each server saw only its targeted datagram (plus retry duplicates + // we already drained), nothing else. 
/// All errors observed by the endpoint. Returned from public APIs and stamped
/// into [`crate::stats::QuicDatagramStats`] via `record_error`.
#[derive(Error, Debug)]
pub enum Error {
    /// The egress channel is closed. `record_error` does not count this
    /// variant in any stats bucket.
    #[error("egress channel closed")]
    EgressChannelClosed,

    /// The ingress channel is closed. `record_error` does not count this
    /// variant in any stats bucket.
    #[error("ingress channel closed")]
    IngressChannelClosed,

    /// Wraps quinn's [`ConnectError`]; the message is forwarded unchanged.
    #[error(transparent)]
    Connect(#[from] ConnectError),

    /// Wraps quinn's [`ConnectionError`]; the message is forwarded unchanged.
    #[error(transparent)]
    Connection(#[from] ConnectionError),

    /// TLS handshake succeeded but the peer cert did not yield a recoverable
    /// solana ed25519 pubkey. The connection is closed by the caller.
    #[error("invalid identity from {0:?}")]
    InvalidIdentity(SocketAddr),

    /// Peer pubkey is not in the allowlist.
    #[error("peer {0} is not admitted (unstaked)")]
    NotAdmitted(Pubkey),

    /// Peer pubkey is currently banned.
    #[error("peer {0} is banned")]
    Banned(Pubkey),

    /// Connection table already holds [`crate::MAX_PEERS`] distinct pubkeys
    /// and the incoming peer is not among them.
    #[error("connection table full")]
    TableFull,

    /// Inbound handshake from a peer whose pubkey is greater than or equal
    /// to ours. Per the lex-pubkey tiebreaker, the lower pubkey dials and the
    /// higher listens - so a peer with pubkey >= local must not be dialing
    /// us. We close the inbound; the surviving connection is the one our own
    /// client dials in the reverse direction.
    #[error("inbound from {0} rejected: wrong direction (lex tiebreaker)")]
    WrongDirection(Pubkey),

    /// Wraps quinn's [`SendDatagramError`]; the message is forwarded
    /// unchanged.
    #[error(transparent)]
    SendDatagram(#[from] SendDatagramError),

    /// Identity rotated between handshake start and the post-handshake
    /// `insert_connection`. The completed connection is authenticated
    /// under our previous cert and was closed with `IDENTITY_ROTATED`
    /// rather than inserted into the table.
    #[error("identity rotated mid-handshake; connection to {0} aborted")]
    IdentityRotated(Pubkey),

    /// `quinn::Endpoint::new` failed (e.g. socket already bound by another
    /// process). Construction-time only.
    #[error(transparent)]
    Endpoint(#[from] io::Error),
}
+ +use { + rustls::pki_types::{CertificateDer, PrivateKeyDer}, + solana_keypair::{Keypair, Signer}, + solana_pubkey::Pubkey, + solana_tls_utils::{NotifyKeyUpdate, new_dummy_x509_certificate}, + std::sync::Arc, + tokio::sync::watch, +}; + +/// Identity material derived from a keypair: the ed25519 pubkey plus the +/// self-signed TLS cert/key that the endpoint presents to peers. +pub(crate) struct IdentitySnapshot { + pub pubkey: Pubkey, + pub cert: CertificateDer<'static>, + pub key: PrivateKeyDer<'static>, +} + +impl IdentitySnapshot { + pub fn from_keypair(keypair: &Keypair) -> Self { + let (cert, key) = new_dummy_x509_certificate(keypair); + Self { + pubkey: keypair.pubkey(), + cert, + key, + } + } +} + +/// Handle for caller-driven identity rotation. Cloneable and thread-safe. +/// Implements [`NotifyKeyUpdate`] so it slots into the validator's existing +/// key-rotation plumbing. +pub struct KeyUpdater { + tx: watch::Sender>>, +} + +impl KeyUpdater { + pub(crate) fn new() -> (Self, watch::Receiver>>) { + let (tx, rx) = watch::channel(None); + (Self { tx }, rx) + } +} + +impl NotifyKeyUpdate for KeyUpdater { + fn update_key(&self, keypair: &Keypair) -> Result<(), Box> { + let snap = Arc::new(IdentitySnapshot::from_keypair(keypair)); + self.tx + .send(Some(snap)) + .map_err(|_| -> Box { + "quic-datagram endpoint has shut down; identity update rejected".into() + })?; + Ok(()) + } +} diff --git a/quic-datagram/src/lib.rs b/quic-datagram/src/lib.rs new file mode 100644 index 00000000000..976572293c7 --- /dev/null +++ b/quic-datagram/src/lib.rs @@ -0,0 +1,56 @@ +#![doc = include_str!("../README.md")] +#![cfg(feature = "agave-unstable-api")] + +pub mod allowlist; +pub mod error; + +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub mod testutils; + +pub(crate) mod client; +pub(crate) mod close_codes; +pub(crate) mod connection_table; +pub mod endpoint; +pub mod key_updater; +pub(crate) mod read_loop; +pub(crate) mod server; +pub(crate) mod stats; +pub(crate) 
mod subnet_rate_limit; +pub(crate) mod transport; + +use std::time::Duration; +pub use { + allowlist::{Allowlist, StakedNodesAllowlist}, + endpoint::QuicDatagramEndpoint, + error::Error, + key_updater::KeyUpdater, + solana_net_utils::banlist::Banlist, +}; + +/// Ban duration applied by the crate's only internal trigger: HANDOVER +/// reception in the per-connection read loop. Short window because the +/// peer may legitimately be retrying a hot-spare promotion. +pub const BAN_DURATION_SHORT: Duration = Duration::from_secs(5); + +/// Maximum number of unique peer pubkeys held in the connection table. +/// Sized for the maximum expected alpenglow staked-node count. +pub const MAX_PEERS: usize = 2000; + +/// Capacity of the egress channel held by [`QuicDatagramEndpoint`]. Senders +/// must `try_send` and accept drop-on-full. +/// +/// Sized to absorb a full slot's burst with headroom - `MAX_PEERS` peers +/// × ~4 messages/slot = ~8 K items, doubled for safety against bursty +/// drainers. Production steady-state pressure is far lower (per-validator +/// vote rate is ~1 message/slot). +pub const EGRESS_CHANNEL_CAP: usize = 16384; + +/// Per-peer receive-side rate limit. Each connection read loop +/// enforces RX rate via a token bucket; bucket starts full at +/// connection open and refills continuously. Any datagram arriving +/// when the bucket has no tokens left is dropped. +pub const MAX_DATAGRAMS_PER_SECOND_PER_PEER: f64 = 30.0; + +/// Per-peer receive side burst limit. Complementary to +/// [`MAX_DATAGRAMS_PER_SECOND_PER_PEER`], allows for bursty arrivals. +pub const BURST_DATAGRAMS_PER_SECOND_PER_PEER: u64 = 100; diff --git a/quic-datagram/src/read_loop.rs b/quic-datagram/src/read_loop.rs new file mode 100644 index 00000000000..36970b37ae5 --- /dev/null +++ b/quic-datagram/src/read_loop.rs @@ -0,0 +1,357 @@ +//! Per-connection reader task. Shared by server-accepted and client-dialed +//! connections - both push received datagrams into the same ingress channel.
+ +use { + crate::{ + BAN_DURATION_SHORT, BURST_DATAGRAMS_PER_SECOND_PER_PEER, Banlist, + MAX_DATAGRAMS_PER_SECOND_PER_PEER, close_codes, + endpoint::Datagram, + error::Error, + stats::{QuicDatagramStats, record_error}, + }, + crossbeam_channel::{Sender, TrySendError}, + log::debug, + quinn::{Connection, ConnectionError}, + solana_net_utils::token_bucket::TokenBucket, + solana_pubkey::Pubkey, + std::{ + net::SocketAddr, + sync::{Arc, atomic::Ordering}, + }, +}; + +/// Drive the per-connection read loop to completion. Returns when the +/// connection closes (peer-initiated, banlist-trip, ingress disconnect, +/// HANDOVER). Caller is responsible for reaping the connection table entry +/// afterwards. +pub(crate) async fn read_datagram_loop( + connection: Connection, + peer: Pubkey, + remote_addr: SocketAddr, + ingress: Sender, + banlist: Arc>, + stats: Arc, +) { + // Per-connection rate limiter. Any datagram arriving with the bucket + // empty is dropped. We do NOT close the connection or ban the peer here, + // honest peers legitimately burst above the refill rate during catch-up. + let rate_limit = TokenBucket::new( + BURST_DATAGRAMS_PER_SECOND_PER_PEER, + BURST_DATAGRAMS_PER_SECOND_PER_PEER, + MAX_DATAGRAMS_PER_SECOND_PER_PEER, + ); + loop { + match connection.read_datagram().await { + Ok(bytes) => { + // Banlist check happens AFTER the read so a ban that + // lands while we're awaiting can't let a follow-up + // datagram leak through to ingress. 
+ if banlist.is_banned(&peer) { + close_codes::BANNED.close(&connection); + break; + } + if rate_limit.consume_tokens(1).is_err() { + drop(bytes); + stats.datagram_rate_limited.fetch_add(1, Ordering::Relaxed); + continue; + } + match ingress.try_send(Datagram { + peer_pubkey: peer, + peer_address: remote_addr, + message: bytes, + }) { + Ok(()) => { + stats.datagrams_received.fetch_add(1, Ordering::Relaxed); + } + Err(TrySendError::Full(_)) => { + stats + .datagram_ingress_dropped_channel_full + .fetch_add(1, Ordering::Relaxed); + } + Err(TrySendError::Disconnected(_)) => { + debug!("ingress disconnected; reader for {peer} exiting"); + break; + } + } + } + Err(e) => { + if matches!(&e, ConnectionError::ApplicationClosed(c) if c.error_code == close_codes::HANDOVER.code) + { + handle_handover(peer, &banlist, &stats); + } + record_error(&Error::from(e), &stats); + break; + } + } + } +} + +/// Soft-ban the peer on `HANDOVER` close so our client side stops dialing +/// it — the peer has accepted a different instance of our identity and +/// re-dialing would risk double-voting. +fn handle_handover(peer: Pubkey, banlist: &Banlist, stats: &QuicDatagramStats) { + stats.handover_received.fetch_add(1, Ordering::Relaxed); + banlist.ban(peer, BAN_DURATION_SHORT); +} + +#[cfg(test)] +mod tests { + use { + crate::{ + BAN_DURATION_SHORT, BURST_DATAGRAMS_PER_SECOND_PER_PEER, + allowlist::AllowAll, + endpoint::Datagram, + testutils::{ + clone_keypair, drain_matching, keypair_below, make_runtime, recv_until, + send_until_received, spawn_node, spawn_node_with, + }, + }, + bytes::Bytes, + solana_keypair::Signer, + std::{sync::Arc, time::Duration}, + }; + + #[test] + fn ban_evicts_existing_and_blocks_rehandshake() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll)); + // Lex tiebreaker: dialer's pubkey must be lower than listener's. 
    /// Two client nodes share one keypair. After the second one connects,
    /// the first must have soft-banned the server (it was closed with
    /// HANDOVER), and datagrams the server addresses to that pubkey must
    /// reach only the second client.
    #[test]
    fn second_connection_with_same_keypair_handovers_first() {
        let rt = make_runtime();
        let server = spawn_node(&rt, Arc::new(AllowAll));

        // Lex rule: shared client keypair must be lower than server's.
        let shared = keypair_below(&server.pubkey());
        let c_pubkey = shared.pubkey();
        let c1 = spawn_node_with(&rt, Arc::new(AllowAll), clone_keypair(&shared));

        // C1 establishes a connection by driving send-until-receive.
        let p1 = Bytes::from_static(b"p1");
        send_until_received(
            &rt,
            &c1.endpoint,
            server.pubkey(),
            server.addr,
            p1.clone(),
            &server.ingress_rx,
            Duration::from_secs(5),
            |d| (d.message == p1).then_some(()),
            "server did not receive c1's probe",
        );
        // Discard retry duplicates of p1 before the next phase.
        drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| {
            d.message == p1
        });

        // C2 arrives with the same keypair. Its handshake completion at the
        // server triggers HANDOVER on c1's connection.
        let c2 = spawn_node_with(&rt, Arc::new(AllowAll), clone_keypair(&shared));
        let p2 = Bytes::from_static(b"p2");
        send_until_received(
            &rt,
            &c2.endpoint,
            server.pubkey(),
            server.addr,
            p2.clone(),
            &server.ingress_rx,
            Duration::from_secs(5),
            |d| (d.message == p2).then_some(()),
            "server did not receive c2's probe",
        );
        // Discard retry duplicates of p2 before the next phase.
        drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| {
            d.message == p2
        });

        let server_pk = server.pubkey();

        // C1's read loop observes the HANDOVER close and soft-bans the server.
        // Give the async read loop a moment to process the close.
        std::thread::sleep(Duration::from_millis(500));
        assert!(
            c1.banlist.is_banned(&server_pk),
            "c1 must soft-ban the server that handovered it"
        );

        // After HANDOVER the server's table holds c2 only. Server's egress to
        // c_pubkey reaches c2; c1 must not see post-handover datagrams.
        let after = Bytes::from_static(b"post-handover");
        rt.block_on(async {
            server
                .endpoint
                .egress
                .send(Datagram {
                    peer_pubkey: c_pubkey,
                    // NOTE(review): addressed with c1's socket address yet
                    // expected at c2 - presumably relies on the lex-higher
                    // side ignoring the caller-supplied address when it has
                    // a cached connection for the pubkey; confirm.
                    peer_address: c1.addr,
                    message: after.clone(),
                })
                .await
                .unwrap();
        });
        recv_until(
            &c2.ingress_rx,
            Duration::from_secs(5),
            |d| (d.message == after).then_some(()),
            "c2 (post-handover) must receive server's datagram",
        );
        let stray = c1.ingress_rx.recv_timeout(Duration::from_millis(800));
        assert!(
            stray.is_err(),
            "c1 was handovered; must not receive post-handover datagrams; got {stray:?}"
        );
    }
+ let burst = (BURST_DATAGRAMS_PER_SECOND_PER_PEER as usize) * 4; + rt.block_on(async { + for i in 0..burst { + let payload = Bytes::from(format!("burst-{i:04}").into_bytes()); + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: payload, + }) + .await + .unwrap(); + } + }); + + // Let the receiver chew through whatever fits in the bucket. + std::thread::sleep(Duration::from_millis(500)); + + // The peer must NOT have been banned - drop-only semantics. + assert!( + !server.banlist.is_banned(&client.pubkey()), + "client pubkey should NOT be banned by RX rate limit (drop-only semantics)" + ); + + // Drain whatever made it through ingress. The bucket caps how many + // post-probe datagrams can be delivered within the first refill window. + let mut delivered = 0usize; + while server + .ingress_rx + .recv_timeout(Duration::from_millis(50)) + .is_ok() + { + delivered = delivered.saturating_add(1); + } + let cap = BURST_DATAGRAMS_PER_SECOND_PER_PEER as usize; + assert!( + delivered <= cap + 2, + "delivered {delivered} datagrams post-probe, exceeds bucket capacity {cap} + slop" + ); + + // After waiting long enough for the bucket to refill, the sender should + // be able to deliver fresh datagrams on the SAME connection (proves the + // connection wasn't torn down). 
+ std::thread::sleep(Duration::from_secs(2)); + let resume = Bytes::from_static(b"after-refill"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: resume.clone(), + }) + .await + .unwrap(); + }); + recv_until( + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == resume).then_some(()), + "post-refill datagram never arrived - connection may have been torn down", + ); + } +} diff --git a/quic-datagram/src/server.rs b/quic-datagram/src/server.rs new file mode 100644 index 00000000000..68a48e15448 --- /dev/null +++ b/quic-datagram/src/server.rs @@ -0,0 +1,201 @@ +//! Server-side connection lifecycle. + +use { + crate::{ + Banlist, + allowlist::Allowlist, + close_codes, + connection_table::{ConnectionTable, InsertOutcome}, + endpoint::Datagram, + error::Error, + read_loop::read_datagram_loop, + stats::{QuicDatagramStats, record_error}, + }, + crossbeam_channel::Sender, + log::debug, + quinn::Incoming, + solana_pubkey::Pubkey, + solana_tls_utils::get_remote_pubkey, + std::sync::Arc, +}; + +/// A server-side connection representation +pub(crate) struct ServerConnection { + pub(crate) incoming: Incoming, + pub(crate) local_pubkey: Pubkey, + pub(crate) ingress: Sender, + pub(crate) allowlist: Arc, + pub(crate) banlist: Arc>, + pub(crate) table: Arc, + pub(crate) stats: Arc, +} + +impl ServerConnection { + /// Spawn a tokio task that drives this connection to completion. + /// Returns immediately; the task logs and records any error before + /// exiting. 
+ pub(crate) fn spawn(self) { + let remote_addr = self.incoming.remote_address(); + let stats = self.stats.clone(); + tokio::spawn(async move { + if let Err(err) = self.run().await { + debug!("Failed processing incoming connection from ({remote_addr}): {err:?}"); + record_error(&err, &stats); + } + }); + } + + /// post-RETRY accept, validate identity / banlist / allowlist, + /// install in the table, run the read loop, reap on exit. + async fn run(self) -> Result<(), Error> { + // Snapshot the identity generation BEFORE the handshake starts. + // If our identity rotates while quinn is driving the handshake, + // `insert_connection` below will see a mismatched gen and + // return `Stale`, and we close the resulting (wrong-identity) + // connection with `IDENTITY_ROTATED` instead of installing it. + let gen_at_start = self.table.current_generation(); + let remote_addr = self.incoming.remote_address(); + let connecting = self.incoming.accept()?; + let connection = connecting.await?; + let Some(peer) = get_remote_pubkey(&connection) else { + close_codes::INVALID_IDENTITY.close(&connection); + return Err(Error::InvalidIdentity(remote_addr)); + }; + + // Check the pubkey tiebreaker: the lower pubkey dials, the higher + // listens. An inbound from a peer with pubkey >= local + // violates the rule. + if peer >= self.local_pubkey { + close_codes::WRONG_DIRECTION.close(&connection); + return Err(Error::WrongDirection(peer)); + } + + if self.banlist.is_banned(&peer) { + close_codes::BANNED.close(&connection); + return Err(Error::Banned(peer)); + } + if !self.allowlist.allow(&peer) { + close_codes::NOT_ADMITTED.close(&connection); + return Err(Error::NotAdmitted(peer)); + } + + // On cap-induced Rejected, the table will try to evict a + // now-unadmitted peer before giving up - epoch-boundary churn + // can momentarily push the staked-set union past `MAX_PEERS`. 
+ match self.table.insert_connection_or_evict( + peer, + connection.clone(), + gen_at_start, + |pk| self.allowlist.allow(pk), + &self.stats, + ) { + InsertOutcome::Rejected => { + close_codes::TABLE_FULL.close(&connection); + return Err(Error::TableFull); + } + InsertOutcome::Stale => { + close_codes::IDENTITY_ROTATED.close(&connection); + return Err(Error::IdentityRotated(peer)); + } + InsertOutcome::Inserted | InsertOutcome::Replaced => { + self.stats.record_connection_count(self.table.len()); + } + } + + // We're already on the per-incoming task; run the read loop + // inline rather than chaining another spawn. + let stable_id = connection.stable_id(); + read_datagram_loop( + connection, + peer, + remote_addr, + self.ingress, + self.banlist, + self.stats, + ) + .await; + // Reap our slot only if it still points at *this* connection + // (a newer one may have taken our place via HANDOVER). + self.table.maybe_reap_connection(&peer, stable_id); + Ok(()) + } +} + +#[cfg(test)] +mod tests { + use { + crate::{ + allowlist::{AllowAll, StakedNodesAllowlist}, + endpoint::Datagram, + testutils::{ + drain_matching, keypair_below, make_runtime, send_until_received, spawn_node_with, + }, + }, + bytes::Bytes, + solana_keypair::{Keypair, Signer}, + std::{collections::HashMap, sync::Arc, time::Duration}, + }; + + #[test] + fn staked_peer_is_admitted_unstaked_is_rejected() { + let rt = make_runtime(); + + // Lex tiebreaker requires the dialing client's pubkey < server's. Pick + // the server keypair first, then derive client A's keypair below it. + let server_kp = Keypair::new(); + let server_pk = server_kp.pubkey(); + let a_kp = keypair_below(&server_pk); + let a_pk = a_kp.pubkey(); + let admit_map: HashMap<_, _> = std::iter::once((a_pk, 100u64)).collect(); + let server = spawn_node_with( + &rt, + Arc::new(StakedNodesAllowlist::new(admit_map)), + server_kp, + ); + + // Client A - admitted (pubkey < server's, so handshake passes lex check). 
+ let client_a = spawn_node_with(&rt, Arc::new(AllowAll), a_kp); + // Client B - not admitted. Pick below server so the rejection is by the + // allowlist check, not the lex check. + let client_b = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&server_pk)); + + let payload_a = Bytes::from_static(b"hello-from-A"); + send_until_received( + &rt, + &client_a.endpoint, + server.pubkey(), + server.addr, + payload_a.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == a_pk && d.message == payload_a).then_some(()), + "server never received payload from admitted peer A", + ); + // send_until_received may have left retry duplicates of payload_a + // in the channel; drain them before asserting B's rejection. + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == payload_a + }); + + let payload_b = Bytes::from_static(b"hello-from-B"); + rt.block_on(async { + client_b + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: payload_b.clone(), + }) + .await + .expect("egress send B"); + }); + + // Server must close the handshake before any datagram from B is queued. + let bad = server.ingress_rx.recv_timeout(Duration::from_millis(800)); + assert!( + bad.is_err(), + "unstaked peer B's datagram should not reach server ingress, got {bad:?}" + ); + } +} diff --git a/quic-datagram/src/stats.rs b/quic-datagram/src/stats.rs new file mode 100644 index 00000000000..b4c89e2bdc5 --- /dev/null +++ b/quic-datagram/src/stats.rs @@ -0,0 +1,230 @@ +use { + crate::error::Error, + quinn::{ConnectionError, SendDatagramError}, + solana_metrics::datapoint_info, + std::sync::atomic::{AtomicU64, Ordering}, +}; + +/// Counters reported every metrics tick (cadence owned by the control loop) +/// under the `votor_datagram` namespace. 
+/// +/// Error variants are collapsed into coarse buckets: the exact variant is +/// rarely actionable on its own and is still emitted to the log by +/// `record_error`'s callers. +#[derive(Default)] +pub(crate) struct QuicDatagramStats { + // --- Positive path --- + /// High-water mark of live table entries over the reporting period. + /// Updated via `fetch_max` on every successful insert; re-baselined to the + /// current occupancy at each report. + pub(crate) peak_connections: AtomicU64, + /// Datagrams successfully delivered to the ingress channel. + pub(crate) datagrams_received: AtomicU64, + /// Datagrams successfully handed to quinn for transmission. + pub(crate) datagrams_sent: AtomicU64, + /// A live connection ended through an expected teardown path + /// (application/connection close, local close, reset, timeout), or a + /// `send_datagram` found the connection already gone. This is normal + /// churn, not likely to be a fault. + pub(crate) connection_lost: AtomicU64, + /// Egress dropped because local_pubkey >= peer (we are the lex-higher + /// side, so we never dial) and no inbound connection from that peer + /// has been accepted yet. + pub(crate) egress_dropped_higher_pubkey: AtomicU64, + /// Follower egress dropped because a dial is already in flight for the + /// peer. The egress that *triggered* the dial is carried into the dial + /// task and sent the moment the connection lands; only subsequent egress + /// arriving while the slot still holds a `Dialing` placeholder is dropped. + pub(crate) egress_dropped_dial_in_progress: AtomicU64, + + // --- Connection lifecycle management --- + /// Connections closed because the local identity (TLS cert / pubkey) was + /// rotated. Each rotation evicts every cached connection in one burst. + pub(crate) connection_evicted_identity_rotated: AtomicU64, + /// Cached connection closed at insert-cap time to make room for a + /// freshly-allowed peer that displaced a no-longer-allowed one. 
+ pub(crate) connection_evicted_allowlist: AtomicU64, + /// Lex-lower side evicted a cached outbound connection because the caller + /// supplied a new socket addr for the same pubkey (peer moved). + pub(crate) connection_evicted_peer_moved: AtomicU64, + /// We closed a peer's existing connection with HANDOVER because a new + /// handshake from the same pubkey arrived. + pub(crate) connection_replaced_handover: AtomicU64, + /// A peer closed our connection with HANDOVER - we have been replaced. + pub(crate) handover_received: AtomicU64, + + // --- Error buckets --- + /// A connection failed abnormally: dial setup error, protocol-level + /// connection fault, or a non-transient `send_datagram` failure. All are + /// local/config/protocol faults that should not occur in prod for a + /// routable, correctly-built peer. + pub(crate) connect_failed: AtomicU64, + /// Handshake refused because the peer is not permitted: not in the + /// allowlist, or currently banned. + pub(crate) handshake_rejected_unauthorized: AtomicU64, + /// Handshake refused because the peer misbehaved at the protocol level: + /// unrecoverable identity, or wrong dial direction (lex tiebreaker). + pub(crate) handshake_rejected_protocol: AtomicU64, + /// Handshake refused due to a resource limit: connection table full, or + /// the source subnet exceeded its handshake-attempt budget. + pub(crate) handshake_rejected_overload: AtomicU64, + /// Peer's incoming datagram exceeded the per-connection rate. + pub(crate) datagram_rate_limited: AtomicU64, + + // --- Drops on internal channels (distinct backpressure signals) --- + /// We have received a datagram but have nowhere to put it + pub(crate) datagram_ingress_dropped_channel_full: AtomicU64, + + // --- DOS management --- + /// RETRY tokens sent to unvalidated incoming addresses. 
+ pub(crate) handshake_retry_sent: AtomicU64, +} + +#[inline] +pub(crate) fn add(metric: &AtomicU64) { + metric.fetch_add(1, Ordering::Relaxed); +} + +impl QuicDatagramStats { + /// Updates the peak-occupancy high-water mark after a successful connection + /// install. + pub(crate) fn record_connection_count(&self, table_len: u64) { + self.peak_connections + .fetch_max(table_len, Ordering::Relaxed); + } +} + +pub(crate) fn record_error(err: &Error, stats: &QuicDatagramStats) { + match err { + Error::EgressChannelClosed | Error::IngressChannelClosed => {} + Error::Connection( + ConnectionError::ApplicationClosed(_) + | ConnectionError::ConnectionClosed(_) + | ConnectionError::LocallyClosed + | ConnectionError::Reset + | ConnectionError::TimedOut, + ) + // A send that fails because the connection vanished is the same + // class of event as a connection closing under us. + | Error::SendDatagram(SendDatagramError::ConnectionLost(_)) => { + add(&stats.connection_lost) + } + Error::Connect(_) + | Error::Connection( + ConnectionError::TransportError(_) + | ConnectionError::VersionMismatch + | ConnectionError::CidsExhausted, + ) + // TooLarge / UnsupportedByPeer / Disabled: config/protocol faults, + // same class as the dial-setup and protocol bucket above. + | Error::SendDatagram( + SendDatagramError::TooLarge + | SendDatagramError::UnsupportedByPeer + | SendDatagramError::Disabled, + ) => add(&stats.connect_failed), + Error::NotAdmitted(_) | Error::Banned(_) => add(&stats.handshake_rejected_unauthorized), + Error::InvalidIdentity(_) | Error::WrongDirection(_) => { + add(&stats.handshake_rejected_protocol) + } + Error::TableFull => add(&stats.handshake_rejected_overload), + Error::IdentityRotated(_) => { + // The new connection would have been an Established under the + // rotated identity, so it counts as a rotation-driven eviction + // (just preempted before it landed). 
+ add(&stats.connection_evicted_identity_rotated) + } + Error::Endpoint(_) => {} // construction-time only; no runtime counter + } +} + +/// Emit the accumulated counters and reset them. `live_connections` is the +/// current table occupancy, sampled by the control loop; it is NOT reported on +/// its own (a single sample of a churning table at the 2s tick is noise), but +/// is used to re-baseline the peak high-water mark for the next period so a +/// steady-state population still reports its size. +pub(crate) fn report(stats: &QuicDatagramStats, live_connections: u64) { + macro_rules! swap { + ($m:expr) => { + $m.swap(0, Ordering::Relaxed) as i64 + }; + } + // Re-baseline the peak to the current occupancy: the next period's peak + // starts from where this one left off, not from zero. + let peak_connections = stats + .peak_connections + .swap(live_connections, Ordering::Relaxed) + .max(live_connections) as i64; + datapoint_info!( + "votor_datagram", + // Positive path + ("connections_peak", peak_connections, i64), + ("datagrams_received", swap!(stats.datagrams_received), i64), + ("datagrams_sent", swap!(stats.datagrams_sent), i64), + // Error buckets + ("connect_failed", swap!(stats.connect_failed), i64), + ("connection_lost", swap!(stats.connection_lost), i64), + ( + "handshake_rejected_unauthorized", + swap!(stats.handshake_rejected_unauthorized), + i64 + ), + ( + "handshake_rejected_protocol", + swap!(stats.handshake_rejected_protocol), + i64 + ), + ( + "handshake_rejected_overload", + swap!(stats.handshake_rejected_overload), + i64 + ), + // Drops + ( + "datagram_ingress_dropped_channel_full", + swap!(stats.datagram_ingress_dropped_channel_full), + i64 + ), + ( + "datagram_rate_limited", + swap!(stats.datagram_rate_limited), + i64 + ), + ( + "egress_dropped_higher_pubkey", + swap!(stats.egress_dropped_higher_pubkey), + i64 + ), + ( + "egress_dropped_dial_in_progress", + swap!(stats.egress_dropped_dial_in_progress), + i64 + ), + // Lifecycle + ( + 
"handshake_retry_sent", + swap!(stats.handshake_retry_sent), + i64 + ), + ( + "connection_evicted_identity_rotated", + swap!(stats.connection_evicted_identity_rotated), + i64 + ), + ( + "connection_evicted_allowlist", + swap!(stats.connection_evicted_allowlist), + i64 + ), + ( + "connection_evicted_peer_moved", + swap!(stats.connection_evicted_peer_moved), + i64 + ), + ( + "connection_replaced_handover", + swap!(stats.connection_replaced_handover), + i64 + ), + ("handover_received", swap!(stats.handover_received), i64), + ); +} diff --git a/quic-datagram/src/subnet_rate_limit.rs b/quic-datagram/src/subnet_rate_limit.rs new file mode 100644 index 00000000000..e0f34d257c6 --- /dev/null +++ b/quic-datagram/src/subnet_rate_limit.rs @@ -0,0 +1,129 @@ +//! Per-subnet connection-attempt rate limit on the server-accept path. +//! +//! Bounds peak CPU spent on TLS cert validation when an attacker spreads a +//! handshake flood across many source addresses. We gate **after** quinn's +//! stateless RETRY (so an off-path spoofer pays nothing here) and **before** +//! committing to the TLS handshake (so the cert + signature work is bounded +//! by the per-subnet burst budget). +//! +//! Same primitive (`KeyedRateLimiter` + `TokenBucket`) as +//! `streamer::nonblocking::connection_rate_limiter::ConnectionRateLimiter`, +//! but keyed on the IPv4 **/24 subnet prefix** rather than the full address. +//! /24 is the operational unit of an attacker pool - they share an upstream +//! allocation and rotating host bits inside a subnet shouldn't reset the +//! budget. IPv6 is rejected outright at `dispatch_accept` so this limiter +//! only ever sees v4 keys. +use { + solana_net_utils::token_bucket::{KeyedRateLimiter, TokenBucket}, + std::net::{IpAddr, Ipv4Addr}, +}; + +/// Maximum handshake attempts a single subnet can burn through before the +/// bucket is empty. Once exhausted, further attempts from that subnet are +/// refused until the refill timer credits a token. 
+pub const BURST_PER_SUBNET: u64 = 100; + +/// Refill cadence in attempts per minute. Sustained legitimate use sits at +/// roughly one new staked-validator dial per minute per subnet, so this +/// rate matches expected workload while making a flood costly. +pub const REFILL_PER_MINUTE: u64 = 1; + +/// Target entries retained. Each entry is roughly 120 B (subnet key, +/// TokenBucket state, DashMap overhead). Under the LazyLRU's 2× growth +/// ceiling, this caps memory near 100 MB. +const TARGET_CAPACITY: usize = 350_000; + +/// DashMap shard count. Power of two; sized for typical async-reactor +/// contention on the accept path. +const SHARDS: usize = 64; + +pub(crate) struct SubnetRateLimiter { + inner: KeyedRateLimiter, +} + +impl SubnetRateLimiter { + pub fn new() -> Self { + let prototype = TokenBucket::new( + BURST_PER_SUBNET, + BURST_PER_SUBNET, + REFILL_PER_MINUTE as f64 / 60.0, + ); + Self { + inner: KeyedRateLimiter::new(TARGET_CAPACITY, prototype, SHARDS), + } + } + + /// Returns true if a handshake from `ip` is admitted (a token was + /// consumed from the subnet bucket); false if the subnet has exhausted + /// its burst budget. + /// + /// Loopback addresses bypass the limit entirely - they are trusted + /// implicitly. + pub fn admit(&self, ip: IpAddr) -> bool { + if ip.is_loopback() { + return true; + } + self.inner.consume_tokens(subnet_key(ip), 1).is_ok() + } +} + +/// Normalise a source address to its subnet aggregate. IPv4 → /24. 
+fn subnet_key(ip: IpAddr) -> IpAddr { + match ip { + IpAddr::V4(v4) => { + let o = v4.octets(); + IpAddr::V4(Ipv4Addr::new(o[0], o[1], o[2], 0)) + } + // We do not support v6, but do not want to panic here either + IpAddr::V6(_v6) => IpAddr::V4(Ipv4Addr::UNSPECIFIED), + } +} + +#[cfg(test)] +mod tests { + use {super::*, std::net::Ipv4Addr}; + + #[test] + fn subnet_key_collapses_v4_host_bits() { + let a = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 1)); + let b = IpAddr::V4(Ipv4Addr::new(10, 0, 0, 254)); + assert_eq!(subnet_key(a), subnet_key(b)); + let c = IpAddr::V4(Ipv4Addr::new(10, 0, 1, 1)); + assert_ne!(subnet_key(a), subnet_key(c)); + } + + #[test] + fn burst_then_exhausts() { + let lim = SubnetRateLimiter::new(); + let ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 7)); + for _ in 0..BURST_PER_SUBNET { + assert!(lim.admit(ip), "should admit during burst"); + } + assert!(!lim.admit(ip), "should refuse after burst exhausted"); + } + + #[test] + fn host_bits_do_not_reset_budget() { + let lim = SubnetRateLimiter::new(); + for host in 0..BURST_PER_SUBNET as u8 { + let ip = IpAddr::V4(Ipv4Addr::new(203, 0, 113, host)); + assert!(lim.admit(ip)); + } + let next = IpAddr::V4(Ipv4Addr::new(203, 0, 113, 200)); + assert!( + !lim.admit(next), + "rotating host bits in the same /24 must not bypass the limit" + ); + } + + #[test] + fn loopback_bypasses_limit() { + let lim = SubnetRateLimiter::new(); + // 5× burst from loopback - must all admit since the limit doesn't + // apply to loopback (per project conduct rule: localhost is + // trusted). 
+ for _ in 0..BURST_PER_SUBNET * 5 { + assert!(lim.admit(IpAddr::V4(Ipv4Addr::LOCALHOST))); + } + } +} diff --git a/quic-datagram/src/testutils.rs b/quic-datagram/src/testutils.rs new file mode 100644 index 00000000000..5e48e998af0 --- /dev/null +++ b/quic-datagram/src/testutils.rs @@ -0,0 +1,182 @@ +#![cfg(any(test, feature = "dev-context-only-utils"))] +#![allow(clippy::arithmetic_side_effects)] + +use { + crate::{ + Banlist, + allowlist::Allowlist, + endpoint::{Datagram, QuicDatagramEndpoint}, + }, + bytes::Bytes, + crossbeam_channel::Receiver, + solana_keypair::{Keypair, Signer}, + solana_net_utils::sockets::bind_to_localhost_unique, + solana_pubkey::Pubkey, + std::{net::SocketAddr, sync::Arc, time::Duration}, + tokio::runtime::Runtime, +}; + +pub const TEST_ALPN: &[u8] = b"votor-test"; + +pub struct TestNode { + pub keypair: Keypair, + pub addr: SocketAddr, + pub endpoint: QuicDatagramEndpoint, + pub ingress_rx: Receiver, + pub banlist: Arc>, +} + +impl TestNode { + pub fn pubkey(&self) -> Pubkey { + self.keypair.pubkey() + } +} + +pub fn make_runtime() -> Runtime { + tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("tokio multi-thread runtime") +} + +/// Spawn an endpoint with `allowlist` policy and a fresh keypair. +pub fn spawn_node(rt: &Runtime, allowlist: Arc) -> TestNode { + spawn_node_with(rt, allowlist, Keypair::new()) +} + +/// Spawn with caller-supplied keypair (lets the test write a server allowlist +/// keyed on the same pubkey). 
+pub fn spawn_node_with( + rt: &Runtime, + allowlist: Arc, + keypair: Keypair, +) -> TestNode { + let socket = bind_to_localhost_unique().expect("bind UDP socket"); + let addr = socket.local_addr().expect("local addr"); + let (ingress_tx, ingress_rx) = crossbeam_channel::bounded(4096); + let banlist = Arc::new(Banlist::::default()); + let endpoint = QuicDatagramEndpoint::new( + rt.handle(), + &keypair, + socket, + TEST_ALPN, + ingress_tx, + allowlist, + banlist.clone(), + ) + .expect("QuicDatagramEndpoint::new"); + TestNode { + keypair, + addr, + endpoint, + ingress_rx, + banlist, + } +} + +/// Pull from `rx` until `cond` returns `Some(T)` or `timeout` elapses. Panics +/// with `msg` on timeout. +pub fn recv_until( + rx: &Receiver, + timeout: Duration, + mut cond: impl FnMut(&Datagram) -> Option, + msg: &str, +) -> T { + let deadline = std::time::Instant::now() + timeout; + while std::time::Instant::now() < deadline { + let remaining = deadline.saturating_duration_since(std::time::Instant::now()); + match rx.recv_timeout(remaining) { + Ok(item) => { + if let Some(t) = cond(&item) { + return t; + } + } + Err(_) => break, + } + } + panic!("{msg}"); +} + +/// Drive a send + receive round-trip, re-sending the payload every +/// `POLL_INTERVAL` until `cond` returns `Some(T)` or `timeout` elapses. +/// +/// The first egress of a fresh peer triggers a dial; the trigger +/// packet and any followers that arrive while `Dialing` is in flight +/// are dropped on the floor. So single-shot tests need to keep +/// re-sending until the dial completes and the cached `Established` +/// can carry the payload. Use this in place of +/// `rt.block_on(egress.send(...)) + recv_until(...)` whenever the +/// payload must actually reach the receiver. +/// +/// Once `cond` fires, the matching item is returned. Stray duplicates +/// of `payload` may sit in `rx` afterwards (one per retry that landed +/// after the dial completed); drain them if the test cares. 
+pub fn send_until_received<T>(
+    rt: &Runtime,
+    endpoint: &QuicDatagramEndpoint,
+    target_pk: Pubkey,
+    target_addr: SocketAddr,
+    payload: Bytes,
+    rx: &Receiver<Datagram>,
+    timeout: Duration,
+    mut cond: impl FnMut(&Datagram) -> Option<T>,
+    msg: &str,
+) -> T {
+    const POLL_INTERVAL: Duration = Duration::from_millis(100);
+    let deadline = std::time::Instant::now() + timeout;
+    while std::time::Instant::now() < deadline {
+        rt.block_on(async {
+            // Best-effort enqueue: the result is ignored on purpose, a
+            // failed `try_send` is simply retried on the next pass.
+            let _ = endpoint.egress.try_send(Datagram {
+                peer_pubkey: target_pk,
+                peer_address: target_addr,
+                message: payload.clone(),
+            });
+        });
+        if let Ok(item) = rx.recv_timeout(POLL_INTERVAL)
+            && let Some(t) = cond(&item)
+        {
+            return t;
+        }
+    }
+    panic!("{msg}");
+}
+
+/// Discard queued datagrams that match `pred` until `rx` goes quiet or
+/// `timeout` elapses, whichever comes first.
+///
+/// Used after [`send_until_received`] when the test needs to assert
+/// "no further packets" — by then `rx` may hold retry duplicates that
+/// landed after the dial completed.
+///
+/// # Panics
+///
+/// Panics if a received datagram does NOT match `pred`. An item taken off
+/// the receiver cannot be put back, so silently consuming a foreign
+/// datagram here would hide exactly the stray packet the caller is about
+/// to assert never arrived.
+pub fn drain_matching(
+    rx: &Receiver<Datagram>,
+    timeout: Duration,
+    mut pred: impl FnMut(&Datagram) -> bool,
+) {
+    // Longest single wait; a channel that stays empty this long is
+    // considered drained and ends the call early.
+    const QUIET_PERIOD: Duration = Duration::from_millis(50);
+    let deadline = std::time::Instant::now() + timeout;
+    loop {
+        let remaining = deadline.saturating_duration_since(std::time::Instant::now());
+        if remaining.is_zero() {
+            return;
+        }
+        match rx.recv_timeout(remaining.min(QUIET_PERIOD)) {
+            Ok(item) => assert!(
+                pred(&item),
+                "drain_matching consumed a datagram that does not match the drain predicate: \
+                 {item:?}"
+            ),
+            // Timed out (backlog drained) or every sender is gone: nothing
+            // left to drain either way.
+            Err(_) => return,
+        }
+    }
+}
+
+/// Generate a fresh keypair whose pubkey is strictly less than `upper`.
+///
+/// Required by client→server tests under the lex-pubkey tiebreaker: the
+/// dialer's pubkey must be lower than the listener's, otherwise the server
+/// rejects the inbound with `WRONG_DIRECTION` and the test would observe a
+/// silent connection failure.
+pub fn keypair_below(upper: &Pubkey) -> Keypair { + loop { + let k = Keypair::new(); + if &k.pubkey() < upper { + return k; + } + } +} + +pub fn clone_keypair(k: &Keypair) -> Keypair { + k.insecure_clone() +} diff --git a/quic-datagram/src/transport.rs b/quic-datagram/src/transport.rs new file mode 100644 index 00000000000..55b273c7b23 --- /dev/null +++ b/quic-datagram/src/transport.rs @@ -0,0 +1,146 @@ +use { + crate::MAX_PEERS, + quinn::{ + AckFrequencyConfig, ClientConfig, IdleTimeout, ServerConfig, TransportConfig, VarInt, + congestion::{Controller, ControllerFactory}, + crypto::rustls::{QuicClientConfig, QuicServerConfig}, + }, + quinn_proto::RttEstimator, + rustls::pki_types::{CertificateDer, PrivateKeyDer}, + solana_tls_utils::{tls_client_config_builder, tls_server_config_builder}, + std::{ + any::Any, + sync::Arc, + time::{Duration, Instant}, + }, +}; + +// Transport tuning constants. Tuned for a votor-style workload: +// long-lived connections to MAX_PEERS=2000 peers with bursty datagram traffic +// around slot boundaries. + +/// Receive datagram buffer (per endpoint, total across all peers). +/// Provisions enough room for 1 full slot at max load. +const DATAGRAM_RECEIVE_BUFFER: usize = MAX_PEERS * 8; +/// Send datagram buffer (per endpoint). Not too huge since we are +/// supposed to operate in realtime. +const DATAGRAM_SEND_BUFFER: usize = MAX_PEERS * 2; +/// Close connections after this much time without feedback from peer. +/// This should be more than reasonable max time between PING (or data) +/// frames arriving from the peer. We also want to keep this rather short +/// to make sure we reinitiate connections that are truly stuck. +const MAX_IDLE_TIMEOUT: Duration = Duration::from_secs(5); +/// QUIC keep-alive heartbeat. Must be << `MAX_IDLE_TIMEOUT` to avoid +/// connections dying when no vote traffic is available, and > MAX_ACK_DELAY +/// to make sure we actually get ACKs for every keepalive we send. 
+const KEEP_ALIVE_INTERVAL: Duration = Duration::from_millis(600); +/// `max_ack_delay` we request the peer to use via the QUIC ACK Frequency +/// extension (RFC 9799). Loosens the peer's ACK cadence to cut +/// reverse-direction packet rate. Set this ~ 1 slot to avoid ACKs during +/// normal votor operation. +const MAX_ACK_DELAY: Duration = Duration::from_millis(400); +/// Initial MTU before path discovery. Floor of 1280 matches QUIC spec. +const INITIAL_MTU: u16 = 1280; +/// Minimum MTU after path discovery. +const MIN_MTU: u16 = 1280; + +/// No-op congestion controller: fixed [`NOP_CONGESTION_WINDOW`] bytes +/// in-flight, ignores all loss and ECN-CE signals (lost_bytes ≥ 0, +/// is_persistent_congestion, ECN-marks all routed through +/// `on_congestion_event` are no-ops). Used to take quinn's +/// congestion-control machinery out of the picture for consensus +/// traffic, which is realtime and not flexible, so pacing/CWND +/// adjustments only add delay without buying anything we want. 
+const NOP_CONGESTION_WINDOW: u64 = 8 * 1024 * 1024; + +#[derive(Clone)] +struct NopCongestion; + +impl Controller for NopCongestion { + fn on_congestion_event(&mut self, _: Instant, _: Instant, _: bool, _: u64) {} + fn on_ack(&mut self, _: Instant, _: Instant, _: u64, _: bool, _: &RttEstimator) {} + fn on_mtu_update(&mut self, _: u16) {} + fn window(&self) -> u64 { + NOP_CONGESTION_WINDOW + } + fn initial_window(&self) -> u64 { + NOP_CONGESTION_WINDOW + } + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + fn into_any(self: Box) -> Box { + self + } +} + +impl ControllerFactory for NopCongestion { + fn build(self: Arc, _: Instant, _: u16) -> Box { + Box::new(NopCongestion) + } +} + +pub(crate) fn new_transport_config() -> TransportConfig { + let max_idle = + IdleTimeout::try_from(MAX_IDLE_TIMEOUT).expect("MAX_IDLE_TIMEOUT fits IdleTimeout"); + let mut ack_freq = AckFrequencyConfig::default(); + ack_freq.max_ack_delay(Some(MAX_ACK_DELAY)); + // prevent acks from being elicited by packet count + ack_freq.ack_eliciting_threshold(VarInt::from_u32(512)); + // disable reordering notifications + ack_freq.reordering_threshold(VarInt::from_u32(0)); + let mut c = TransportConfig::default(); + c.datagram_receive_buffer_size(Some(DATAGRAM_RECEIVE_BUFFER)) + .datagram_send_buffer_size(DATAGRAM_SEND_BUFFER) + .initial_mtu(INITIAL_MTU) + .min_mtu(MIN_MTU) + .mtu_discovery_config(None) + .keep_alive_interval(Some(KEEP_ALIVE_INTERVAL)) + .max_idle_timeout(Some(max_idle)) + .ack_frequency_config(Some(ack_freq)) + .congestion_controller_factory(Arc::new(NopCongestion)) + // Datagrams only - disable streams entirely. + .max_concurrent_bidi_streams(VarInt::from(0u8)) + .max_concurrent_uni_streams(VarInt::from(0u8)); + c +} + +/// Build the rustls + quinn server config. Panics on internal failures - +/// the cert/key pair comes from `new_dummy_x509_certificate` on a valid +/// solana keypair, and rustls's TLS 1.3-only setup yields an initial cipher +/// suite by construction. 
Both error paths indicate a programmer bug, not +/// a runtime condition the validator could recover from. +pub(crate) fn new_server_config( + cert: CertificateDer<'static>, + key: PrivateKeyDer<'static>, + alpn: &[u8], +) -> ServerConfig { + let mut tls = tls_server_config_builder() + .with_single_cert(vec![cert], key) + .expect("rustls accepts our self-signed solana cert/key pair"); + tls.alpn_protocols = vec![alpn.to_vec()]; + let quic = QuicServerConfig::try_from(tls) + .expect("TLS 1.3-only config yields an initial cipher suite"); + let mut cfg = ServerConfig::with_crypto(Arc::new(quic)); + cfg.transport_config(Arc::new(new_transport_config())) + .migration(false); + cfg +} + +/// Build the rustls + quinn client config. See [`new_server_config`] for +/// why the construction is panic-on-failure. +pub(crate) fn new_client_config( + cert: CertificateDer<'static>, + key: PrivateKeyDer<'static>, + alpn: &[u8], +) -> ClientConfig { + let mut tls = tls_client_config_builder() + .with_client_auth_cert(vec![cert], key) + .expect("rustls accepts our solana cert/key pair"); + tls.enable_early_data = true; + tls.alpn_protocols = vec![alpn.to_vec()]; + let quic = QuicClientConfig::try_from(tls).expect("TLS config should be valid"); + let mut cfg = ClientConfig::new(Arc::new(quic)); + cfg.transport_config(Arc::new(new_transport_config())); + cfg +} diff --git a/quic-datagram/tests/common.rs b/quic-datagram/tests/common.rs new file mode 100644 index 00000000000..765791f0f0c --- /dev/null +++ b/quic-datagram/tests/common.rs @@ -0,0 +1,2 @@ +#![allow(dead_code, clippy::arithmetic_side_effects)] +pub use solana_quic_datagram::testutils::*; diff --git a/quic-datagram/tests/dialing_state.rs b/quic-datagram/tests/dialing_state.rs new file mode 100644 index 00000000000..fb2f52028f9 --- /dev/null +++ b/quic-datagram/tests/dialing_state.rs @@ -0,0 +1,155 @@ +//! `ConnectionTableEntry::Dialing` placeholder semantics. +//! +//! 
Covers the lex-lower dialer cleaning up its placeholder on dial +//! failure so a subsequent egress to the same pubkey can retry +//! (`endpoint.connect` timeout, server identity mismatch). + +#[path = "common.rs"] +mod common; + +use { + bytes::Bytes, + common::{ + drain_matching, keypair_below, make_runtime, send_until_received, spawn_node, + spawn_node_with, + }, + solana_keypair::Keypair, + solana_net_utils::sockets::bind_to_localhost_unique, + solana_quic_datagram::{allowlist::AllowAll, endpoint::Datagram}, + std::{net::UdpSocket, sync::Arc, time::Duration}, +}; + +fn clone_keypair(k: &Keypair) -> Keypair { + k.insecure_clone() +} + +/// Hold a bound UDP socket that never speaks QUIC. The kernel queues +/// arriving packets in the socket's recv buffer; a peer dialing this +/// address sees no quinn responses and eventually times the handshake out. +fn bind_blackhole() -> UdpSocket { + bind_to_localhost_unique().expect("bind blackhole socket") +} + +#[test] +fn failed_connect_clears_placeholder() { + // Dial to a blackhole address times out (quinn max_idle = 5s per + // transport.rs::MAX_IDLE_TIMEOUT). On failure the `Dialing` placeholder is + // removed so a follow-up egress to the same pubkey at a working addr + // can spawn a fresh dial and reach the server. + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll)); + let s_pubkey = server.pubkey(); + + let client = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&s_pubkey)); + + // Pretend gossip published a bogus addr for `s_pubkey`. The dial will + // fail; if the placeholder is not cleaned up, the follow-up below + // would see `Occupied(Dialing)` and drop forever. 
+ let blackhole = bind_blackhole(); + let bogus_addr = blackhole.local_addr().expect("blackhole addr"); + + let p1 = Bytes::from_static(b"p1-blackhole"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: bogus_addr, + message: p1.clone(), + }) + .await + .unwrap(); + }); + + // Give the dial enough time to fail. transport.rs sets + // max_idle_timeout = 5s, but quinn's handshake PTO + retransmits can + // run longer than the configured idle before the local watchdog + // fires; wait a generous margin. + std::thread::sleep(Duration::from_secs(8)); + + let p2 = Bytes::from_static(b"p2-real"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + server.addr, + p2.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p2).then_some(()), + "server did not receive the retry - placeholder not cleared after dial failure", + ); + + // p1 went to the blackhole; server should never see it. Drain p2 + // retry duplicates, then assert no foreign packet arrives. + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p2 + }); + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?}; p1 should have been lost to the blackhole", + ); + drop(blackhole); +} + +#[test] +fn identity_mismatch_clears_placeholder() { + // Dial to a server whose attested identity differs from the pubkey + // the caller targeted: the client closes the conn with INVALID_IDENTITY + // and removes its `Dialing` placeholder. A follow-up egress with the + // correct pubkey/addr must succeed. + let rt = make_runtime(); + + // Two servers with distinct keypairs at distinct addrs. We send to + // (s2_pk, s1.addr) - quinn handshake succeeds, then the attested pk + // (s1's) does not match the caller-requested s2_pk → INVALID_IDENTITY. 
+ let s1 = spawn_node(&rt, Arc::new(AllowAll)); + let s2 = spawn_node(&rt, Arc::new(AllowAll)); + + // Client must be lex-lower than both servers (lex tiebreak - only the + // lower side dials). + let lower_bound = s1.pubkey().min(s2.pubkey()); + let client = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&lower_bound)); + + let s2_pk = s2.pubkey(); + let s2_addr = s2.addr; + let bad = Bytes::from_static(b"to-wrong-addr"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s2_pk, + peer_address: s1.addr, + message: bad.clone(), + }) + .await + .unwrap(); + }); + + // Give the bogus dial time to complete its handshake, observe the + // identity mismatch, and clear the placeholder. + std::thread::sleep(Duration::from_secs(1)); + + // Same pk, now at the real addr. Must succeed. + let good = Bytes::from_static(b"to-right-addr"); + send_until_received( + &rt, + &client.endpoint, + s2_pk, + s2_addr, + good.clone(), + &s2.ingress_rx, + Duration::from_secs(5), + |d| (d.message == good).then_some(()), + "S2 did not receive retry - placeholder not cleared after INVALID_IDENTITY", + ); + + // S1 must not see either datagram. The bogus one was closed before + // any data flowed; the good one targeted s2. + let stray = s1.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!(stray.is_err(), "S1 unexpectedly received {stray:?}",); + let _ = clone_keypair(&s1.keypair); // silence unused-fn warning if any +} diff --git a/quic-datagram/tests/peer_moved.rs b/quic-datagram/tests/peer_moved.rs new file mode 100644 index 00000000000..5a0d51a741d --- /dev/null +++ b/quic-datagram/tests/peer_moved.rs @@ -0,0 +1,107 @@ +//! Caller-driven addr refresh: when a caller (e.g. votor via a gossip +//! refresh) hands the endpoint a new SocketAddr for a pubkey already in +//! the connection table, the lex-lower (dialer) side must evict its +//! stale outbound connection and re-dial the new addr. Lex-higher +//! 
cached entries are server-accepted (peer's NAT-mapped source addr +//! can legitimately differ from gossip's published one) and are trusted +//! regardless of the caller's claimed addr. + +#[path = "common.rs"] +mod common; + +use { + bytes::Bytes, + common::{ + drain_matching, keypair_below, make_runtime, send_until_received, spawn_node, + spawn_node_with, + }, + solana_net_utils::sockets::bind_to_localhost_unique, + solana_quic_datagram::{allowlist::AllowAll, endpoint::Datagram}, + std::{sync::Arc, time::Duration}, +}; + +#[test] +fn egress_during_dial_dropped_addr_recovers_after_timeout() { + // Mid-dial address change: while a dial to addr A1 is still in flight + // (blackhole'd, will time out), the caller queues an egress with a + // different addr A2 for the same pubkey. The state-machine drops it + // (placeholder is `Dialing`, not `Established`, so the addr-mismatch + // eviction path doesn't apply). Once the first dial times out and + // clears the placeholder, a fresh egress to A2 must succeed. + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll)); + let s_pubkey = server.pubkey(); + let client = spawn_node_with(&rt, Arc::new(AllowAll), keypair_below(&s_pubkey)); + + let blackhole = bind_to_localhost_unique().expect("blackhole socket"); + let blackhole_addr = blackhole.local_addr().expect("blackhole addr"); + + // 1. start dial to blackhole - server's pubkey, wrong addr. + let p1 = Bytes::from_static(b"dial-to-blackhole"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: blackhole_addr, + message: p1.clone(), + }) + .await + .unwrap(); + }); + // Give the dial a beat to install its `Dialing` placeholder. + std::thread::sleep(Duration::from_millis(200)); + + // 2. while `Dialing`, send to the REAL addr. Should be dropped (we do + // not buffer; one dial task per peer at a time). 
+ let p2 = Bytes::from_static(b"during-dial-dropped"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: server.addr, + message: p2.clone(), + }) + .await + .unwrap(); + }); + // Confirm server does NOT see p2 within the dial-in-progress window. + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?} while dial-in-flight" + ); + + // 3. wait for the blackhole dial to time out, clearing the placeholder. + std::thread::sleep(Duration::from_secs(8)); + + // 4. retry the real addr - must succeed. + let p3 = Bytes::from_static(b"post-timeout-retry"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + server.addr, + p3.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p3).then_some(()), + "server did not receive post-timeout retry", + ); + + // p1 and p2 never reach the server: p1 went to the blackhole, p2 was + // dropped while `Dialing`. Drain p3 retry duplicates, then assert + // no foreign packet arrives. 
+ drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p3 + }); + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?} after the retry landed", + ); + drop(blackhole); +} diff --git a/quic-datagram/tests/tiebreak.rs b/quic-datagram/tests/tiebreak.rs new file mode 100644 index 00000000000..5372942fc9f --- /dev/null +++ b/quic-datagram/tests/tiebreak.rs @@ -0,0 +1,142 @@ +#[path = "common.rs"] +mod common; + +use { + bytes::Bytes, + common::{TestNode, make_runtime, spawn_node}, + solana_pubkey::Pubkey, + solana_quic_datagram::{allowlist::AllowAll, endpoint::Datagram}, + std::{collections::HashMap, net::SocketAddr, sync::Arc, time::Duration}, +}; + +#[test] +/// Lex-pubkey tiebreaker - verifies the simultaneous-dial race resolves to +/// a single connection per peer in each side's table. +/// +/// Setup: N=32 nodes all mutually-admitted, every node sends to every other +/// at the same time. +/// +/// Assertion: after warm-up settles, a follow-up "burst" of one unique +/// payload per (sender, receiver) pair must arrive exactly once at every +/// receiver - no duplicates from dual flows. +fn tiebreak_no_duplicates_after_settle() { + const N: usize = 32; + let rt = make_runtime(); + let nodes: Vec = (0..N) + .map(|_| spawn_node(&rt, Arc::new(AllowAll))) + .collect(); + let pubkeys: Vec = nodes.iter().map(|n| n.pubkey()).collect(); + let addrs: Vec = nodes.iter().map(|n| n.addr).collect(); + + // Warm-up: every node concurrently sends one payload to every other - + // provokes the simultaneous-dial race for every pair. 
+ rt.block_on(async { + let mut tasks = Vec::new(); + for (i, n) in nodes.iter().enumerate() { + let egress = n.endpoint.egress.clone(); + let targets: Vec = (0..N) + .filter(|&j| j != i) + .map(|j| Datagram { + peer_pubkey: pubkeys[j], + peer_address: addrs[j], + message: Bytes::from_static(b"warmup"), + }) + .collect(); + tasks.push(tokio::spawn(async move { + for t in targets { + let _ = egress.send(t).await; + } + })); + } + for t in tasks { + let _ = t.await; + } + }); + + // Allow the race to resolve: WRONG_DIRECTION close needs ~1 RTT + // (localhost: microseconds) plus task scheduling. Generous slack. + std::thread::sleep(Duration::from_secs(2)); + + // Drain warm-up ingress so it can't pollute the burst-phase counts. + for n in &nodes { + while n.ingress_rx.try_recv().is_ok() {} + } + + // Burst: one unique payload per ordered pair. With clean dedup each + // receiver sees each sender's payload exactly once. + rt.block_on(async { + let mut tasks = Vec::new(); + for (i, n) in nodes.iter().enumerate() { + let egress = n.endpoint.egress.clone(); + let targets: Vec = (0..N) + .filter(|&j| j != i) + .map(|j| { + let payload = Bytes::copy_from_slice(format!("burst-{i}-{j}").as_bytes()); + Datagram { + peer_pubkey: pubkeys[j], + peer_address: addrs[j], + message: payload, + } + }) + .collect(); + tasks.push(tokio::spawn(async move { + for t in targets { + let _ = egress.send(t).await; + } + })); + } + for t in tasks { + let _ = t.await; + } + }); + + std::thread::sleep(Duration::from_secs(1)); + + let mut duplicates = Vec::new(); + let mut losses = Vec::new(); + for (j, n) in nodes.iter().enumerate() { + let mut counts: HashMap = HashMap::new(); + while let Ok(d) = n.ingress_rx.try_recv() { + let s = String::from_utf8_lossy(&d.message).into_owned(); + *counts.entry(s).or_insert(0) += 1; + } + for i in 0..N { + if i == j { + continue; + } + let key = format!("burst-{i}-{j}"); + let c = counts.get(&key).copied().unwrap_or(0); + match c { + 1 => {} + 0 => 
losses.push(format!("node {j}: lost '{key}'")), + _ => duplicates.push(format!("node {j}: got {c} copies of '{key}'")), + } + } + } + + let total_pairs = N * (N - 1); + assert!( + duplicates.is_empty(), + "{}/{} pairs duplicated (lex tiebreaker failed):\n{}", + duplicates.len(), + total_pairs, + duplicates + .iter() + .take(10) + .cloned() + .collect::>() + .join("\n") + ); + assert!( + losses.is_empty(), + "{}/{} pairs lost their burst datagram (warm-up did not establish a connection):\n{}", + losses.len(), + total_pairs, + losses + .iter() + .take(10) + .cloned() + .collect::>() + .join("\n") + ); +} diff --git a/validator/src/admin_rpc_service.rs b/validator/src/admin_rpc_service.rs index 13366d1598e..634ef21ef0b 100644 --- a/validator/src/admin_rpc_service.rs +++ b/validator/src/admin_rpc_service.rs @@ -1372,8 +1372,7 @@ mod tests { KeyUpdaterType::TpuVote, KeyUpdaterType::Forward, KeyUpdaterType::RpcService, - KeyUpdaterType::Bls, - KeyUpdaterType::BlsConnectionCache, + KeyUpdaterType::VotorDatagram, ]) ); let mut io = MetaIoHandler::default(); diff --git a/validator/src/commands/run/execute.rs b/validator/src/commands/run/execute.rs index 4a43fa3a41c..fc31c203f43 100644 --- a/validator/src/commands/run/execute.rs +++ b/validator/src/commands/run/execute.rs @@ -45,7 +45,7 @@ use { validator::{ BlockProductionMethod, BlockVerificationMethod, SchedulerPacing, Validator, ValidatorConfig, ValidatorLogConfig, ValidatorStartProgress, ValidatorTpuConfig, - is_snapshot_config_valid, + VotorTransportProtocol, is_snapshot_config_valid, }, }, solana_genesis_utils::MAX_GENESIS_ARCHIVE_UNPACKED_SIZE, @@ -867,6 +867,7 @@ pub fn execute( Arc::new(AtomicBool::new(false)), )] .into(), + votor_transport_protocol: VotorTransportProtocol::default(), voting_service_test_override: None, snapshot_packager_niceness_adj: value_t_or_exit!( matches, diff --git a/votor-changes.md b/votor-changes.md new file mode 100644 index 00000000000..cdcc104ccaf --- /dev/null +++ b/votor-changes.md 
@@ -0,0 +1,302 @@ +# Votor QUIC Datagram Change Stack + +## Why This Stack Is Shaped This Way + +The goal is to switch votor from QUIC stream mode to QUIC datagram mode. The stack is intentionally organized so the protocol pieces and the production behavior flip are not reviewed all at once. + +The key idea is: + +```text +First: keep the current stream setup working +Then: add the datagram setup beside it +Then: wire both through an explicit transport enum +Finally: flip the default from stream to datagram +``` + +That is why the core wiring commit introduces `VotorTransportProtocol::{ QuicStream, QuicDatagram }` and `AlpenglowTransport::{ QuicStream, QuicDatagram }` instead of directly deleting the old path. The enum makes the old behavior and new behavior explicit. The first integration commit keeps `QuicStream` as the default, so review can focus on whether the datagram path is wired correctly without also reviewing a rollout. The final commit changes only the default to `QuicDatagram`, making the actual behavior switch small and obvious. + +Why this approach is useful: + +- It preserves the current votor stream path while the datagram implementation is introduced. +- It makes rollback simple: use `QuicStream` again instead of reverting the whole transport implementation. +- It keeps the production behavior switch isolated to one small commit. +- It separates protocol-sensitive changes from implementation plumbing where possible. +- It reduces risk to unrelated validator paths: Tower, ReplayStage, TPU transaction networking, TPU vote networking, and non-votor vote paths are not supposed to change. + +Top-level commit flow: + +1. `c21e1fc76d` - Preserve peer identity in votor fanout targets. + Votor still sends through `ConnectionCache` streams, but fanout now carries `(Pubkey, SocketAddr)` so the later datagram path can target authenticated peers. + +2. `fbd2d8e721` - Normalize BLS sigverify input bytes. 
+ BLS sigverify still receives old streamer `PacketBatch` input, but internally converts it into authenticated byte items. This prepares sigverify for datagram input without changing the transport yet. + +3. `edbf3b9730` - Add the unused `solana-quic-datagram` endpoint crate. + This introduces the generic datagram endpoint, allowlist, banlist, connection handling, transport tuning, and endpoint tests without wiring it into validator startup. + +4. `c5ab519d4f` - Test connection semantics. + This validates the important datagram transport rules such as one connection per peer, dialing placeholders, tiebreaking, and peer movement before votor depends on them. + +5. `b65c44561f` - Add the votor datagram wrapper and benchmark. + This creates the votor-specific wrapper around the generic endpoint, including ALPN and allowlist construction, but still does not switch production votor. + +6. `c1517add2c` - Add an additive datagram send path to `VotingService`. + `VotingService::new()` still uses `ConnectionCache -> QUIC stream`. A new `VotingService::new_datagram()` path sends to the datagram egress channel, but nobody uses it by default yet. + +7. `c3730df0a1` - Test datagram identity rotation through the votor wrapper. + This verifies that the key updater exposed by the votor wrapper works before validator startup registers it. + +8. `e77dcd9691` - Wire votor datagram transport behind config. + Validator and TVU can now select either stream mode or datagram mode through `VotorTransportProtocol` and `AlpenglowTransport`. The default remains `QuicStream`, so this is integration without rollout. + +9. `05b3d50d24` - Default votor to QUIC datagrams. + This is the rollout commit. It changes the default from `QuicStream` to `QuicDatagram` and updates the admin RPC test expectation for the datagram key updater. 
+ +Bottom line: the stack first makes the new datagram path possible, then wires it through the current validator setup as a selectable mode, and only at the end changes the default. That is the cleanest review shape for a transport change because the final behavior switch is easy to see and easy to revert. + +## Differences From `pr-12572` + +This stack is no longer trying to be final-code-identical to `pr-12572`. The core goal is still the same: make votor use QUIC datagrams instead of QUIC streams. The important difference is how the change is introduced. + +`pr-12572` is closer to "replace stream mode with datagram mode now." This stack is closer to "add datagram mode, keep stream mode as a selectable fallback, default to datagram, and remove stream mode later only after the new path is validated." + +The intentional differences are: + +1. Validator and TVU keep two explicit transport modes. `QuicStream` preserves the old BLS `ConnectionCache` plus simple-qos streamer path, while `QuicDatagram` builds the new datagram endpoint and routes datagram ingress into BLS sigverify. This is different from `pr-12572`, which more directly wires startup into datagrams. + +2. The old BLS streamer path remains selectable. That is not protocol-required for datagrams, but it gives a fallback and lets the default flip be reviewed separately from the implementation. + +3. BLS sigverify accepts both stream packet batches and authenticated datagrams. This is needed because the stack preserves both transports; the verifier should only care about authenticated bytes and the remote pubkey, not which transport produced them. + +4. `VotingService` gains datagram sending additively. `VotingService::new()` keeps using `ConnectionCache -> QUIC stream`, while `VotingService::new_datagram()` sends to the datagram egress channel. `pr-12572` is more direct because it replaces the main voting-service path sooner. + +5. 
`StakedValidatorsCache` keeps compatibility constructors while adding allowlist support for datagram admission. That avoids forcing stream-mode callers to understand the datagram allowlist before the transport switch. + +6. Local-cluster and tests differ because this stack has both modes for a while. Tests cover the old fallback and the new default path until stream mode is intentionally removed in a later cleanup. + +7. Lockfile differences should be reviewed carefully. Some dependency movement follows from the staged implementation, but any incidental lockfile churn should be squashed away or regenerated cleanly before publishing. + + +## `c21e1fc76d` - `votor: preserve peer identity in fanout targets` + +What it does: + +This commit threads peer identity through votor fanout by changing cached targets and test override listeners from bare socket addresses to `(Pubkey, SocketAddr)` style data. It keeps the existing `ConnectionCache` QUIC stream send path in place; the new pubkey is carried only so a later datagram sender can target authenticated peers. + +Why it exists: + +QUIC datagram sending needs a peer identity, not just an address, because the datagram endpoint/connection table targets authenticated peers by `Pubkey`. This commit threads that identity through votor fanout before changing transport. + +What it does not do: + +No datagram crate, no BLS sigverify changes, no validator wiring, no banlist/allowlist changes, no production transport switch. + +## `fbd2d8e721` - `core: normalize BLS sigverify input bytes` + +What it does: + +This commit refactors BLS sigverify so the verifier first works with normalized authenticated byte items rather than directly parsing streamer packets everywhere. The actual input is still the old `PacketBatch` receiver, and the consensus message bytes are deserialized with `wincode::deserialize_exact(...)` to match votor serialization and reject trailing data. 
+ +Why it exists: + +Datagram ingress will naturally produce authenticated `(remote_pubkey, bytes)` items instead of streamer `PacketBatch` values. This commit isolates the verifier from packet-specific parsing first, so the later datagram switch can replace the producer without rewriting vote/certificate verification logic at the same time. + +What it does not do: + +No datagram receiver, no banlist type change, no validator/TVU wiring, no voting-service changes, and no production transport switch. + +## `edbf3b9730` - `quic-datagram: add unused endpoint crate` + +What it does: + +This commit adds the new generic `solana-quic-datagram` crate and its endpoint API, including authenticated datagram egress/ingress, allowlist and banlist support, one-connection-per-peer handling, datagram-only QUIC transport configuration, identity rotation, basic hardening, metrics, and endpoint tests. Nothing in validator, TVU, BLS sigverify, or votor production wiring uses the crate yet. + +Why it exists: + +The previous two commits prepared votor egress to carry peer identity and prepared BLS sigverify to consume authenticated bytes. This commit supplies the transport object that can produce and consume that shape: authenticated peer-keyed QUIC datagrams with explicit admission, banning, connection ownership, and datagram-only QUIC settings. + +Review note: + +As a stack step, this isolates production risk well. As an individual commit, it is still large: it combines the base endpoint, connection table, hardening/tuning, metrics, and identity-rotation policy. If we wanted an even more reviewable stack, this commit could be split into smaller transport-only commits. For this staged branch, this is the foundation commit that later votor and validator commits build on, even though the final shape now intentionally differs from `pr-12572` by keeping a selectable stream fallback. 
+ +## `c5ab519d4f` - `quic-datagram: test connection semantics` + +What it does: + +This commit adds focused integration tests for the datagram endpoint connection semantics: lexicographic peer tiebreaking, failed `Dialing` cleanup, peer address movement, and nextest config for this networking-heavy crate. + +Why it exists: + +The previous commit added a lot of new transport machinery. The highest-risk parts are not raw `send_datagram` calls; they are the state-machine rules around who owns a connection, when a connection may be reused, when a dial may be retried, and how address changes are handled. These tests lock down those rules before any votor or validator code starts depending on them. + +What it does not do: + +No endpoint implementation changes, no votor changes, no validator/TVU wiring, no BLS sigverify changes, no banlist or allowlist behavior changes, and no production transport switch. + +NOTE: this commit could probably be submitted in the same PR as the previous commit (`edbf3b9730`), since it only adds tests for that crate. + +## `b65c44561f` - `votor: add datagram endpoint wrapper and benchmark` + +What it does: + +This commit adds the votor-specific wrapper around the generic datagram endpoint: alpenglow ALPN, endpoint spawn helper, staked-node allowlist construction, and a fanout/fanin benchmark example. + +Why it exists: + +The transport crate is intentionally generic. Votor still needs a small integration layer to pin down alpenglow-specific protocol choices and to translate the validator bank/stake view into the transport allowlist. This commit creates that layer before touching `VotingService`, TVU, or validator startup. + +Why this is the best next step: + +After the generic transport and its tests, this is the narrowest useful votor-facing step: define the wire ALPN and allowlist construction once, in `agave-votor`, while production still uses the old streamer/`ConnectionCache` path. Later commits can consume this wrapper instead of duplicating protocol constants and stake-admission logic inside `core` or `validator`.
+ +Review note: + +The wrapper is directly relevant to the datagram switch because ALPN and staked admission are protocol/compatibility boundaries. The benchmark is not required for the minimal production diff; it is additive validation/performance tooling. If we were minimizing review surface aggressively, the benchmark could be split from the wrapper or moved to a separate perf PR. + +What it does not do: + +No `VotingService` send-path change, no validator startup wiring, no TVU/BLS sigverify channel changes, no production endpoint construction, no banlist behavior change, no `ConnectionCache` removal, and no production transport switch. + +## `c1517add2c` - `votor: add additive datagram send path` + +Before this commit, `VotingService` had exactly one way to send votes: + +```text +VotingService -> ConnectionCache -> QUIC stream +``` + +The send logic was baked directly into the service. + +This commit changes that to: + +```text +VotingService -> "some sender" +``` + +Then it defines two possible senders: + +```text +ConnectionCacheMessageSender -> QUIC stream +DatagramMessageSender -> QUIC datagram channel +``` + +But the default constructor still picks the old one: + +```text +VotingService::new() + -> ConnectionCacheMessageSender + -> QUIC stream +``` + +So after this commit, production is still: + +```text +VotingService -> ConnectionCache -> QUIC stream +``` + +The only new thing is that we now have another constructor: + +```text +VotingService::new_datagram() + -> DatagramMessageSender + -> QUIC datagram channel +``` + +But nobody calls that yet. + +What it does: + +This is the code change behind the diagrams above: the final send operation becomes pluggable, while production still uses the old stream sender. + +Why it exists: + +At this point the stack has a datagram transport and a votor wrapper, but `VotingService` still only knows how to send through `ConnectionCache` streams.
This commit introduces the smallest useful production-adjacent abstraction: keep the old broadcast pipeline, serialization, target selection, and service thread, but make the final send operation pluggable. + +Why this is the best next step: + +This is the correct point to touch `VotingService` because peer identity has already been preserved in fanout targets, and the votor datagram wrapper already exists. Making the datagram path additive lets reviewers inspect the send-path change without also reviewing validator startup, TVU ingress, BLS sigverify channel changes, or a default behavior flip. + +Review note: + +This commit is close to the minimal votor egress diff. It avoids removing `ConnectionCache` and avoids changing existing callers of `VotingService::new(...)`. The main non-egress addition is allowlist refresh plumbing in `StakedValidatorsCache`; that is needed for datagram endpoint admission, but it is an implementation coupling to review carefully because endpoint allowlist updates are now driven by the same cache refresh path used for fanout target discovery. + +What it does not do: + +No validator constructs the datagram endpoint yet. No validator calls `VotingService::new_datagram(...)` yet. No inbound datagram receiver is wired into BLS sigverify. No TVU fields change in this commit. No old stream path is removed, no `ConnectionCache` behavior changes for existing callers, and no production transport switch happens here. + +## `c3730df0a1` - `votor: test datagram identity rotation` + +What it does: + +This commit adds a test that sends through the votor datagram wrapper, rotates the client identity through the wrapper key updater, and verifies later traffic arrives under the new identity. + +Why it exists: + +The generic `solana-quic-datagram` crate already tests the lower-level `KeyUpdater` mechanics. This commit verifies that the votor wrapper actually exposes that key updater correctly. 
That matters because validator wiring will later register the returned key updater in the validator key-update registry. + +Why this is the best next step: + +The previous commit made datagram sending available to `VotingService`, but validator startup still has not constructed the endpoint or registered identity-rotation support. Before adding that startup wiring, this test checks the votor-facing endpoint wrapper contract: spawn returns an endpoint whose `key_updater` can rotate the QUIC/TLS identity and send subsequent datagrams under the new pubkey. + +What it does not do: + +No production code changes, no `VotingService` change, no validator/TVU wiring, no BLS sigverify change, no default behavior change, no new protocol constant, and no production transport switch. + +## `e77dcd9691` - `core: wire votor datagram transport behind config` + +Simple dataflow after this commit: + +```text +Default / old path: +VotingService -> ConnectionCache -> QUIC stream +Inbound QUIC simple_qos streamer -> PacketBatch -> BLS sigverify +``` + +```text +Configured datagram path: +VotingService::new_datagram() -> datagram egress channel -> QuicDatagramEndpoint +QuicDatagramEndpoint ingress channel -> Datagram -> BLS sigverify +``` + +What it does: + +This commit wires the stream and datagram votor transports through explicit config enums in validator and TVU. `QuicStream` still builds the old BLS `ConnectionCache` plus simple-qos streamer path, `QuicDatagram` builds the new datagram endpoint and routes datagram ingress into BLS sigverify, and the default remains `QuicStream` in this commit. + +Why it exists: + +The earlier commits made datagram sending and the datagram endpoint available, but nothing in the validator could actually choose it. This commit connects the pieces end to end under a config enum: validator startup can construct the datagram endpoint, TVU can route datagram ingress into BLS sigverify, and the BLS voting service can use the datagram egress channel. 
+ +Why this is the best next step: + +This is the right place for the first full integration because the previous commits isolated the risky pieces: peer identity in fanout, normalized BLS input bytes, the generic endpoint, votor wrapper, and additive `VotingService::new_datagram(...)`. With those pieces already reviewed, this commit can focus on wiring and preserving the old path behind `VotorTransportProtocol::QuicStream`. + +Review note: + +This is a real integration commit and should get careful review. It touches validator startup, TVU initialization, BLS sigverify, banlist abstraction, key-updater registration, and local-cluster config cloning. The important safety property is that the default remains `QuicStream`, so the normal path should still be the old stream-mode behavior until the final default-flip commit. The main review questions are whether the datagram endpoint task lifetime is acceptable, whether the stub-channel behavior outside Testnet/Development is intentional, and whether the new generic banlist trait preserves old ban semantics for stream mode. + +What it does not do: + +It does not make datagrams the default. It does not remove the old simple-qos streamer path. It does not remove the old BLS `ConnectionCache` path. It does not add a public CLI flag for selecting the transport. It does not change TPU transaction networking, TPU vote transaction networking, Tower consensus logic, ReplayStage consensus logic, or non-votor vote paths beyond passing the selected alpenglow transport into TVU. + + +## `05b3d50d24` - `validator: default votor to quic datagrams` + +What it does: + +This commit flips `VotorTransportProtocol::default()` from `QuicStream` to `QuicDatagram` and updates the admin RPC identity-update test expectation for the datagram key updater. + +Why it exists: + +The previous commit wired both transports behind `VotorTransportProtocol`, but intentionally kept `QuicStream` as the default. 
This commit performs the actual behavior switch: votor now uses the QUIC datagram path by default. + +Why this is the best next step: + +The default flip belongs after the dual-path wiring commit. At that point the old stream path still exists for fallback, the datagram endpoint is wired end to end, BLS sigverify can consume datagram ingress, and `VotingService` can send through datagram egress. Keeping this as a separate commit makes the production behavior change obvious and easy to revert without removing the datagram implementation. + +Review note: + +This is a tiny diff but a high-impact commit. The code changed here is only the default enum variant and an admin RPC test expectation, but the behavior it selects is the full datagram transport path added in the earlier commits. Review should treat this as the rollout point, not as a harmless cleanup. + +What it does not do: + +It does not remove `QuicStream`. It does not remove the old simple-qos BLS streamer path. It does not remove the old BLS `ConnectionCache` path. It does not add a CLI flag for selecting transport. It does not change Tower consensus, ReplayStage consensus, TPU transaction networking, TPU vote transaction networking, or non-votor vote paths. 
diff --git a/votor/Cargo.toml b/votor/Cargo.toml index 50794e2696d..7ef3ba5c4a1 100644 --- a/votor/Cargo.toml +++ b/votor/Cargo.toml @@ -33,6 +33,7 @@ agave-logger = { workspace = true } agave-math-utils = { workspace = true } agave-votor-messages = { workspace = true } bitvec = { workspace = true } +bytes = { workspace = true } crossbeam-channel = { workspace = true } itertools = { workspace = true } lazy-lru = { workspace = true } @@ -62,6 +63,7 @@ solana-ledger = { workspace = true } solana-measure = { workspace = true } solana-metrics = { workspace = true } solana-pubkey = { workspace = true } +solana-quic-datagram = { workspace = true } solana-rpc = { workspace = true } solana-runtime = { workspace = true } solana-signature = { workspace = true } @@ -74,16 +76,21 @@ solana-transaction-error = { workspace = true } solana-vote = { workspace = true } solana-vote-program = { workspace = true } thiserror = { workspace = true } +tokio = { workspace = true, features = ["sync", "rt"] } wincode = { workspace = true, features = ["alloc"] } [dev-dependencies] agave-votor = { path = ".", features = ["agave-unstable-api", "dev-context-only-utils"] } +clap = { version = "4.5.58", features = ["derive"] } +num_cpus = { workspace = true } rand = { workspace = true } solana-net-utils = { path = "../net-utils", features = ["agave-unstable-api"] } solana-perf = { path = "../perf", features = ["agave-unstable-api", "dev-context-only-utils"] } +solana-quic-datagram = { workspace = true, features = ["dev-context-only-utils"] } solana-runtime = { path = "../runtime", features = ["agave-unstable-api", "dev-context-only-utils"] } solana-sdk-ids = { workspace = true } solana-streamer = { path = "../streamer", features = ["agave-unstable-api", "dev-context-only-utils"] } +solana-tls-utils = { workspace = true } tempfile = { workspace = true } test-case = { workspace = true } tokio-util = { workspace = true } diff --git a/votor/examples/votor_datagram.rs b/votor/examples/votor_datagram.rs 
new file mode 100644 index 00000000000..5cc4f64f898 --- /dev/null +++ b/votor/examples/votor_datagram.rs @@ -0,0 +1,717 @@ +#![allow(clippy::arithmetic_side_effects, clippy::manual_clamp)] +//! Fan-out / fan-in benchmark exercising [`agave_votor::datagram_endpoint`]. +//! +//! Two topologies, selected by `--direction`: +//! - `egress` (default): 1 client → N servers. Per-slot fan-out from +//! one sender to many receivers; matches production +//! `VotingService::broadcast_consensus_message`. Apples-to-apples +//! against bench A (`vote_quic_client`, tpu-client-next) and bench B +//! (`vote_connection_cache`, ConnectionCache) on the +//! `alpenglow-tpu-client-next` branch. +//! - `ingress`: N clients → 1 server. Per-slot fan-in from many senders +//! to one receiver, exercising the server-side accept / drain path. +//! Thread split flipped vs egress: 4-thread server runtime, 32-thread +//! shared client runtime. +//! +//! Common to both: +//! - per-packet timestamp in payload; per-thread local Vec of latency +//! samples merged into shared Vec once on join (matches A/B's +//! amortized-lock pattern); +//! - explicit shutdown: close endpoints → grace sleep → drop endpoints +//! → drop runtimes → join drain threads → compute stats; +//! - send path uses `try_send` (drop-on-full), matching production. +//! +//! Two unavoidable methodology differences vs A/B: +//! - Datagrams arrive one-at-a-time; A/B's per-`PacketBatch` recv +//! timestamp can't apply. This bench (C) captures recv time per packet — more +//! accurate per item, slight bias **lower** vs A/B. +//! - No `SimpleQos` stream-rate throttling (datagrams aren't streams). +//! Equivalent under the default load (4 packets/slot/peer × 5 +//! slots/sec is well under `max_streams_per_second: 50`). +//! +//! Run: +//! cargo run --release -p agave-votor --example votor_datagram -- --peers 2000 +//!
cargo run --release -p agave-votor --example votor_datagram -- --direction ingress --peers 2000 + +use { + bytes::Bytes, + clap::{Parser, ValueEnum}, + crossbeam_channel::{Receiver, bounded}, + solana_keypair::{Keypair, Signer}, + solana_net_utils::sockets::{bind_to, unique_port_range_for_tests}, + solana_pubkey::Pubkey, + solana_quic_datagram::{Banlist, allowlist::AllowAll, endpoint::Datagram}, + std::{ + net::{IpAddr, Ipv4Addr, SocketAddr, UdpSocket}, + sync::{ + Arc, + atomic::{AtomicU64, Ordering}, + }, + thread, + time::{Duration, Instant}, + }, + tokio::runtime::Runtime, +}; + +#[derive(Copy, Clone, Debug, PartialEq, Eq, ValueEnum)] +enum Direction { + /// 1 client → N servers (egress fan-out). + Egress, + /// N clients → 1 server (ingress fan-in). + Ingress, +} + +#[derive(Debug, Parser)] +#[command(about = "quic-datagram fanout benchmark (egress: 1→N, ingress: N→1)")] +struct Cli { + /// Number of peers. Servers in egress mode; clients in ingress mode. + #[arg(long, default_value_t = 20)] + peers: usize, + + /// Slot duration in milliseconds + #[arg(long, default_value_t = 200)] + slot_ms: u64, + + /// Number of slots to run + #[arg(long, default_value_t = 50)] + slots: usize, + + /// Packets sent per slot per source. Egress: per client → each server. + /// Ingress: per client → the server. + #[arg(long, default_value_t = 4)] + packets_per_slot: usize, + + /// Warmup slots (not counted in stats) + #[arg(long, default_value_t = 5)] + warmup_slots: usize, + + /// Accepted for compatibility with the A/B benches' CLI; ignored — + /// quic-datagram enforces `MAX_DATAGRAMS_PER_SECOND_PER_PEER`. + #[arg(long, default_value_t = 50)] + max_streams_per_second: u64, + + /// Datagram payload size in bytes (capped at 1200, the safe max for the default 1280 MTU). + #[arg(long, default_value_t = 500)] + packet_size: usize, + + /// Connections to open per warmup batch. 
+ #[arg(long, default_value_t = 128)] + handshake_batch: usize, + + /// Pause between warmup batches in milliseconds. + #[arg(long, default_value_t = 100)] + handshake_pause_ms: u64, + + /// Test direction (egress = 1→N, ingress = N→1). + #[arg(long, value_enum, default_value_t = Direction::Egress)] + direction: Direction, +} + +/// Payload layout: `[u32 LE slot][u64 LE ns_since_anchor][padding]` — +/// identical to the A/B benches. +fn encode_packet(slot: u32, anchor: Instant, payload_size: usize) -> Bytes { + let mut buf = vec![0u8; payload_size]; + buf[0..4].copy_from_slice(&slot.to_le_bytes()); + let ns = anchor.elapsed().as_nanos() as u64; + buf[4..12].copy_from_slice(&ns.to_le_bytes()); + Bytes::from(buf) +} + +fn decode_packet(data: &[u8]) -> Option<(u32, u64)> { + if data.len() < 12 { + return None; + } + let slot = u32::from_le_bytes(data[0..4].try_into().ok()?); + let ns = u64::from_le_bytes(data[4..12].try_into().ok()?); + Some((slot, ns)) +} + +/// Sentinel slot value used to encode the warmup-batch probe datagrams. +/// Drain threads recognise this and skip the probe entirely (no +/// `received_count` bump, no latency sample) so the post-run delivery +/// rate is computed against real measured traffic only. Picked at the +/// top of the `u32` range so it can't collide with a real slot index. +const PROBE_SLOT: u32 = u32::MAX; + +/// Bind a UDP socket on localhost via the test port allocator with a +/// retry budget — the allocator's bookkeeping does not guarantee the +/// underlying port is actually free at the OS level. 
+fn bind_unique() -> UdpSocket { + const MAX_ATTEMPTS: usize = 16; + let mut attempts: Vec<(u16, std::io::Error)> = Vec::with_capacity(MAX_ATTEMPTS); + for _ in 0..MAX_ATTEMPTS { + let port = unique_port_range_for_tests(1).start; + match bind_to(IpAddr::V4(Ipv4Addr::LOCALHOST), port) { + Ok(s) => return s, + Err(e) => attempts.push((port, e)), + } + } + let detail = attempts + .iter() + .map(|(p, e)| format!(" port {p}: {e}")) + .collect::>() + .join("\n"); + panic!("bind UDP failed after {MAX_ATTEMPTS} attempts:\n{detail}"); +} + +/// Pick a keypair whose pubkey is strictly greater than every entry in +/// `lower_bounds`. The 1→N client→server fan-out relies on this: the +/// client (lower) dials every server (higher) per the lex rule. +fn keypair_above_all(lower_bounds: &[Pubkey]) -> Keypair { + let upper = lower_bounds.iter().max().copied().unwrap_or_default(); + loop { + let k = Keypair::new(); + if k.pubkey() > upper { + return k; + } + } +} + +/// Symmetric to [`keypair_above_all`]: pick a keypair whose pubkey is +/// strictly less than every entry in `upper_bounds`. Used by the ingress +/// topology so every client (lower) dials the single server (higher) per +/// the lex rule, and no client ever dials another client. 
+fn keypair_below_all(upper_bounds: &[Pubkey]) -> Keypair { + let lower = upper_bounds + .iter() + .min() + .copied() + .unwrap_or(Pubkey::new_from_array([u8::MAX; 32])); + loop { + let k = Keypair::new(); + if k.pubkey() < lower { + return k; + } + } +} + +struct ServerHandle { + addr: SocketAddr, + rx: Receiver, + endpoint: solana_quic_datagram::endpoint::QuicDatagramEndpoint, +} + +fn spawn_endpoint( + rt: &Runtime, + keypair: &Keypair, +) -> ( + solana_quic_datagram::endpoint::QuicDatagramEndpoint, + Receiver, + SocketAddr, +) { + let socket = bind_unique(); + let addr = socket.local_addr().expect("local_addr"); + let (tx, rx) = bounded(1 << 15); + let banlist = Arc::new(Banlist::::default()); + let endpoint = agave_votor::datagram_endpoint::spawn( + rt.handle(), + keypair, + socket, + tx, + Arc::new(AllowAll), + banlist, + ) + .expect("endpoint"); + (endpoint, rx, addr) +} + +fn print_stats(slots: usize, num_peers: usize, packets_per_slot: usize, latencies: &[f64]) { + if latencies.is_empty() { + println!("\nWARNING: no latency samples collected from measured slots"); + return; + } + let mut sorted = latencies.to_vec(); + sorted.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let n = sorted.len(); + let sum: f64 = sorted.iter().sum(); + let mean = sum / n as f64; + let var: f64 = sorted.iter().map(|v| (v - mean) * (v - mean)).sum::() / n as f64; + let stddev = var.sqrt(); + let p50 = sorted[n / 2]; + let p99 = sorted[(n as f64 * 0.99) as usize]; + let min = sorted[0]; + let max = sorted[n - 1]; + let measured_expected = (slots * num_peers * packets_per_slot) as u64; + + println!(); + println!("=== Delivery (measured slots only) ==="); + println!(" Packets expected: {measured_expected}"); + println!(" Packets received: {n}"); + println!( + " Delivery rate: {:.2}%", + (n as f64 / measured_expected as f64) * 100.0 + ); + println!(); + println!("=== Latency (one-way, microseconds) ==="); + println!(" Mean : {mean:.1}"); + println!(" Stddev : {stddev:.1}"); + 
println!(" Min : {min:.1}"); + println!(" P50 : {p50:.1}"); + println!(" P99 : {p99:.1}"); + println!(" Max : {max:.1}"); +} + +fn main() { + let cli = Cli::parse(); + match cli.direction { + Direction::Egress => run_egress(cli), + Direction::Ingress => run_ingress(cli), + } +} + +fn run_egress(cli: Cli) { + let num_peers = cli.peers; + let slot_ms = cli.slot_ms; + let slots = cli.slots; + let warmup_slots = cli.warmup_slots; + let packets_per_slot = cli.packets_per_slot; + let packet_size = cli.packet_size.min(1200).max(12); + let handshake_batch = cli.handshake_batch.max(1); + let handshake_pause_ms = cli.handshake_pause_ms; + + assert!(packets_per_slot > 0, "packets_per_slot must be ≥ 1"); + let slot_duration = Duration::from_millis(slot_ms); + let burst_interval = slot_duration / packets_per_slot as u32; + let warmup_bursts = warmup_slots * packets_per_slot; + let total_bursts = (warmup_slots + slots) * packets_per_slot; + + println!( + "votor_datagram EGRESS: peers={num_peers} slot_ms={slot_ms} slots={slots} \ + warmup={warmup_slots} packets_per_slot={packets_per_slot} packet_size={packet_size} \ + (burst_interval={:.1}ms, total_bursts={total_bursts})", + burst_interval.as_secs_f64() * 1000.0, + ); + + // Match bench A's runtime split: a small client pool (4 threads, same + // as `VotorQuicClient::spawn_runtime`) and a larger shared server pool + // (32 threads). CPU is largely idle during this workload — a single + // 48-thread pool just adds park/unpark overhead. 
+ let client_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .thread_name("votor-bench-client") + .build() + .expect("client tokio runtime"); + let server_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(32) + .enable_all() + .thread_name("votor-bench-server") + .build() + .expect("server tokio runtime"); + + // Client first, then N servers whose pubkeys are guaranteed to be + // above the client's so the lex rule routes every dial in one + // direction. + let client_keypair = Keypair::new(); + let client_pubkey = client_keypair.pubkey(); + let server_keypairs: Vec = (0..num_peers) + .map(|_| keypair_above_all(&[client_pubkey])) + .collect(); + + let (client, _client_rx, _client_addr) = spawn_endpoint(&client_rt, &client_keypair); + + println!("spawning {num_peers} servers..."); + let spawn_start = Instant::now(); + let mut servers: Vec = Vec::with_capacity(num_peers); + for (idx, kp) in server_keypairs.iter().enumerate() { + let (endpoint, rx, addr) = spawn_endpoint(&server_rt, kp); + servers.push(ServerHandle { addr, rx, endpoint }); + if (idx + 1) % 100 == 0 { + println!(" spawned {}/{} servers", idx + 1, num_peers); + } + } + println!( + "1 client + {num_peers} servers spawned in {:.2}s", + spawn_start.elapsed().as_secs_f64() + ); + thread::sleep(Duration::from_secs(1)); + + let anchor = Instant::now(); + let received_count = Arc::new(AtomicU64::new(0)); + let latencies: Arc>> = Arc::new(std::sync::Mutex::new(Vec::new())); + let warmup_u32 = warmup_bursts as u32; + + // Drain threads: shard all N server receivers across `DRAIN_THREADS` + // OS threads. Each thread owns a contiguous chunk and uses + // `crossbeam_channel::Select` to block on whichever of its channels + // becomes ready. Sharding keeps the per-call `Select` scan bounded + // (~N/K) while avoiding the 2000-threads-fighting-for-CPU pattern of + // one-thread-per-server. 
Each thread accumulates samples in a local + // Vec and merges once on join (matches A/B's amortized-lock pattern). + // + // Shutdown: when the bench drops the runtimes, every server's + // ingress sender is dropped and `recv()` starts returning Err. A + // drain thread treats the first Err on any of its channels as a + // shutdown signal, drains remaining packets from its shard via + // `try_recv`, and exits. + const DRAIN_THREADS: usize = 4; + let rxs: Vec> = servers.iter().map(|s| s.rx.clone()).collect(); + let shard_size = rxs.len().div_ceil(DRAIN_THREADS).max(1); + let drain_handles: Vec> = rxs + .chunks(shard_size) + .enumerate() + .map(|(shard_idx, chunk)| { + let shard: Vec> = chunk.to_vec(); + let received_count = received_count.clone(); + let latencies = latencies.clone(); + thread::Builder::new() + .name(format!("votor-bench-drain-{shard_idx}")) + .spawn(move || { + let mut sel = crossbeam_channel::Select::new(); + for rx in &shard { + sel.recv(rx); + } + let mut local_latencies: Vec = Vec::new(); + let record = |bytes: &Bytes, local: &mut Vec| { + let recv_ns = anchor.elapsed().as_nanos() as u64; + if let Some((slot, send_ns)) = decode_packet(bytes) { + if slot == PROBE_SLOT { + return; + } + received_count.fetch_add(1, Ordering::Relaxed); + if slot >= warmup_u32 { + let us = recv_ns.saturating_sub(send_ns) as f64 / 1000.0; + local.push(us); + } + } + }; + loop { + let oper = sel.select(); + let idx = oper.index(); + match oper.recv(&shard[idx]) { + Ok(dg) => record(&dg.message, &mut local_latencies), + Err(_) => { + for rx in &shard { + while let Ok(dg) = rx.try_recv() { + record(&dg.message, &mut local_latencies); + } + } + break; + } + } + } + latencies + .lock() + .expect("lock") + .extend_from_slice(&local_latencies); + }) + .expect("spawn drain thread") + }) + .collect(); + + let dropped_count = Arc::new(AtomicU64::new(0)); + + // Warm up connections in batches — same pattern as A/B. 
Probes are + // encoded with `slot = PROBE_SLOT` so the drain threads can filter + // them out of `received_count` (they're not part of the test traffic + // and otherwise inflate the delivery rate above 100%). + println!("establishing connections to {num_peers} servers..."); + { + let num_batches = num_peers.div_ceil(handshake_batch); + let probe = encode_packet(PROBE_SLOT, anchor, packet_size); + for (i, (chunk_kp, chunk_srv)) in server_keypairs + .chunks(handshake_batch) + .zip(servers.chunks(handshake_batch)) + .enumerate() + { + for (kp, srv) in chunk_kp.iter().zip(chunk_srv.iter()) { + if client + .egress + .try_send(Datagram { + peer_pubkey: kp.pubkey(), + peer_address: srv.addr, + message: probe.clone(), + }) + .is_err() + { + dropped_count.fetch_add(1, Ordering::Relaxed); + } + } + println!( + " batch {}/{num_batches}: {} connections", + i + 1, + chunk_kp.len() + ); + if i + 1 < num_batches && handshake_pause_ms > 0 { + thread::sleep(Duration::from_millis(handshake_pause_ms)); + } + } + thread::sleep(Duration::from_secs(2)); + } + println!("connection establishment complete."); + + println!( + "starting {total_bursts} bursts ({warmup_bursts} warmup + {} measured), {} \ + pkt/peer/burst...", + slots * packets_per_slot, + 1, + ); + for burst in 0..total_bursts as u32 { + let burst_start = Instant::now(); + let is_warmup = (burst as usize) < warmup_bursts; + let buf = encode_packet(burst, anchor, packet_size); + for (kp, srv) in server_keypairs.iter().zip(servers.iter()) { + if client + .egress + .try_send(Datagram { + peer_pubkey: kp.pubkey(), + peer_address: srv.addr, + message: buf.clone(), + }) + .is_err() + { + dropped_count.fetch_add(1, Ordering::Relaxed); + } + } + let send_elapsed = burst_start.elapsed(); + if send_elapsed < burst_interval { + thread::sleep(burst_interval - send_elapsed); + } + let expected = num_peers; + let so_far = received_count.load(Ordering::Relaxed); + println!( + "burst {burst}{}: send phase {:.1}ms, expected {expected} 
packets, received so far: \ + {so_far}", + if is_warmup { " (warmup)" } else { "" }, + send_elapsed.as_secs_f64() * 1000.0, + ); + } + + println!("waiting for in-flight packets..."); + thread::sleep(Duration::from_secs(3)); + + let total_received = received_count.load(Ordering::Relaxed); + let total_dropped = dropped_count.load(Ordering::Relaxed); + let total_expected = (total_bursts * num_peers) as u64; + println!( + "total packets: expected={total_expected}, sent={}, dropped_at_egress={total_dropped}, \ + received={total_received}, delivery_rate={:.2}%", + total_expected.saturating_sub(total_dropped), + (total_received as f64 / total_expected as f64) * 100.0, + ); + + // Shutdown: close endpoints to break QUIC connections (drops the + // sender clones held by per-connection read tasks), then drop the + // runtime to abort any lingering tasks and release whatever senders + // remain. After that the receiver threads' `rx.recv()` returns Err + // and they exit, merging their local Vec into `latencies`. 
+ println!("shutting down client + servers..."); + client.close(); + for srv in &servers { + srv.endpoint.close(); + } + drop(servers); + drop(client); + drop(client_rt); + drop(server_rt); + + for h in drain_handles { + let _ = h.join(); + } + + let latencies = latencies.lock().expect("lock"); + print_stats(slots, num_peers, packets_per_slot, &latencies); +} + +fn run_ingress(cli: Cli) { + let num_peers = cli.peers; // here = number of clients + let slot_ms = cli.slot_ms; + let slots = cli.slots; + let warmup_slots = cli.warmup_slots; + let packets_per_slot = cli.packets_per_slot; + let packet_size = cli.packet_size.min(1200).max(12); + let handshake_batch = cli.handshake_batch.max(1); + let handshake_pause_ms = cli.handshake_pause_ms; + + assert!(packets_per_slot > 0, "packets_per_slot must be ≥ 1"); + let slot_duration = Duration::from_millis(slot_ms); + let burst_interval = slot_duration / packets_per_slot as u32; + let warmup_bursts = warmup_slots * packets_per_slot; + let total_bursts = (warmup_slots + slots) * packets_per_slot; + + println!( + "votor_datagram INGRESS: clients={num_peers} slot_ms={slot_ms} slots={slots} \ + warmup={warmup_slots} packets_per_slot={packets_per_slot} packet_size={packet_size} \ + (burst_interval={:.1}ms, total_bursts={total_bursts})", + burst_interval.as_secs_f64() * 1000.0, + ); + + // Flip of egress: 4-thread server, 32-thread shared client pool. + let server_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .thread_name("votor-bench-server") + .build() + .expect("server tokio runtime"); + let client_rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(32) + .enable_all() + .thread_name("votor-bench-client") + .build() + .expect("client tokio runtime"); + + // Server first, then N clients whose pubkeys are strictly below the + // server's so the lex rule routes every dial in the client→server + // direction. 
No client ever dials another client because no code path + // ever feeds another client's pubkey to a client's `egress.try_send`. + let server_keypair = Keypair::new(); + let server_pubkey = server_keypair.pubkey(); + let client_keypairs: Vec = (0..num_peers) + .map(|_| keypair_below_all(&[server_pubkey])) + .collect(); + + println!("spawning 1 server + {num_peers} clients..."); + let spawn_start = Instant::now(); + let (server, server_rx, server_addr) = spawn_endpoint(&server_rt, &server_keypair); + + let mut clients: Vec = + Vec::with_capacity(num_peers); + for (idx, kp) in client_keypairs.iter().enumerate() { + let (endpoint, _rx, _addr) = spawn_endpoint(&client_rt, kp); + clients.push(endpoint); + if (idx + 1) % 100 == 0 { + println!(" spawned {}/{} clients", idx + 1, num_peers); + } + } + println!( + "1 server + {num_peers} clients spawned in {:.2}s", + spawn_start.elapsed().as_secs_f64() + ); + thread::sleep(Duration::from_secs(1)); + + let anchor = Instant::now(); + let received_count = Arc::new(AtomicU64::new(0)); + let latencies: Arc>> = Arc::new(std::sync::Mutex::new(Vec::new())); + let warmup_u32 = warmup_bursts as u32; + + // Single drain thread on the one server rx — no Select needed. 
+ let drain_rx = server_rx.clone(); + let drain_received = received_count.clone(); + let drain_latencies = latencies.clone(); + let drain_handle = thread::Builder::new() + .name("votor-bench-drain".to_string()) + .spawn(move || { + let mut local_latencies: Vec = Vec::new(); + let mut record = |bytes: &Bytes| { + let recv_ns = anchor.elapsed().as_nanos() as u64; + if let Some((slot, send_ns)) = decode_packet(bytes) { + if slot == PROBE_SLOT { + return; + } + drain_received.fetch_add(1, Ordering::Relaxed); + if slot >= warmup_u32 { + let us = recv_ns.saturating_sub(send_ns) as f64 / 1000.0; + local_latencies.push(us); + } + } + }; + while let Ok(dg) = drain_rx.recv() { + record(&dg.message); + } + while let Ok(dg) = drain_rx.try_recv() { + record(&dg.message); + } + drain_latencies + .lock() + .expect("lock") + .extend_from_slice(&local_latencies); + }) + .expect("spawn drain thread"); + + let dropped_count = Arc::new(AtomicU64::new(0)); + + // Warmup probes: every client sends one PROBE_SLOT to server. Same + // batching pattern as egress, iterating clients (not servers). 
+ println!("establishing connections from {num_peers} clients to server..."); + { + let num_batches = num_peers.div_ceil(handshake_batch); + let probe = encode_packet(PROBE_SLOT, anchor, packet_size); + for (i, chunk) in clients.chunks(handshake_batch).enumerate() { + for c in chunk.iter() { + if c.egress + .try_send(Datagram { + peer_pubkey: server_pubkey, + peer_address: server_addr, + message: probe.clone(), + }) + .is_err() + { + dropped_count.fetch_add(1, Ordering::Relaxed); + } + } + println!( + " batch {}/{num_batches}: {} connections", + i + 1, + chunk.len() + ); + if i + 1 < num_batches && handshake_pause_ms > 0 { + thread::sleep(Duration::from_millis(handshake_pause_ms)); + } + } + thread::sleep(Duration::from_secs(2)); + } + println!("connection establishment complete."); + + println!( + "starting {total_bursts} bursts ({warmup_bursts} warmup + {} measured)...", + slots * packets_per_slot, + ); + for burst in 0..total_bursts as u32 { + let burst_start = Instant::now(); + let is_warmup = (burst as usize) < warmup_bursts; + let buf = encode_packet(burst, anchor, packet_size); + for c in clients.iter() { + if c.egress + .try_send(Datagram { + peer_pubkey: server_pubkey, + peer_address: server_addr, + message: buf.clone(), + }) + .is_err() + { + dropped_count.fetch_add(1, Ordering::Relaxed); + } + } + let send_elapsed = burst_start.elapsed(); + if send_elapsed < burst_interval { + thread::sleep(burst_interval - send_elapsed); + } + let expected = num_peers; + let so_far = received_count.load(Ordering::Relaxed); + println!( + "burst {burst}{}: send phase {:.1}ms, expected {expected} packets, received so far: \ + {so_far}", + if is_warmup { " (warmup)" } else { "" }, + send_elapsed.as_secs_f64() * 1000.0, + ); + } + + println!("waiting for in-flight packets..."); + thread::sleep(Duration::from_secs(3)); + + let total_received = received_count.load(Ordering::Relaxed); + let total_dropped = dropped_count.load(Ordering::Relaxed); + let total_expected = 
(total_bursts * num_peers) as u64; + println!( + "total packets: expected={total_expected}, sent={}, dropped_at_egress={total_dropped}, \ + received={total_received}, delivery_rate={:.2}%", + total_expected.saturating_sub(total_dropped), + (total_received as f64 / total_expected as f64) * 100.0, + ); + + println!("shutting down clients + server..."); + for c in &clients { + c.close(); + } + server.close(); + drop(clients); + drop(server); + drop(server_rx); + drop(client_rt); + drop(server_rt); + + let _ = drain_handle.join(); + + let latencies = latencies.lock().expect("lock"); + print_stats(slots, num_peers, packets_per_slot, &latencies); +} diff --git a/votor/src/datagram_endpoint.rs b/votor/src/datagram_endpoint.rs new file mode 100644 index 00000000000..13b55e3c3b4 --- /dev/null +++ b/votor/src/datagram_endpoint.rs @@ -0,0 +1,87 @@ +//! Votor-flavored constructor for [`solana_quic_datagram::QuicDatagramEndpoint`]. +//! +//! Centralizes the wire-format identifier (`ALPENGLOW_ALPN`) and the metrics +//! namespace used by both directions of alpenglow consensus traffic, so the +//! send side (`voting_service`) and the receive side (the BLS sigverifier +//! consuming `ingress`) cannot drift apart. +//! +//! One UDP socket — one endpoint — multiplexes votes (egress) and inbound +//! consensus messages (ingress) per the lex-pubkey direction rule. See the +//! `solana-quic-datagram` crate docs for the underlying semantics. + +use { + crossbeam_channel::Sender, + solana_keypair::Keypair, + solana_pubkey::Pubkey, + solana_quic_datagram::{ + Allowlist, Banlist, Error, StakedNodesAllowlist, + endpoint::{Datagram, QuicDatagramEndpoint}, + }, + solana_runtime::bank_forks::BankForks, + std::{ + collections::HashMap, + net::UdpSocket, + sync::{Arc, RwLock}, + }, +}; + +/// ALPN identifier negotiated on every alpenglow QUIC handshake. Changing +/// this value is a wire-breaking protocol change — peers with mismatched +/// ALPN fail the TLS handshake. 
+pub const ALPENGLOW_ALPN: &[u8] = b"alpenglow-v1"; + +/// Construct a [`QuicDatagramEndpoint`] tuned for alpenglow consensus +/// traffic. Caller owns allowlist, banlist, ingress and the banlist +/// eviction receiver; the returned endpoint owns its control loop task. +/// +/// **Identity rotation** — register `endpoint.key_updater.clone()` with +/// the validator's `KeyUpdaters` registry (implements +/// `solana_tls_utils::NotifyKeyUpdate`). Hot-spare failover: a backup +/// node is handed the primary's staked keypair; peers re-handshake under +/// the new identity. The primary observes its connections replaced with +/// `HANDOVER` closes and soft-bans those peers to avoid double-voting. +#[allow(clippy::too_many_arguments)] +pub fn spawn( + runtime: &tokio::runtime::Handle, + keypair: &Keypair, + socket: UdpSocket, + ingress: Sender, + allowlist: Arc, + banlist: Arc>, +) -> Result { + QuicDatagramEndpoint::new( + runtime, + keypair, + socket, + ALPENGLOW_ALPN, + ingress, + allowlist, + banlist, + ) +} + +/// Build the allowlist map (pubkey → epoch stake) for validators with +/// positive stake in the working bank's current epoch. This is the +/// canonical allowlist; callers seed `StakedNodesAllowlist` from +/// this at construction and then drive subsequent epoch-boundary +/// refreshes from [`crate::staked_validators_cache::StakedValidatorsCache`]. +pub fn current_admit_set(bank_forks: &Arc>) -> HashMap { + let bank = bank_forks.read().unwrap().working_bank(); + let epoch = bank.epoch(); + bank.epoch_staked_nodes(epoch) + .map(|m| { + m.iter() + .filter(|(_, stake)| **stake > 0) + .map(|(pk, stake)| (*pk, *stake)) + .collect() + }) + .unwrap_or_default() +} + +/// Build a fresh [`StakedNodesAllowlist`] seeded with the current +/// epoch's staked-set. Hand the returned Arc both to +/// [`spawn`] and to the [`StakedValidatorsCache`] that voting_service +/// owns — the cache will call `.swap()` on epoch transitions. 
+pub fn build_allowlist(bank_forks: &Arc>) -> Arc { + Arc::new(StakedNodesAllowlist::new(current_admit_set(bank_forks))) +} diff --git a/votor/src/lib.rs b/votor/src/lib.rs index e3cf0a52fa9..d5ed9188cb8 100644 --- a/votor/src/lib.rs +++ b/votor/src/lib.rs @@ -9,6 +9,7 @@ pub mod common; pub mod consensus_metrics; pub mod consensus_pool; mod consensus_pool_service; +pub mod datagram_endpoint; pub mod event; mod event_handler; pub mod generated_cert_types; diff --git a/votor/src/staked_validators_cache.rs b/votor/src/staked_validators_cache.rs index 0912b78690c..68edce5b767 100644 --- a/votor/src/staked_validators_cache.rs +++ b/votor/src/staked_validators_cache.rs @@ -1,12 +1,13 @@ use { - crate::voting_service::AlpenglowPortOverride, + crate::{datagram_endpoint, voting_service::AlpenglowPortOverride}, lazy_lru::LruCache, solana_clock::{Epoch, Slot}, solana_gossip::cluster_info::ClusterInfo, solana_pubkey::Pubkey, + solana_quic_datagram::StakedNodesAllowlist, solana_runtime::bank_forks::BankForks, std::{ - collections::HashMap, + collections::{HashMap, HashSet}, net::SocketAddr, sync::{Arc, RwLock}, time::{Duration, Instant}, @@ -14,8 +15,8 @@ use { }; struct StakedValidatorsCacheEntry { - /// Alpenglow Sockets associated with the staked validators - alpenglow_sockets: Vec, + /// (Pubkey, Alpenglow socket) pairs for the staked validators. + peers: Vec<(Pubkey, SocketAddr)>, /// The time at which this entry was created creation_time: Instant, @@ -46,15 +47,44 @@ pub struct StakedValidatorsCache { /// timestamp of the last alpenglow port override we read alpenglow_port_override_last_modified: Instant, + + /// Optional datagram endpoint allowlist kept current on epoch refresh. + allowlist: Option>, + last_allowlist_epoch: Option, + + /// Extra non-staked peers to admit, used by test-only additional listeners. 
+ extra_admit: HashSet, } impl StakedValidatorsCache { + #[cfg(test)] pub fn new( bank_forks: Arc>, ttl: Duration, target_cache_size: usize, include_self: bool, alpenglow_port_override: Option, + ) -> Self { + Self::new_with_allowlist( + bank_forks, + ttl, + target_cache_size, + include_self, + alpenglow_port_override, + None, + HashSet::new(), + ) + } + + #[allow(clippy::too_many_arguments)] + pub fn new_with_allowlist( + bank_forks: Arc>, + ttl: Duration, + target_cache_size: usize, + include_self: bool, + alpenglow_port_override: Option, + allowlist: Option>, + extra_admit: HashSet, ) -> Self { Self { cache: LruCache::new(target_cache_size), @@ -63,9 +93,25 @@ impl StakedValidatorsCache { include_self, alpenglow_port_override, alpenglow_port_override_last_modified: Instant::now(), + allowlist, + last_allowlist_epoch: None, + extra_admit, } } + fn maybe_refresh_allowlist(&mut self, epoch: Epoch) { + let Some(allowlist) = self.allowlist.as_ref() else { + return; + }; + if self.last_allowlist_epoch == Some(epoch) { + return; + } + let mut map = datagram_endpoint::current_admit_set(&self.bank_forks); + map.extend(self.extra_admit.iter().map(|pk| (*pk, 0u64))); + allowlist.swap(map); + self.last_allowlist_epoch = Some(epoch); + } + #[inline] fn cur_epoch(&self, slot: Slot) -> Epoch { self.bank_forks @@ -82,6 +128,8 @@ impl StakedValidatorsCache { cluster_info: &ClusterInfo, update_time: Instant, ) { + self.maybe_refresh_allowlist(epoch); + let banks = { let bank_forks = self.bank_forks.read().unwrap(); [bank_forks.root_bank(), bank_forks.working_bank()] @@ -126,7 +174,7 @@ impl StakedValidatorsCache { nodes.dedup_by_key(|node| node.alpenglow_socket); nodes.sort_unstable_by_key(|a| a.stake); - let mut alpenglow_sockets = Vec::with_capacity(nodes.len()); + let mut peers = Vec::with_capacity(nodes.len()); let override_map = self .alpenglow_port_override .as_ref() @@ -142,12 +190,12 @@ impl StakedValidatorsCache { } else { alpenglow_socket }; - 
alpenglow_sockets.push(socket); + peers.push((node.pubkey, socket)); } self.cache.put( epoch, StakedValidatorsCacheEntry { - alpenglow_sockets, + peers, creation_time: update_time, }, ); @@ -158,7 +206,7 @@ impl StakedValidatorsCache { slot: Slot, cluster_info: &ClusterInfo, access_time: Instant, - ) -> (&[SocketAddr], bool) { + ) -> (&[(Pubkey, SocketAddr)], bool) { // Check if self.alpenglow_port_override has a different last_modified. // Immediately refresh the cache if it does. if let Some(alpenglow_port_override) = &self.alpenglow_port_override { @@ -183,7 +231,7 @@ impl StakedValidatorsCache { epoch: Epoch, cluster_info: &ClusterInfo, access_time: Instant, - ) -> (&[SocketAddr], bool) { + ) -> (&[(Pubkey, SocketAddr)], bool) { // For a given epoch, if we either: // // (1) have a cache entry that has expired @@ -203,10 +251,7 @@ impl StakedValidatorsCache { ( // Unwrapping is fine here, since update_cache guarantees that we push a cache entry to // self.cache[epoch]. - self.cache - .get(&epoch) - .map(|v| &*v.alpenglow_sockets) - .unwrap(), + self.cache.get(&epoch).map(|v| &*v.peers).unwrap(), refresh_cache, ) } @@ -239,7 +284,7 @@ mod tests { bank::Bank, bank_forks::BankForks, genesis_utils::{ - ValidatorVoteKeypairs, create_genesis_config_with_alpenglow_vote_accounts, + create_genesis_config_with_alpenglow_vote_accounts, ValidatorVoteKeypairs, }, }, solana_signer::Signer, @@ -265,14 +310,12 @@ mod tests { .map(|(node_ix, pubkey)| { let mut contact_info = ContactInfo::new(*pubkey, 0_u64, 0_u16); - assert!( - contact_info - .set_alpenglow(( - Ipv4Addr::LOCALHOST, - 8080_u16.saturating_add(node_ix as u16) - )) - .is_ok() - ); + assert!(contact_info + .set_alpenglow(( + Ipv4Addr::LOCALHOST, + 8080_u16.saturating_add(node_ix as u16) + )) + .is_ok()); contact_info }); @@ -479,18 +522,16 @@ mod tests { // Epochs 1-5 should have been evicted (LRU). 
for entry_ix in 1_u64..=5_u64 { - assert!( - !svc.cache - .contains_key(&svc.cur_epoch(entry_ix.saturating_mul(base_slot))) - ); + assert!(!svc + .cache + .contains_key(&svc.cur_epoch(entry_ix.saturating_mul(base_slot)))); } // Epochs 6-10 should have entries. for entry_ix in 6_u64..=10_u64 { - assert!( - svc.cache - .contains_key(&svc.cur_epoch(entry_ix.saturating_mul(base_slot))) - ); + assert!(svc + .cache + .contains_key(&svc.cur_epoch(entry_ix.saturating_mul(base_slot)))); } // Re-accessing entries 1-5 after TTL re-inserts them. With 5 already in cache (entries @@ -552,7 +593,7 @@ mod tests { let (sockets, _) = svc.get_staked_validators_by_slot(slot_num, &cluster_info, Instant::now()); assert_eq!(sockets.len(), num_nodes); - assert!(sockets.contains(&my_socket_addr)); + assert!(sockets.iter().any(|(_, socket)| socket == &my_socket_addr)); // Create our staked validators cache - set include_self to false let mut svc = @@ -562,7 +603,7 @@ mod tests { svc.get_staked_validators_by_slot(slot_num, &cluster_info, Instant::now()); // We should have num_nodes - 1 sockets, since we exclude our own socket address. assert_eq!(sockets.len(), num_nodes.checked_sub(1).unwrap()); - assert!(!sockets.contains(&my_socket_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &my_socket_addr)); } #[test] @@ -585,14 +626,14 @@ mod tests { // Nothing in the override, so we should get the original socket addresses. let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); - assert!(!sockets.contains(&blackhole_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &blackhole_addr)); // Add an override for pubkey_B, and check that we get the overridden socket address. 
alpenglow_port_override.update_override(HashMap::from([(pubkey_b, blackhole_addr)])); let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); // Sort sockets to ensure the blackhole address is at index 0. - let mut sockets: Vec<_> = sockets.to_vec(); + let mut sockets: Vec<_> = sockets.iter().map(|(_, socket)| *socket).collect(); sockets.sort(); assert_eq!(sockets[0], blackhole_addr); assert_ne!(sockets[1], blackhole_addr); @@ -601,6 +642,6 @@ mod tests { alpenglow_port_override.clear(); let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); - assert!(!sockets.contains(&blackhole_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &blackhole_addr)); } } diff --git a/votor/src/voting_service.rs b/votor/src/voting_service.rs index 4dbb3278814..9c74e711459 100644 --- a/votor/src/voting_service.rs +++ b/votor/src/voting_service.rs @@ -4,6 +4,7 @@ use { vote_history_storage::{SavedVoteHistoryVersions, VoteHistoryStorage}, }, agave_votor_messages::{certificate::Certificate, consensus_message::ConsensusMessage}, + bytes::Bytes, crossbeam_channel::Receiver, solana_client::connection_cache::ConnectionCache, solana_clock::Slot, @@ -11,15 +12,18 @@ use { solana_gossip::cluster_info::ClusterInfo, solana_measure::measure::Measure, solana_pubkey::Pubkey, + solana_quic_datagram::{endpoint::Datagram, StakedNodesAllowlist}, solana_runtime::bank_forks::BankForks, solana_transaction_error::TransportError, std::{ - collections::HashMap, + collections::{HashMap, HashSet}, + fmt, net::SocketAddr, sync::{Arc, RwLock}, thread::{self, Builder, JoinHandle}, time::{Duration, Instant}, }, + tokio::sync::mpsc, }; const STAKED_VALIDATORS_CACHE_TTL_S: u64 = 5; @@ -40,14 +44,73 @@ pub enum BLSOp { }, } -fn send_message( - buf: Vec, - socket: &SocketAddr, - connection_cache: &Arc, -) -> Result<(), TransportError> { - let client = 
connection_cache.get_connection(socket); +#[derive(Debug)] +enum ConsensusSendError { + Stream(TransportError), + DatagramFull, + DatagramClosed, +} + +impl fmt::Display for ConsensusSendError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Stream(err) => write!(f, "{err:?}"), + Self::DatagramFull => f.write_str("datagram egress channel is full"), + Self::DatagramClosed => f.write_str("datagram egress channel is closed"), + } + } +} + +trait ConsensusMessageSender: Send + Sync + 'static { + fn send_message( + &self, + peer: Pubkey, + addr: SocketAddr, + buf: Vec, + ) -> Result<(), ConsensusSendError>; +} + +struct ConnectionCacheMessageSender { + connection_cache: Arc, +} + +impl ConsensusMessageSender for ConnectionCacheMessageSender { + fn send_message( + &self, + _peer: Pubkey, + addr: SocketAddr, + buf: Vec, + ) -> Result<(), ConsensusSendError> { + let client = self.connection_cache.get_connection(&addr); + + client + .send_data_async(Arc::new(buf)) + .map_err(ConsensusSendError::Stream) + } +} - client.send_data_async(Arc::new(buf)) +struct DatagramMessageSender { + egress: mpsc::Sender, +} + +impl ConsensusMessageSender for DatagramMessageSender { + fn send_message( + &self, + peer: Pubkey, + addr: SocketAddr, + buf: Vec, + ) -> Result<(), ConsensusSendError> { + self.egress + .try_send(Datagram { + peer_pubkey: peer, + peer_address: addr, + message: Bytes::from(buf), + }) + .map_err(|err| match err { + mpsc::error::TrySendError::Full(_) => ConsensusSendError::DatagramFull, + mpsc::error::TrySendError::Closed(_) => ConsensusSendError::DatagramClosed, + }) + } } pub struct VotingService { @@ -109,9 +172,20 @@ impl AlpenglowPortOverride { } } +/// Test-only additional listener: the voting service will fan out every +/// consensus message to this `(pubkey, addr)` peer alongside the live +/// staked-validators list. 
The stream-mode sender only uses `addr`; the +/// pubkey is carried here so the later datagram path can target authenticated +/// peers without changing the fanout API again. +#[derive(Clone, Debug)] +pub struct AdditionalListener { + pub pubkey: Pubkey, + pub addr: SocketAddr, +} + #[derive(Clone)] pub struct VotingServiceOverride { - pub additional_listeners: Vec, + pub additional_listeners: Vec, pub alpenglow_port_override: AlpenglowPortOverride, } @@ -123,6 +197,61 @@ impl VotingService { connection_cache: Arc, bank_forks: Arc>, test_override: Option, + ) -> Self { + Self::new_with_sender( + bls_receiver, + cluster_info, + vote_history_storage, + Arc::new(ConnectionCacheMessageSender { connection_cache }), + None, + HashSet::new(), + bank_forks, + test_override, + ) + } + + #[allow(clippy::too_many_arguments)] + pub fn new_datagram( + bls_receiver: Receiver, + cluster_info: Arc, + vote_history_storage: Arc, + egress: mpsc::Sender, + allowlist: Option>, + bank_forks: Arc>, + test_override: Option, + ) -> Self { + let extra_admit = test_override + .as_ref() + .map(|override_| { + override_ + .additional_listeners + .iter() + .map(|listener| listener.pubkey) + .collect() + }) + .unwrap_or_default(); + Self::new_with_sender( + bls_receiver, + cluster_info, + vote_history_storage, + Arc::new(DatagramMessageSender { egress }), + allowlist, + extra_admit, + bank_forks, + test_override, + ) + } + + #[allow(clippy::too_many_arguments)] + fn new_with_sender( + bls_receiver: Receiver, + cluster_info: Arc, + vote_history_storage: Arc, + message_sender: Arc, + allowlist: Option>, + extra_admit: HashSet, + bank_forks: Arc>, + test_override: Option, ) -> Self { let (additional_listeners, alpenglow_port_override) = match test_override { None => (Vec::new(), None), @@ -135,12 +264,14 @@ impl VotingService { let thread_hdl = Builder::new() .name("solVotorVoteSvc".to_string()) .spawn(move || { - let mut staked_validators_cache = StakedValidatorsCache::new( + let mut 
staked_validators_cache = StakedValidatorsCache::new_with_allowlist( bank_forks.clone(), Duration::from_secs(STAKED_VALIDATORS_CACHE_TTL_S), STAKED_VALIDATORS_CACHE_NUM_EPOCH_TARGET, false, alpenglow_port_override, + allowlist, + extra_admit, ); info!("AlpenglowVotingService has started"); @@ -149,7 +280,7 @@ impl VotingService { &cluster_info, vote_history_storage.as_ref(), bls_op, - connection_cache.clone(), + message_sender.as_ref(), &additional_listeners, &mut staked_validators_cache, ); @@ -164,8 +295,8 @@ impl VotingService { slot: Slot, cluster_info: &ClusterInfo, message: &ConsensusMessage, - connection_cache: Arc, - additional_listeners: &[SocketAddr], + message_sender: &dyn ConsensusMessageSender, + additional_listeners: &[AdditionalListener], staked_validators_cache: &mut StakedValidatorsCache, ) { let buf = match wincode::serialize(message) { @@ -178,16 +309,17 @@ impl VotingService { let (staked_validator_alpenglow_sockets, _) = staked_validators_cache .get_staked_validators_by_slot(slot, cluster_info, Instant::now()); - let sockets = additional_listeners + let peers = additional_listeners .iter() - .chain(staked_validator_alpenglow_sockets.iter()); + .map(|listener| (listener.pubkey, listener.addr)) + .chain(staked_validator_alpenglow_sockets.iter().copied()); // We use send_message in a loop right now because we worry that sending packets too fast // will cause a packet spike and overwhelm the network. If we later find out that this is // not an issue, we can optimize this by using multi_targret_send or similar methods. 
- for socket in sockets { - if let Err(e) = send_message(buf.clone(), socket, &connection_cache) { - warn!("Failed to send alpenglow message to {socket}: {e:?}"); + for (pubkey, socket) in peers { + if let Err(e) = message_sender.send_message(pubkey, socket, buf.clone()) { + warn!("Failed to send alpenglow message to {socket}: {e}"); } } } @@ -196,8 +328,8 @@ impl VotingService { cluster_info: &ClusterInfo, vote_history_storage: &dyn VoteHistoryStorage, bls_op: BLSOp, - connection_cache: Arc, - additional_listeners: &[SocketAddr], + message_sender: &dyn ConsensusMessageSender, + additional_listeners: &[AdditionalListener], staked_validators_cache: &mut StakedValidatorsCache, ) { match bls_op { @@ -218,7 +350,7 @@ impl VotingService { slot, cluster_info, &message, - connection_cache, + message_sender, additional_listeners, staked_validators_cache, ); @@ -230,7 +362,7 @@ impl VotingService { vote_slot, cluster_info, &message, - connection_cache, + message_sender, additional_listeners, staked_validators_cache, ); @@ -255,34 +387,31 @@ mod tests { consensus_message::{ConsensusMessage, VoteMessage}, vote::Vote, }, - solana_bls_signatures::{BLS_SIGNATURE_AFFINE_SIZE, Signature as BLSSignature}, + solana_bls_signatures::{Signature as BLSSignature, BLS_SIGNATURE_AFFINE_SIZE}, solana_gossip::{cluster_info::ClusterInfo, contact_info::ContactInfo}, solana_keypair::Keypair, - solana_net_utils::{SocketAddrSpace, sockets::bind_to_localhost_unique}, + solana_net_utils::{sockets::bind_to_localhost_unique, SocketAddrSpace}, solana_runtime::{ bank::Bank, bank_forks::BankForks, genesis_utils::{ - ValidatorVoteKeypairs, create_genesis_config_with_alpenglow_vote_accounts, + create_genesis_config_with_alpenglow_vote_accounts, ValidatorVoteKeypairs, }, }, solana_signer::Signer, solana_streamer::{ nonblocking::swqos::SwQosConfig, - quic::{QuicStreamerConfig, SpawnServerResult, spawn_stake_weighted_qos_server}, + quic::{spawn_stake_weighted_qos_server, QuicStreamerConfig, 
SpawnServerResult}, streamer::StakedNodes, }, - std::{ - net::SocketAddr, - sync::{Arc, RwLock}, - }, + std::sync::{Arc, RwLock}, test_case::test_case, tokio_util::sync::CancellationToken, }; fn create_voting_service( bls_receiver: Receiver, - listener: SocketAddr, + listener: AdditionalListener, ) -> (VotingService, Vec) { // Create 10 node validatorvotekeypairs vec let validator_keypairs = (0..10) @@ -322,6 +451,95 @@ mod tests { ) } + fn create_datagram_voting_service( + bls_receiver: Receiver, + listener: AdditionalListener, + egress: mpsc::Sender, + ) -> VotingService { + let validator_keypairs = (0..10) + .map(|_| ValidatorVoteKeypairs::new_rand()) + .collect::>(); + let genesis = create_genesis_config_with_alpenglow_vote_accounts( + 1_000_000_000, + &validator_keypairs, + vec![100; validator_keypairs.len()], + ); + let bank0 = Bank::new_for_tests(&genesis.genesis_config); + let bank_forks = BankForks::new_rw_arc(bank0); + let keypair = Keypair::new(); + let contact_info = ContactInfo::new_localhost(&keypair.pubkey(), 0); + let cluster_info = ClusterInfo::new( + contact_info, + Arc::new(keypair), + SocketAddrSpace::Unspecified, + ); + + VotingService::new_datagram( + bls_receiver, + Arc::new(cluster_info), + Arc::new(NullVoteHistoryStorage::default()), + egress, + None, + bank_forks, + Some(VotingServiceOverride { + additional_listeners: vec![listener], + alpenglow_port_override: AlpenglowPortOverride::default(), + }), + ) + } + + #[test_case(BLSOp::PushVote { + message: Arc::new(ConsensusMessage::Vote(VoteMessage { + vote: Vote::new_skip_vote(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + rank: 1, + })), + slot: 5, + saved_vote_history: SavedVoteHistoryVersions::Current(SavedVoteHistory::default()), + }, ConsensusMessage::Vote(VoteMessage { + vote: Vote::new_skip_vote(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + rank: 1, + }))] + #[test_case(BLSOp::PushCertificate { + certificate: Arc::new(Certificate { + cert_type: 
CertificateType::Skip(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + bitmap: Vec::new(), + }), + }, ConsensusMessage::Certificate(Certificate { + cert_type: CertificateType::Skip(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + bitmap: Vec::new(), + }))] + fn test_send_message_datagram(bls_op: BLSOp, expected_message: ConsensusMessage) { + let (bls_sender, bls_receiver) = crossbeam_channel::unbounded(); + let (egress, mut egress_rx) = mpsc::channel(8); + let listener = AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: bind_to_localhost_unique().unwrap().local_addr().unwrap(), + }; + + let _service = create_datagram_voting_service(bls_receiver, listener.clone(), egress); + bls_sender.send(bls_op).unwrap(); + + let deadline = Instant::now() + Duration::from_secs(2); + let datagram = loop { + match egress_rx.try_recv() { + Ok(datagram) => break datagram, + Err(mpsc::error::TryRecvError::Empty) => { + assert!(Instant::now() < deadline, "timed out waiting for datagram"); + std::thread::sleep(Duration::from_millis(10)); + } + Err(mpsc::error::TryRecvError::Disconnected) => panic!("egress disconnected"), + } + }; + assert_eq!(datagram.peer_pubkey, listener.pubkey); + assert_eq!(datagram.peer_address, listener.addr); + let received = wincode::deserialize::(&datagram.message).unwrap(); + assert_eq!(received, expected_message); + } + #[test_case(BLSOp::PushVote { message: Arc::new(ConsensusMessage::Vote(VoteMessage { vote: Vote::new_skip_vote(5), @@ -354,9 +572,13 @@ mod tests { // Bind to a random UDP port let socket = bind_to_localhost_unique().unwrap(); let listener_addr = socket.local_addr().unwrap(); + let listener = AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: listener_addr, + }; // Create VotingService with the listener address - let (_, validator_keypairs) = create_voting_service(bls_receiver, listener_addr); + let (_, validator_keypairs) = create_voting_service(bls_receiver, listener); // Send a BLS 
message via the VotingService assert!(bls_sender.send(bls_op).is_ok()); diff --git a/votor/tests/identity_rotation.rs b/votor/tests/identity_rotation.rs new file mode 100644 index 00000000000..4f71a1c9c25 --- /dev/null +++ b/votor/tests/identity_rotation.rs @@ -0,0 +1,190 @@ +#![allow(clippy::arithmetic_side_effects)] +//! Identity rotation exercised through the votor wrapper. +//! +//! The underlying `KeyUpdater` is tested at the quic-datagram crate +//! level (`solana_quic_datagram::tests::identity_rotation`). This test +//! confirms the wrapper at [`agave_votor::datagram_endpoint::spawn`] +//! surfaces the same handle correctly: rotating identity through +//! `key_updater.update_key(&new_kp)` should evict cached connections +//! with `IDENTITY_ROTATED`, swap the local pubkey for the lex rule, +//! and accept subsequent traffic attributed to the new identity. +//! +//! **Production use case context** - this isn't a "rotate to a freshly +//! generated pubkey" scenario. It's a hot-spare failover: a non-voting +//! backup running with a throwaway keypair gets promoted by being +//! handed the staked keypair already in use by the primary. After +//! rotation: +//! - The new pubkey is already in every peer's allowlist (it +//! was staked the whole time - it's the primary's identity). +//! - Peers observe a second connection from the same staked pubkey +//! and replace the primary's connection via HANDOVER. +//! - The primary receives HANDOVER closes and soft-bans the evicting +//! peers so it doesn't redial and disrupt consensus. +//! +//! This test uses two arbitrary keypairs (K1 -> K2) with an +//! `AllowAll` allowlist on the server side; that exercises the rotation +//! mechanics in isolation without modeling the staked-set realities +//! of a real cluster.
+ +use { + agave_votor::datagram_endpoint, + bytes::Bytes, + crossbeam_channel::Receiver, + solana_keypair::{Keypair, Signer}, + solana_net_utils::sockets::bind_to_localhost_unique, + solana_pubkey::Pubkey, + solana_quic_datagram::{ + Banlist, StakedNodesAllowlist, allowlist::AllowAll, endpoint::Datagram, + }, + solana_tls_utils::NotifyKeyUpdate, + std::{ + collections::HashMap, + sync::Arc, + time::{Duration, Instant}, + }, + tokio::runtime::Runtime, +}; + +/// Pick a keypair whose pubkey is strictly less than `upper`. Used so +/// the client always plays the lex-correct dialer role. +fn keypair_below(upper: &Pubkey) -> Keypair { + loop { + let k = Keypair::new(); + if &k.pubkey() < upper { + return k; + } + } +} + +/// Re-send `payload` on `egress` every 50ms until `rx` observes an +/// item satisfying `cond`, or `timeout` elapses. Mirrors the helper in +/// `quic-datagram/tests/common.rs`: the first egress to a fresh peer +/// is consumed as the dial trigger and dropped; subsequent retries +/// ride the resulting `Established`. Identity rotation evicts the +/// table, so the post-rotation send also needs the same retry loop. 
+fn send_until_received( + rt: &Runtime, + egress: &tokio::sync::mpsc::Sender, + target_pk: Pubkey, + target_addr: std::net::SocketAddr, + payload: Bytes, + rx: &Receiver, + timeout: Duration, + mut cond: impl FnMut(&Datagram) -> Option, + msg: &str, +) -> T { + const POLL_INTERVAL: Duration = Duration::from_millis(50); + let deadline = Instant::now() + timeout; + while Instant::now() < deadline { + rt.block_on(async { + let _ = egress.try_send(Datagram { + peer_pubkey: target_pk, + peer_address: target_addr, + message: payload.clone(), + }); + }); + if let Ok(item) = rx.recv_timeout(POLL_INTERVAL) + && let Some(t) = cond(&item) + { + return t; + } + } + panic!("{msg}"); +} + +#[test] +fn identity_rotation_via_votor_wrapper() { + let rt = Runtime::new().expect("tokio runtime"); + + // Server endpoint with AllowAll allowlist so any client pubkey is + // accepted - we're testing the rotation path, not the allowlist. + let server_kp = Keypair::new(); + let server_pubkey = server_kp.pubkey(); + let server_socket = bind_to_localhost_unique().expect("server bind"); + let server_addr = server_socket.local_addr().expect("server addr"); + let (server_ingress_tx, server_ingress_rx) = crossbeam_channel::bounded(4096); + let server_banlist = Arc::new(Banlist::::default()); + let server = datagram_endpoint::spawn( + rt.handle(), + &server_kp, + server_socket, + server_ingress_tx, + Arc::new(AllowAll), + server_banlist, + ) + .expect("server endpoint"); + + // Client endpoint with K1 - strictly below the server so the + // lex-correct dial goes client -> server. + let k1 = keypair_below(&server_pubkey); + let k1_pubkey = k1.pubkey(); + let client_socket = bind_to_localhost_unique().expect("client bind"); + let (client_ingress_tx, _client_ingress_rx) = crossbeam_channel::bounded(4096); + let client_banlist = Arc::new(Banlist::::default()); + // Client allowlist must include server_pubkey so its dial-side + // allowlist check passes. 
Use a StakedNodesAllowlist populated + // with the server's pubkey (mirrors how the cache would seed it + // from BankForks's staked_nodes). + let admit: HashMap<_, _> = std::iter::once((server_pubkey, 100u64)).collect(); + let client = datagram_endpoint::spawn( + rt.handle(), + &k1, + client_socket, + client_ingress_tx, + Arc::new(StakedNodesAllowlist::new(admit)), + client_banlist, + ) + .expect("client endpoint"); + + // Send one message under K1. Server should observe it attributed + // to K1's pubkey. Retry until landed: the first send to a fresh + // peer is the dial trigger and is dropped. + let p1 = Bytes::from_static(b"under-K1"); + send_until_received( + &rt, + &client.egress, + server_pubkey, + server_addr, + p1.clone(), + &server_ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k1_pubkey && d.message == p1).then_some(()), + "server never received message attributed to K1", + ); + + // Rotate to K2 via the wrapper's KeyUpdater handle. K2 must also + // be lex-below the server so the dial after rotation still goes + // the right direction. + let k2 = keypair_below(&server_pubkey); + let k2_pubkey = k2.pubkey(); + assert_ne!(k1_pubkey, k2_pubkey, "K1 and K2 must differ"); + NotifyKeyUpdate::update_key(client.key_updater.as_ref(), &k2) + .expect("key updater accepts rotation"); + + // Give the control loop a beat to apply the rotation (rebuild TLS + // configs, swap, evict the K1 connection). + std::thread::sleep(Duration::from_millis(500)); + + // Send under K2. Server should now observe K2's pubkey. Same retry + // pattern: rotation wiped the table, so the post-rotation egress is + // also a dial trigger and is dropped on the first try. 
+ let p2 = Bytes::from_static(b"under-K2"); + send_until_received( + &rt, + &client.egress, + server_pubkey, + server_addr, + p2.clone(), + &server_ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k2_pubkey && d.message == p2).then_some(()), + "server never received message attributed to K2 after rotation", + ); + + // Identity rotation is the local endpoint closing its own + // connections with IDENTITY_ROTATED - not a HANDOVER event for + // the peer. The peer's read loop reaps the entry without soft-ban + // and accepts the fresh K2 handshake on the next send. + drop(client); + drop(server); +}