From 6ead31487fa0962fea63efdd3553c13b0ec1c1c2 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:47:48 +0000 Subject: [PATCH 1/6] votor: add datagram transport module --- Cargo.lock | 9 + Cargo.toml | 1 + votor/Cargo.toml | 13 +- votor/src/datagram_transport/allowlist.rs | 57 ++ votor/src/datagram_transport/client.rs | 359 +++++++++++ votor/src/datagram_transport/endpoint.rs | 341 ++++++++++ votor/src/datagram_transport/error.rs | 105 +++ votor/src/datagram_transport/mod.rs | 50 ++ votor/src/datagram_transport/server.rs | 752 ++++++++++++++++++++++ votor/src/datagram_transport/stats.rs | 187 ++++++ votor/src/datagram_transport/testutils.rs | 142 ++++ votor/src/datagram_transport/transport.rs | 155 +++++ votor/src/lib.rs | 1 + votor/tests/datagram_transport.rs | 367 +++++++++++ 14 files changed, 2537 insertions(+), 2 deletions(-) create mode 100644 votor/src/datagram_transport/allowlist.rs create mode 100644 votor/src/datagram_transport/client.rs create mode 100644 votor/src/datagram_transport/endpoint.rs create mode 100644 votor/src/datagram_transport/error.rs create mode 100644 votor/src/datagram_transport/mod.rs create mode 100644 votor/src/datagram_transport/server.rs create mode 100644 votor/src/datagram_transport/stats.rs create mode 100644 votor/src/datagram_transport/testutils.rs create mode 100644 votor/src/datagram_transport/transport.rs create mode 100644 votor/tests/datagram_transport.rs diff --git a/Cargo.lock b/Cargo.lock index e278ac5e0b1..9962caa61fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -435,14 +435,21 @@ dependencies = [ "agave-math-utils", "agave-votor", "agave-votor-messages", + "arc-swap", + "arrayvec", "bitvec", + "bytes", "crossbeam-channel", + "futures 0.3.32", "itertools 0.14.0", "lazy-lru", "log", "parking_lot 0.12.3", "qualifier_attr", + "quinn", + "quinn-proto", "rand 0.9.4", + "rustls", "serde", "serde_bytes", "solana-accounts-db", @@ -472,6 +479,7 @@ dependencies = [ "solana-signer-store", "solana-streamer", 
"solana-time-utils", + "solana-tls-utils", "solana-transaction", "solana-transaction-error", "solana-vote", @@ -479,6 +487,7 @@ dependencies = [ "tempfile", "test-case", "thiserror 2.0.18", + "tokio", "tokio-util 0.7.18", "wincode", ] diff --git a/Cargo.toml b/Cargo.toml index d588879bf19..157fa41ec30 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -308,6 +308,7 @@ protobuf-src = "1.1.0" protosol = "=8.2.0" qualifier_attr = { version = "0.2.2", default-features = false } quinn = "0.11.9" +quinn-proto = "0.11.14" rand = "0.9.4" rand_chacha = "0.9.0" rayon = "1.12.0" diff --git a/votor/Cargo.toml b/votor/Cargo.toml index 50794e2696d..2fea1737d3a 100644 --- a/votor/Cargo.toml +++ b/votor/Cargo.toml @@ -32,13 +32,20 @@ frozen-abi = [ agave-logger = { workspace = true } agave-math-utils = { workspace = true } agave-votor-messages = { workspace = true } +arc-swap = { workspace = true } +arrayvec = { workspace = true } bitvec = { workspace = true } +bytes = { workspace = true } crossbeam-channel = { workspace = true } +futures = { workspace = true } itertools = { workspace = true } lazy-lru = { workspace = true } log = { workspace = true } parking_lot = { workspace = true } qualifier_attr = { workspace = true } +quinn = { workspace = true } +quinn-proto = { workspace = true } +rustls = { workspace = true } serde = { workspace = true } serde_bytes = { workspace = true } solana-accounts-db = { workspace = true } @@ -61,6 +68,7 @@ solana-leader-schedule = { workspace = true } solana-ledger = { workspace = true } solana-measure = { workspace = true } solana-metrics = { workspace = true } +solana-net-utils = { workspace = true } solana-pubkey = { workspace = true } solana-rpc = { workspace = true } solana-runtime = { workspace = true } @@ -69,24 +77,25 @@ solana-signer = { workspace = true } solana-signer-store = { workspace = true } solana-streamer = { workspace = true } solana-time-utils = { workspace = true } +solana-tls-utils = { workspace = true, features = 
["agave-unstable-api"] } solana-transaction = { workspace = true } solana-transaction-error = { workspace = true } solana-vote = { workspace = true } solana-vote-program = { workspace = true } thiserror = { workspace = true } +tokio = { workspace = true, features = ["rt-multi-thread", "macros", "sync", "time"] } +tokio-util = { workspace = true } wincode = { workspace = true, features = ["alloc"] } [dev-dependencies] agave-votor = { path = ".", features = ["agave-unstable-api", "dev-context-only-utils"] } rand = { workspace = true } -solana-net-utils = { path = "../net-utils", features = ["agave-unstable-api"] } solana-perf = { path = "../perf", features = ["agave-unstable-api", "dev-context-only-utils"] } solana-runtime = { path = "../runtime", features = ["agave-unstable-api", "dev-context-only-utils"] } solana-sdk-ids = { workspace = true } solana-streamer = { path = "../streamer", features = ["agave-unstable-api", "dev-context-only-utils"] } tempfile = { workspace = true } test-case = { workspace = true } -tokio-util = { workspace = true } [lints] workspace = true diff --git a/votor/src/datagram_transport/allowlist.rs b/votor/src/datagram_transport/allowlist.rs new file mode 100644 index 00000000000..1d62df1e2cc --- /dev/null +++ b/votor/src/datagram_transport/allowlist.rs @@ -0,0 +1,57 @@ +use { + arc_swap::ArcSwap, + solana_pubkey::Pubkey, + std::{collections::HashMap, sync::Arc}, +}; + +pub trait Allowlist: Send + Sync + 'static { + /// Called once per handshake, then periodically during connection lifetime. + /// A `false` answer closes the connection with the `NOT_ADMITTED` error code. + fn allow(&self, peer: &Pubkey) -> bool; +} + +/// Snapshot of the currently-allowed peer set. +#[derive(Default)] +pub struct StakedNodesAllowlist { + inner: ArcSwap>, +} + +impl StakedNodesAllowlist { + /// `peers` maps admitted pubkeys to epoch stake. 
+ pub fn new(peers: HashMap) -> Self { + Self { + inner: ArcSwap::new(Arc::new(peers)), + } + } + + /// Atomically publish a new allowlist generation. + pub fn swap(&self, peers: HashMap) { + self.inner.store(Arc::new(peers)); + } + + /// Number of allowed peers in the current generation. + pub fn len(&self) -> usize { + self.inner.load().len() + } + + pub fn is_empty(&self) -> bool { + self.inner.load().is_empty() + } +} + +impl Allowlist for StakedNodesAllowlist { + fn allow(&self, peer: &Pubkey) -> bool { + self.inner.load().contains_key(peer) + } +} + +/// Allow every peer. +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub struct AllowAll; + +#[cfg(any(test, feature = "dev-context-only-utils"))] +impl Allowlist for AllowAll { + fn allow(&self, _: &Pubkey) -> bool { + true + } +} diff --git a/votor/src/datagram_transport/client.rs b/votor/src/datagram_transport/client.rs new file mode 100644 index 00000000000..b2ee9e729b6 --- /dev/null +++ b/votor/src/datagram_transport/client.rs @@ -0,0 +1,359 @@ +//! Outbound (client) direction: we-dial, send-only. +use { + crate::datagram_transport::{ + close_codes, + endpoint::{Datagram, METRICS_INTERVAL}, + error::Error, + stats::{self, QuicDatagramStats, add, record_error}, + transport::{IdentitySnapshot, new_client_config}, + }, + bytes::Bytes, + log::{error, info, warn}, + quinn::{Connection, Endpoint, SendDatagramError}, + solana_net_utils::banlist::Banlist, + solana_pubkey::Pubkey, + solana_tls_utils::{get_remote_pubkey, socket_addr_to_quic_server_name}, + std::{ + collections::{HashMap, hash_map::Entry}, + net::SocketAddr, + sync::{Arc, atomic::Ordering}, + }, + tokio::{ + sync::{mpsc, watch}, + time::MissedTickBehavior, + }, + tokio_util::sync::CancellationToken, +}; + +/// State of a peer's entry in the outbound table. +pub(crate) enum OutgoingEntry { + /// A placeholder installed by the egress path before spawning a dial + /// task. 
Exactly one dial is ever in flight per peer (later egress sees + /// `Dialing` and drops). + Dialing, + Established(Connection), +} + +impl OutboundLoop { + /// Outbound packet dispatch. Returns `true` if the + /// caller must spawn a dial task (the trigger datagram is carried into it), + /// `false` if the datagram was sent or dropped. The 4-case decision over the + /// `outgoing` entry for `peer`: + /// * vacant -> initiate new connection (-> true), + /// * dial in progress -> drop (-> false), + /// * established to `addr` -> send (-> false; or true if the conn was lost), + /// * established to a different addr -> evict (`PEER_MOVED`) + redial (-> true). + fn send_outbound(&mut self, peer: Pubkey, addr: SocketAddr, bytes: &Bytes) -> bool { + match self.outgoing.entry(peer) { + Entry::Vacant(slot) => { + slot.insert(OutgoingEntry::Dialing); + true + } + Entry::Occupied(mut slot) => match slot.get() { + OutgoingEntry::Dialing => { + self.stats + .egress_dropped_dial_in_progress + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + false + } + OutgoingEntry::Established(conn) if conn.remote_address() == addr => { + match conn.send_datagram(bytes.clone()) { + Ok(()) => { + add(&self.stats.datagrams_sent); + false + } + Err(SendDatagramError::ConnectionLost(_)) => { + // Connection is dead; swap to Dialing so the + // caller re-dials with this datagram as trigger. + *slot.get_mut() = OutgoingEntry::Dialing; + true + } + Err(e) => { + record_error(&Error::from(e), &self.stats); + false + } + } + } + OutgoingEntry::Established(_) => { + // Peer moved - swap the slot to `Dialing`... + let old = std::mem::replace(slot.get_mut(), OutgoingEntry::Dialing); + // ... and close the displaced connection with PEER_MOVED. 
+ if let OutgoingEntry::Established(old_conn) = old { + close_codes::PEER_MOVED.close(&old_conn); + self.stats + .connection_evicted_peer_moved + .fetch_add(1, std::sync::atomic::Ordering::Relaxed); + info!("peer {peer} moved; re-dialing at {addr}"); + } + true + } + }, + } + } +} + +/// Event reported by a dial task or close-watcher to the outbound control loop. +pub(crate) enum OutboundEvent { + /// Dial result: `Ok(conn)` on success, `Err(())` if the dial or identity + /// check failed. + Dialed { + peer: Pubkey, + generation: u64, + outcome: Result, + }, + /// An established outbound connection closed; the loop reaps its slot. + Closed { + peer: Pubkey, + generation: u64, + stable_id: usize, + }, +} + +/// An outbound dial: connect, validate the server identity, and report the +/// outcome to the control loop. Built by the loop and dispatched via +/// [`Self::spawn`]. +pub(crate) struct ClientConnection { + pub(crate) endpoint: Endpoint, + pub(crate) peer: Pubkey, + pub(crate) addr: SocketAddr, + pub(crate) generation: u64, + pub(crate) trigger: Bytes, + pub(crate) events: mpsc::Sender, + pub(crate) stats: Arc, +} + +impl ClientConnection { + async fn run(self) { + let outcome = match self.dial().await { + Ok(conn) => { + match conn.send_datagram(self.trigger) { + Ok(()) => add(&self.stats.datagrams_sent), + Err(e) => record_error(&Error::from(e), &self.stats), + } + Ok(conn) + } + Err(e) => { + error!( + "Connection attempt to ({}, {}) failed: {e:?}", + self.peer, self.addr + ); + record_error(&e, &self.stats); + Err(()) + } + }; + // Blocking send to make sure we clear the `Dialing` placeholder. If the + // send fails there is nothing left to do. 
+ let _ = self + .events + .send(OutboundEvent::Dialed { + peer: self.peer, + generation: self.generation, + outcome, + }) + .await; + } + + async fn dial(&self) -> Result { + let server_name = socket_addr_to_quic_server_name(self.addr); + let connection = self.endpoint.connect(self.addr, &server_name)?.await?; + // Server identity must match the pubkey the caller targeted. + // A missing or mismatched pubkey both close with INVALID_IDENTITY. + if get_remote_pubkey(&connection) != Some(self.peer) { + close_codes::INVALID_IDENTITY.close(&connection); + return Err(Error::InvalidIdentity(self.addr)); + } + Ok(connection) + } +} + +/// Outbound control loop: client egress (we-dial, send-only). Owns the outgoing +/// table and the dial-task event channel. +pub(crate) struct OutboundLoop { + pub(crate) endpoint: Endpoint, + pub(crate) local_pubkey: Pubkey, + /// Identity-rotation counter, local to this loop. + pub(crate) generation: u64, + pub(crate) egress_rx: mpsc::Receiver, + pub(crate) banlist: Arc>, + pub(crate) identity_rx: watch::Receiver>>, + /// Outbound table (per-peer send-only connection state). + pub(crate) outgoing: HashMap, + /// Cloned into every dial task so it can report its [`OutboundEvent`]. + pub(crate) events_tx: mpsc::Sender, + pub(crate) events_rx: mpsc::Receiver, + pub(crate) shutdown: CancellationToken, + pub(crate) stats: Arc, + /// Held so an identity rotation can rebuild the client TLS config. + pub(crate) alpn: &'static [u8], +} + +impl OutboundLoop { + pub(crate) async fn run(mut self) { + let mut metrics = tokio::time::interval(METRICS_INTERVAL); + metrics.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // The identity arm tolerates the `KeyUpdater` sender being dropped (some + // local-cluster tests drop it); once closed we stop polling that arm. + let mut id_closed = false; + loop { + tokio::select! { + biased; + // Explicit shutdown. Fires only on an explicit `cancel()`; + // dropping the public handle's token clone never cancels it.
+ _ = self.shutdown.cancelled() => break, + changed = self.identity_rx.changed(), if !id_closed => { + if changed.is_err() { + warn!("identity rotation channel closed; outbound loop running without rotation support"); + id_closed = true; + continue; + } + let snap = self.identity_rx.borrow_and_update().clone(); + if let Some(snap) = snap { + self.apply_identity_change(snap); + } + } + maybe_datagram = self.egress_rx.recv() => { + let Some(datagram) = maybe_datagram else { break }; + self.handle_datagram(datagram); + } + // Dial outcomes. `recv()` never yields `None` - the loop holds + // an `events_tx`. + Some(event) = self.events_rx.recv() => self.handle_event(event), + _ = metrics.tick() => stats::report_client(&self.stats, self.outgoing.len() as u64), + } + } + } + + /// Rebuild the client TLS config against the new identity, swap it into the + /// quinn endpoint, evict the outbound table so peers are re-dialed, and + /// adopt the new pubkey. + fn apply_identity_change(&mut self, snap: Arc) { + let client_config = new_client_config(snap.cert.clone(), snap.key.clone_key(), self.alpn); + self.local_pubkey = snap.pubkey; + self.endpoint.set_default_client_config(client_config); + // Bump first so any in-flight dial that completes after this point is + // dropped at the event boundary (its event carries the old generation). 
+ self.generation = self.generation.wrapping_add(1); + let evicted = self + .outgoing + .drain() + .map(|(_, entry)| { + if let OutgoingEntry::Established(conn) = entry { + close_codes::IDENTITY_ROTATED.close(&conn); + } + }) + .count() as u64; + self.stats + .connection_evicted_identity_rotated + .fetch_add(evicted, Ordering::Relaxed); + info!( + "outbound identity rotated to {} ({} connection(s) evicted)", + snap.pubkey, evicted + ); + } + + fn handle_datagram(&mut self, datagram: Datagram) { + let Datagram { + peer_pubkey: peer, + peer_address: addr, + message: bytes, + } = datagram; + debug_assert_ne!(self.local_pubkey, peer, "egress to self is a caller bug"); + if self.banlist.is_banned(&peer) { + return; + } + + if !self.send_outbound(peer, addr, &bytes) { + return; + } + + tokio::spawn( + ClientConnection { + endpoint: self.endpoint.clone(), + peer, + addr, + generation: self.generation, + trigger: bytes, + events: self.events_tx.clone(), + stats: self.stats.clone(), + } + .run(), + ); + } + + /// Apply a dial result or a connection-close notification. + fn handle_event(&mut self, event: OutboundEvent) { + let (peer, generation) = match &event { + OutboundEvent::Dialed { + peer, generation, .. + } + | OutboundEvent::Closed { + peer, generation, .. + } => (*peer, *generation), + }; + if generation != self.generation { + if let OutboundEvent::Dialed { + outcome: Ok(conn), .. + } = event + { + close_codes::IDENTITY_ROTATED.close(&conn); + self.stats + .connection_evicted_identity_rotated + .fetch_add(1, Ordering::Relaxed); + } + return; + } + match event { + OutboundEvent::Dialed { + outcome: Ok(conn), .. 
+ } => match self.outgoing.get_mut(&peer) { + Some(slot @ OutgoingEntry::Dialing) => { + *slot = OutgoingEntry::Established(conn.clone()); + self.stats + .record_connection_count(self.outgoing.len() as u64); + self.spawn_close_watcher(peer, conn); + } + _ => { + close_codes::IDENTITY_ROTATED.close(&conn); + record_error(&Error::IdentityRotated(peer), &self.stats); + } + }, + OutboundEvent::Dialed { + outcome: Err(()), .. + } => { + if let Entry::Occupied(slot) = self.outgoing.entry(peer) + && matches!(slot.get(), OutgoingEntry::Dialing) + { + slot.remove(); + } + } + // Reap only if the closed connection is still the one we hold; a + // re-dial may have already replaced it. + OutboundEvent::Closed { stable_id, .. } => { + if let Entry::Occupied(slot) = self.outgoing.entry(peer) + && matches!(slot.get(), OutgoingEntry::Established(c) if c.stable_id() == stable_id) + { + slot.remove(); + } + } + } + } + + /// Watch an established connection and report [`OutboundEvent::Closed`] when + /// it ends so the loop can reap the slot. + fn spawn_close_watcher(&self, peer: Pubkey, conn: Connection) { + let events = self.events_tx.clone(); + let generation = self.generation; + tokio::spawn(async move { + let stable_id = conn.stable_id(); + conn.closed().await; + let _ = events + .send(OutboundEvent::Closed { + peer, + generation, + stable_id, + }) + .await; + }); + } +} diff --git a/votor/src/datagram_transport/endpoint.rs b/votor/src/datagram_transport/endpoint.rs new file mode 100644 index 00000000000..83edddb3527 --- /dev/null +++ b/votor/src/datagram_transport/endpoint.rs @@ -0,0 +1,341 @@ +//! 
QUIC datagram endpoint +use { + crate::datagram_transport::{ + EGRESS_CHANNEL_CAP, MAX_PEERS, + allowlist::Allowlist, + client::{OutboundEvent, OutboundLoop}, + error::Error, + server::{InboundEvent, InboundLoop}, + stats::QuicDatagramStats, + transport::{IdentitySnapshot, new_client_config, new_server_config}, + }, + bytes::Bytes, + crossbeam_channel::Sender, + quinn::{Endpoint, EndpointConfig, TokioRuntime}, + solana_keypair::{Keypair, Signer}, + solana_net_utils::{banlist::Banlist, token_bucket::TokenBucket}, + solana_pubkey::Pubkey, + solana_tls_utils::{NotifyKeyUpdate, new_dummy_x509_certificate}, + std::{ + collections::HashMap, + net::{SocketAddr, UdpSocket}, + sync::Arc, + time::Duration, + }, + tokio::sync::{mpsc, watch}, + tokio_util::sync::CancellationToken, +}; + +/// Handle for caller-driven identity rotation. Not `Clone` itself; share it via `Arc`. +pub struct KeyUpdater { + tx: watch::Sender>>, +} + +impl NotifyKeyUpdate for KeyUpdater { + fn update_key(&self, keypair: &Keypair) -> Result<(), Box> { + let snap = Arc::new(IdentitySnapshot::from_keypair(keypair)); + self.tx + .send(Some(snap)) + .map_err(|_| -> Box { + "votor datagram endpoint has shut down; identity update rejected".into() + })?; + Ok(()) + } +} + +pub(crate) const METRICS_INTERVAL: Duration = Duration::from_secs(2); + +/// Capacity of the task -> control-loop connection-event channel. +const CONN_EVENT_CHANNEL_CAP: usize = MAX_PEERS as usize; + +/// Datagram envelope used in both directions of the endpoint. +#[derive(Debug)] +pub struct Datagram { + pub peer_pubkey: Pubkey, + pub peer_address: SocketAddr, + pub message: Bytes, +} + +/// Datagram-only QUIC endpoint bound to a UDP socket. +pub struct QuicDatagramEndpoint { + pub egress: mpsc::Sender, + /// Handle for rotating the local identity (TLS cert / pubkey). + pub key_updater: Arc, + shutdown: CancellationToken, +} + +impl QuicDatagramEndpoint { + /// Construct a datagram-only QUIC endpoint bound to `socket`.
Spawns the + /// outbound and inbound control loops on `runtime`. Received datagrams flow into + /// `ingress` via `try_send`; a full ingress channel results in a drop + /// (counted in `datagram_ingress_dropped_channel_full`). + /// + /// `allowlist` is handed to the inbound loop only; outbound dials do not consult it. + /// `banlist` is consulted on every send and at handshake. + #[allow(clippy::too_many_arguments)] + pub fn new( + runtime: &tokio::runtime::Handle, + keypair: &Keypair, + socket: UdpSocket, + alpn_protocol_id: &'static [u8], + ingress: Sender, + allowlist: Arc, + banlist: Arc>, + ) -> Result { + let local_pubkey = keypair.pubkey(); + let (cert, key) = new_dummy_x509_certificate(keypair); + let server_config = new_server_config(cert.clone(), key.clone_key(), alpn_protocol_id); + let client_config = new_client_config(cert, key, alpn_protocol_id); + + let mut endpoint = { + // Endpoint::new requires being inside the runtime context, else it + // panics on its first internal `tokio::spawn`. + let _guard = runtime.enter(); + Endpoint::new( + EndpointConfig::default(), + Some(server_config), + socket, + Arc::new(TokioRuntime), + ) + .map_err(Error::Endpoint)? + }; + endpoint.set_default_client_config(client_config); + + // Independent stats instances: each loop owns one and reports it under + // its own datapoint, so the two directions share no atomics.
+ let client_stats = Arc::::default(); + let server_stats = Arc::::default(); + let (egress_tx, egress_rx) = mpsc::channel(EGRESS_CHANNEL_CAP); + let (out_events_tx, out_events_rx) = mpsc::channel::(CONN_EVENT_CHANNEL_CAP); + let (in_events_tx, in_events_rx) = mpsc::channel::(CONN_EVENT_CHANNEL_CAP); + let shutdown = CancellationToken::new(); + let (id_tx, identity_rx) = watch::channel(None); + let key_updater = Arc::new(KeyUpdater { tx: id_tx }); + + // TODO: change this to only ever admit gossip-advertised IPs + // Global cap: burst handles a ~200-node cold-start wave; 40/s sustained + // caps a many-IP botnet flood to the same 40 TLS handshakes/s. + let handshake_global_limiter = TokenBucket::new(200, 200, 40.0); + + let outbound = OutboundLoop { + endpoint: endpoint.clone(), + local_pubkey, + generation: 0, + egress_rx, + banlist: banlist.clone(), + identity_rx: identity_rx.clone(), + outgoing: HashMap::new(), + events_tx: out_events_tx, + events_rx: out_events_rx, + shutdown: shutdown.clone(), + stats: client_stats, + alpn: alpn_protocol_id, + }; + runtime.spawn(outbound.run()); + let inbound = InboundLoop { + endpoint: endpoint.clone(), + generation: 0, + ingress, + banlist, + allowlist, + identity_rx, + incoming: HashMap::new(), + events_tx: in_events_tx, + events_rx: in_events_rx, + handshake_global_limiter, + stats: server_stats, + alpn: alpn_protocol_id, + shutdown: shutdown.clone(), + }; + runtime.spawn(inbound.run()); + + Ok(Self { + egress: egress_tx, + key_updater, + shutdown, + }) + } + + /// Initiate endpoint shutdown. 
+ pub fn close(&self) { + self.shutdown.cancel(); + } +} + +#[cfg(test)] +mod tests { + use { + crate::datagram_transport::{ + allowlist::AllowAll, + testutils::{drain_matching, make_runtime, send_until_received, spawn_node}, + }, + bytes::Bytes, + solana_keypair::{Keypair, Signer}, + solana_tls_utils::NotifyKeyUpdate, + std::{sync::Arc, time::Duration}, + }; + + #[test] + /// Split-direction model: each peer reaches the other by dialing its *own* + /// outbound connection, so traffic flows in both directions regardless of + /// pubkey ordering (no lex tiebreaker, no "higher side waits to be dialed"). + /// A→B and B→A ride two independent unidirectional connections. + fn both_directions_use_independent_connections() { + let rt = make_runtime(); + let a = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let b = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + // A → B: A dials its outbound to B; B receives on its inbound from A. + let from_a = Bytes::from_static(b"from-A"); + send_until_received( + &rt, + &a.endpoint, + b.pubkey(), + b.addr, + from_a.clone(), + &b.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == a.pubkey() && d.message == from_a).then_some(()), + "B did not receive A's datagram", + ); + drain_matching(&b.ingress_rx, Duration::from_millis(200), |d| { + d.message == from_a + }); + + // B → A: a *separate* connection (B's own outbound) carries this - B + // does not reuse the inbound A dialed. A receives on its inbound from B. 
+ let from_b = Bytes::from_static(b"from-B"); + send_until_received( + &rt, + &b.endpoint, + a.pubkey(), + a.addr, + from_b.clone(), + &a.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == b.pubkey() && d.message == from_b).then_some(()), + "A did not receive B's datagram", + ); + } + + #[test] + fn rotation_evicts_connections_and_resends_under_new_identity() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let k1 = Keypair::new(); + let k1_pk = k1.pubkey(); + let client = spawn_node(&rt, Arc::new(AllowAll), k1); + + // Send under K1. Server should observe message attributed to K1. + let p1 = Bytes::from_static(b"under-K1"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + p1.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k1_pk && d.message == p1).then_some(()), + "server never received message attributed to K1", + ); + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p1 + }); + + // Rotate to K2. + let k2 = Keypair::new(); + let k2_pk = k2.pubkey(); + assert_ne!(k1_pk, k2_pk, "K1 and K2 must differ"); + client + .endpoint + .key_updater + .update_key(&k2) + .expect("identity rotation accepted"); + + // The control loop applies the rotation asynchronously: rebuild TLS + // configs, evict cached connections (server sees IDENTITY_ROTATED). + // Give it a beat. + std::thread::sleep(Duration::from_millis(500)); + + // Send under K2. Server's table no longer holds the K1 entry; a + // fresh handshake under K2 establishes a new connection, and the + // server observes the message attributed to K2. 
+ let p2 = Bytes::from_static(b"under-K2"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + p2.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == k2_pk && d.message == p2).then_some(()), + "server never received message attributed to K2 after rotation", + ); + + // Client side: the rotated client should NOT have banned the server + // (rotation is a local event; we never soft-ban peers we close + // ourselves, and there is no handover signal to react to). + assert!( + !client.banlist.is_banned(&server.pubkey()), + "rotation must not ban peers we close ourselves" + ); + } + + #[test] + fn outbound_addr_change_redials_new_addr() { + let rt = make_runtime(); + + let s1 = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let s_key = s1.keypair.insecure_clone(); + let s_pubkey = s1.pubkey(); + + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + // Initial send: client dials S1 at its addr A1. + let p1 = Bytes::from_static(b"p1"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + s1.addr, + p1.clone(), + &s1.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p1).then_some(()), + "S1 did not receive p1", + ); + drain_matching(&s1.ingress_rx, Duration::from_millis(200), |d| { + d.message == p1 + }); + + // S2 takes the same identity at a new addr - simulates a peer host + // move with gossip publishing a new SocketAddr for the same pubkey. + let s2 = spawn_node(&rt, Arc::new(AllowAll), s_key); + assert_ne!(s1.addr, s2.addr, "S1 and S2 must bind distinct addrs"); + + // Send to the new addr. Client must observe the addr mismatch, + // evict its cached conn to A1, and re-dial A2. 
+ let p2 = Bytes::from_static(b"p2"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + s2.addr, + p2.clone(), + &s2.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p2).then_some(()), + "S2 (post-move) did not receive p2", + ); + + // Stale conn to S1 was evicted; S1 must not see post-move datagrams. + let stray = s1.ingress_rx.recv_timeout(Duration::from_millis(800)); + assert!( + stray.is_err(), + "S1 must not see post-move datagrams; got {stray:?}" + ); + } +} diff --git a/votor/src/datagram_transport/error.rs b/votor/src/datagram_transport/error.rs new file mode 100644 index 00000000000..a9eb2b3c697 --- /dev/null +++ b/votor/src/datagram_transport/error.rs @@ -0,0 +1,105 @@ +use { + quinn::{ConnectError, ConnectionError, SendDatagramError}, + solana_pubkey::Pubkey, + std::{io, net::SocketAddr}, + thiserror::Error, +}; + +/// Application close codes and reason strings, paired. +/// +/// Numeric codes are part of the wire format - adding a new one is fine, +/// changing the value of an existing one is a breaking protocol change. +pub(crate) mod close_codes { + use quinn::{Connection, VarInt}; + + pub(crate) struct Spec { + pub code: VarInt, + pub reason: &'static [u8], + } + + impl Spec { + /// Close `conn` with this spec's code/reason pair. 
+ pub fn close(&self, conn: &Connection) { + conn.close(self.code, self.reason); + } + } + + pub(crate) const INVALID_IDENTITY: Spec = Spec { + code: VarInt::from_u32(2), + reason: b"INVALID_IDENTITY", + }; + + pub(crate) const NOT_ADMITTED: Spec = Spec { + code: VarInt::from_u32(3), + reason: b"NOT_ADMITTED", + }; + + pub(crate) const BANNED: Spec = Spec { + code: VarInt::from_u32(4), + reason: b"BANNED", + }; + + pub(crate) const TABLE_FULL: Spec = Spec { + code: VarInt::from_u32(5), + reason: b"TABLE_FULL", + }; + + pub(crate) const IDENTITY_ROTATED: Spec = Spec { + code: VarInt::from_u32(11), + reason: b"IDENTITY_ROTATED", + }; + + pub(crate) const PEER_MOVED: Spec = Spec { + code: VarInt::from_u32(12), + reason: b"PEER_MOVED", + }; +} + +/// All errors observed by the endpoint. Returned from public APIs and stamped +/// into [`crate::datagram_transport::stats::QuicDatagramStats`] via `record_error`. +#[derive(Error, Debug)] +pub enum Error { + #[error("egress channel closed")] + EgressChannelClosed, + + #[error("ingress channel closed")] + IngressChannelClosed, + + #[error(transparent)] + Connect(#[from] ConnectError), + + #[error(transparent)] + Connection(#[from] ConnectionError), + + /// TLS handshake succeeded but the peer cert did not yield a recoverable + /// solana ed25519 pubkey. The connection is closed by the caller. + #[error("invalid identity from {0:?}")] + InvalidIdentity(SocketAddr), + + /// Peer pubkey is not in the allowlist. + #[error("peer {0} is not admitted (unstaked)")] + NotAdmitted(Pubkey), + + /// Peer pubkey is currently banned. + #[error("peer {0} is banned")] + Banned(Pubkey), + + /// Inbound refused at a capacity limit: this peer already holds + /// [`crate::datagram_transport::MAX_INBOUND_CONNECTIONS_PER_PEER`] connections. + #[error("connection table full")] + TableFull, + + #[error(transparent)] + SendDatagram(#[from] SendDatagramError), + + /// Identity rotated between handshake start and the post-handshake + /// install. 
The completed connection is authenticated under our previous + /// cert and was closed. + #[error("identity rotated mid-handshake; connection to {0} aborted")] + IdentityRotated(Pubkey), + + /// `quinn::Endpoint::new` failed (e.g. socket already bound by another + /// process). Construction-time only. + #[error(transparent)] + Endpoint(#[from] io::Error), +} diff --git a/votor/src/datagram_transport/mod.rs b/votor/src/datagram_transport/mod.rs new file mode 100644 index 00000000000..aad66b54272 --- /dev/null +++ b/votor/src/datagram_transport/mod.rs @@ -0,0 +1,50 @@ +//! Votor QUIC datagram transport. + +pub mod allowlist; +pub mod error; + +#[cfg(any(test, feature = "dev-context-only-utils"))] +pub mod testutils; + +pub(crate) mod client; +pub(crate) use error::close_codes; +pub mod endpoint; +pub(crate) mod server; +pub(crate) mod stats; +pub(crate) mod transport; + +use std::time::Duration; + +/// Maximum number of unique peer pubkeys we expect in steady state. +/// Sized for the maximum expected alpenglow staked-node count. +pub const MAX_PEERS: u64 = 2000; + +/// Maximum simultaneous inbound (we-accepted, receive-only) connections we keep +/// from a single peer pubkey. Two is enough to let a hot-spare of the same +/// identity connect while the previous instance's connection is still alive. +pub const MAX_INBOUND_CONNECTIONS_PER_PEER: usize = 2; + +/// Capacity of the egress channel held by [`endpoint::QuicDatagramEndpoint`]. Senders +/// must `try_send` and accept drop-on-full. +/// +/// Sized to absorb a full slot's burst with headroom - `MAX_PEERS` peers +/// × ~4 messages/slot = ~8 K items. +pub const EGRESS_CHANNEL_CAP: usize = 4 * MAX_PEERS as usize; + +/// Per-peer receive-side rate limit. +pub const MAX_DATAGRAMS_PER_SECOND_PER_PEER: f64 = 30.0; + +/// Per-peer receive-side burst limit. +pub const PEER_RATE_LIMIT_BURST: u64 = 100; + +/// Per-peer receive-side DOS protection limit.
+/// A peer that exhausts this burst size is considered to be attacking and is banned. +pub const PEER_RATE_LIMIT_BURST_DOS: u64 = 100000; + +/// How long a peer stays banned after exceeding the DoS budget. +pub const BAN_DURATION_DOS: Duration = Duration::from_hours(48); + +/// How often each connection's read loop re-checks whether the peer is still +/// in the allowlist. Stale (epoch-evicted) connections are closed within one +/// interval after the allowlist snapshot is updated. +pub const ALLOWLIST_CHECK_INTERVAL: Duration = Duration::from_secs(10); diff --git a/votor/src/datagram_transport/server.rs b/votor/src/datagram_transport/server.rs new file mode 100644 index 00000000000..56338fd3d56 --- /dev/null +++ b/votor/src/datagram_transport/server.rs @@ -0,0 +1,752 @@ +//! Inbound (server) direction: we-accept, receive-only. + +use { + crate::datagram_transport::{ + ALLOWLIST_CHECK_INTERVAL, BAN_DURATION_DOS, MAX_DATAGRAMS_PER_SECOND_PER_PEER, + MAX_INBOUND_CONNECTIONS_PER_PEER, PEER_RATE_LIMIT_BURST, PEER_RATE_LIMIT_BURST_DOS, + allowlist::Allowlist, + close_codes, + endpoint::{Datagram, METRICS_INTERVAL}, + error::Error, + stats::{self, QuicDatagramStats, add, record_error}, + transport::{IdentitySnapshot, new_server_config}, + }, + crossbeam_channel::{Sender, TrySendError}, + log::{debug, info, warn}, + quinn::{Connection, Endpoint, Incoming}, + solana_net_utils::{banlist::Banlist, token_bucket::TokenBucket}, + solana_pubkey::Pubkey, + solana_tls_utils::get_remote_pubkey, + std::{ + collections::{HashMap, hash_map::Entry}, + net::SocketAddr, + sync::{Arc, atomic::Ordering}, + time::Duration, + }, + tokio::{ + sync::{mpsc, watch}, + time::MissedTickBehavior, + }, + tokio_util::sync::CancellationToken, +}; + +/// Interval of pruning for banlist entries that are no longer valid. +const BANLIST_PRUNE_INTERVAL: Duration = Duration::from_secs(60 * 60); + +/// Event reported by an accept or read task to the inbound control loop.
+pub(crate) enum InboundEvent { + /// A TLS handshake completed and yielded an authenticated peer. + Accepted { + peer: Pubkey, + conn: Connection, + generation: u64, + }, + /// An inbound (we-accepted) connection ended. The read loop reports this + /// so the control loop can reap the table slot. + Closed { + peer: Pubkey, + generation: u64, + stable_id: usize, + }, +} + +impl InboundLoop { + /// Live inbound connections (each pubkey may hold several). Sampled for the + /// peak-occupancy high-water mark. + fn incoming_len(&self) -> u64 { + self.incoming.values().map(Vec::len).sum::() as u64 + } + + /// Install a freshly-accepted receive-only connection for `peer`. We keep + /// up to [`MAX_INBOUND_CONNECTIONS_PER_PEER`] connections per pubkey so a + /// same-identity hot-spare can coexist with the original without a + /// handover. Returns: + /// - `Ok(())` if the connection took a slot (a fresh pubkey, or an + /// additional connection for a pubkey already present). + /// - `Err(())` if this pubkey already holds the per-peer maximum; caller + /// must close `conn` with `TABLE_FULL`. This should be rare in normal + /// operation. The count of distinct inbound pubkeys is not checked here: + /// admission control (allowlist) already bounds it to the staked set. + fn insert_inbound(&mut self, peer: Pubkey, conn: Connection) -> Result<(), ()> { + match self.incoming.entry(peer) { + Entry::Vacant(slot) => { + slot.insert(vec![conn]); + Ok(()) + } + Entry::Occupied(mut slot) => { + let conns = slot.get_mut(); + if conns.len() < MAX_INBOUND_CONNECTIONS_PER_PEER { + conns.push(conn); + Ok(()) + } else { + Err(()) + } + } + } + } + + /// Remove the connection with the given `stable_id` from `peer`'s inbound + /// set when its read loop exits; drop the map entry once its last + /// connection is gone. Called from `handle_event` on a `Closed` event. No-op if the + /// connection was already removed (e.g. by an identity rotation).
+ fn reap_incoming(&mut self, peer: &Pubkey, stable_id: usize) { + if let Entry::Occupied(mut slot) = self.incoming.entry(*peer) { + let conns = slot.get_mut(); + conns.retain(|c| c.stable_id() != stable_id); + if conns.is_empty() { + slot.remove(); + } + } + } +} + +/// An inbound accept: run the handshake and hand the connection (plus its +/// attested pubkey) to the control loop for admission. +pub(crate) struct ServerConnection { + pub(crate) incoming: Incoming, + pub(crate) generation: u64, + pub(crate) events: mpsc::Sender, + pub(crate) stats: Arc, +} + +impl ServerConnection { + async fn run(self) { + let remote_addr = self.incoming.remote_address(); + let conn = match async { self.incoming.accept()?.await }.await { + Ok(conn) => conn, + Err(e) => { + record_error(&Error::from(e), &self.stats); + return; + } + }; + let Some(peer) = get_remote_pubkey(&conn) else { + close_codes::INVALID_IDENTITY.close(&conn); + record_error(&Error::InvalidIdentity(remote_addr), &self.stats); + return; + }; + // Hand the connection to the loop for admission. The loop is the sole + // consumer of this channel and never sends into it, so awaiting here + // cannot deadlock - a momentarily-full channel just parks this task + // until the loop drains. If the send fails the loop is gone (shutdown) + // and dropping `conn` closes it. + let _ = self + .events + .send(InboundEvent::Accepted { + peer, + conn, + generation: self.generation, + }) + .await; + } +} + +/// Drive the per-connection read loop for an incoming connection. +/// Returns when the connection closes. On exit it reliably reports +/// [`InboundEvent::Closed`] so the control loop can reap the table entry. 
+#[allow(clippy::too_many_arguments)] +pub(crate) async fn read_datagram_loop( + connection: Connection, + peer: Pubkey, + remote_addr: SocketAddr, + generation: u64, + ingress: Sender, + allowlist: Arc, + banlist: Arc>, + events: mpsc::Sender, + stats: Arc, +) { + let stable_id = connection.stable_id(); + // Per-connection rate limiter. Any datagram arriving with the bucket + // at or below RATE_LIMIT_WATERMARK is dropped rather than punished: honest + // peers legitimately burst above the refill rate during catch-up. + // A datagram arriving on an empty bucket bans the peer and closes the connection. + const RATE_LIMIT_WATERMARK: u64 = PEER_RATE_LIMIT_BURST_DOS - PEER_RATE_LIMIT_BURST; + let rate_limit = TokenBucket::new( + PEER_RATE_LIMIT_BURST_DOS, + PEER_RATE_LIMIT_BURST_DOS, + MAX_DATAGRAMS_PER_SECOND_PER_PEER, + ); + let mut allowlist_check = tokio::time::interval(ALLOWLIST_CHECK_INTERVAL); + allowlist_check.tick().await; // skip the immediate first fire + loop { + tokio::select! { + result = connection.read_datagram() => { + match result { + Ok(bytes) => { + // Banlist check happens AFTER the read so a ban that + // lands while we're awaiting can't let a follow-up + // datagram leak through to ingress. + if banlist.is_banned(&peer) { + close_codes::BANNED.close(&connection); + break; + } + match rate_limit.consume_tokens(1) { + // green corridor - sender is behaving + Ok(remaining) if remaining > RATE_LIMIT_WATERMARK => {} + // red corridor - sender is bursting above normal + Ok(_) => { + drop(bytes); + stats.datagram_rate_limited.fetch_add(1, Ordering::Relaxed); + continue; + } + // we are under attack - kick the sender.
+ Err(_) => { + drop(bytes); + banlist.ban(peer, BAN_DURATION_DOS); + close_codes::BANNED.close(&connection); + break; + } + } + + match ingress.try_send(Datagram { + peer_pubkey: peer, + peer_address: remote_addr, + message: bytes, + }) { + Ok(()) => { + stats.datagrams_received.fetch_add(1, Ordering::Relaxed); + } + Err(TrySendError::Full(_)) => { + stats + .datagram_ingress_dropped_channel_full + .fetch_add(1, Ordering::Relaxed); + } + Err(TrySendError::Disconnected(_)) => { + debug!("ingress disconnected; reader for {peer} exiting"); + break; + } + } + } + Err(e) => { + // The peer (or we) closed this inbound, or it timed + // out. Record and exit; the control loop reaps the + // table slot from the `Closed` event below. + record_error(&Error::from(e), &stats); + break; + } + } + } + _ = allowlist_check.tick() => { + if !allowlist.allow(&peer) { + close_codes::NOT_ADMITTED.close(&connection); + stats.connection_evicted_allowlist.fetch_add(1, Ordering::Relaxed); + break; + } + } + } + } + // Send the notification to control that this connection died. + let _ = events + .send(InboundEvent::Closed { + peer, + generation, + stable_id, + }) + .await; +} + +/// Inbound control loop: server accept (we-accept, receive-only). Owns the +/// incoming table, the handshake DoS gate, admission, banlist eviction, and +/// read-loop reaping; spawned by [`crate::datagram_transport::endpoint::QuicDatagramEndpoint::new`]. +pub(crate) struct InboundLoop { + pub(crate) endpoint: Endpoint, + /// Identity-rotation counter + pub(crate) generation: u64, + pub(crate) ingress: Sender, + pub(crate) banlist: Arc>, + /// Policy for which peers may occupy a slot. Consulted by the admission + /// gate and handed to each read loop for its periodic re-check. + pub(crate) allowlist: Arc, + pub(crate) identity_rx: watch::Receiver>>, + /// Inbound table (per-peer accepted receive-only connections), owned solely + /// by this loop. 
+ pub(crate) incoming: HashMap>, + /// Cloned into every accept / read task so it can report its [`InboundEvent`]. + pub(crate) events_tx: mpsc::Sender, + pub(crate) events_rx: mpsc::Receiver, + /// Global cap on inbound handshake rate across all source IPs. + pub(crate) handshake_global_limiter: TokenBucket, + pub(crate) stats: Arc, + /// Held so an identity rotation can rebuild the server TLS config. + pub(crate) alpn: &'static [u8], + pub(crate) shutdown: CancellationToken, +} + +impl InboundLoop { + pub(crate) async fn run(mut self) { + let mut prune = tokio::time::interval(BANLIST_PRUNE_INTERVAL); + prune.set_missed_tick_behavior(MissedTickBehavior::Skip); + + let mut metrics = tokio::time::interval(METRICS_INTERVAL); + metrics.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // TODO: this flag is a workaround for some local-cluster tests that are a + // nightmare to refactor. But they really should be. + let mut id_closed = false; + loop { + tokio::select! { + biased; + _ = self.shutdown.cancelled() => break, + changed = self.identity_rx.changed(), if !id_closed => { + if changed.is_err() { + warn!("identity rotation channel closed; inbound loop running without rotation support"); + id_closed = true; + continue; + } + let snap = self.identity_rx.borrow_and_update().clone(); + if let Some(snap) = snap { + self.apply_identity_change(snap); + } + } + // Lifecycle results keep the table coherent; drained above + // accept so a flood of inbounds can't starve the reaping of + // dead connections. `recv()` never yields `None` - the loop + // holds an `events_tx`. + Some(event) = self.events_rx.recv() => self.handle_event(event), + maybe_incoming = self.endpoint.accept() => { + let Some(incoming) = maybe_incoming else { break }; + self.maybe_accept_connection(incoming); + } + // when idle we can take care of bookkeeping. If these are delayed + // it is usually not a problem. 
+ _ = prune.tick() => self.banlist.prune(), + _ = metrics.tick() => stats::report_server(&self.stats, self.incoming_len()), + } + } + } + + /// Rebuild the server TLS config against the new identity, swap it into the + /// quinn endpoint, and evict the inbound table so peers re-handshake. + fn apply_identity_change(&mut self, snap: Arc) { + let server_config = new_server_config(snap.cert.clone(), snap.key.clone_key(), self.alpn); + self.endpoint.set_server_config(Some(server_config)); + // Bump first so any in-flight accept that completes after this point is + // dropped at the event boundary (its event carries the old generation). + self.generation = self.generation.wrapping_add(1); + let evicted = self + .incoming + .drain() + .flat_map(|(_, conns)| conns) + .inspect(|conn| close_codes::IDENTITY_ROTATED.close(conn)) + .count() as u64; + self.stats + .connection_evicted_identity_rotated + .fetch_add(evicted, Ordering::Relaxed); + info!( + "inbound identity rotated to {} ({} connection(s) evicted)", + snap.pubkey, evicted + ); + } + + /// Apply a connection-lifecycle result. + fn handle_event(&mut self, event: InboundEvent) { + let generation = match &event { + InboundEvent::Accepted { generation, .. } | InboundEvent::Closed { generation, .. } => { + *generation + } + }; + if generation != self.generation { + // Close any live connection carried by a stale-generation event so a + // connection authenticated under a now-rotated identity is not leaked. + if let InboundEvent::Accepted { conn, .. } = event { + close_codes::IDENTITY_ROTATED.close(&conn); + self.stats + .connection_evicted_identity_rotated + .fetch_add(1, Ordering::Relaxed); + } + return; + } + match event { + InboundEvent::Accepted { peer, conn, .. } => self.maybe_admit_inbound(peer, conn), + InboundEvent::Closed { + peer, stable_id, .. + } => self.reap_incoming(&peer, stable_id), + } + } + + /// Performs the non-expensive checks to handle incoming connections. 
+ /// Then spawns the statemachine to handle the handshake and serve connection. + fn maybe_accept_connection(&mut self, incoming: Incoming) { + let remote_addr = incoming.remote_address(); + if remote_addr.is_ipv6() || remote_addr.ip().is_multicast() { + incoming.ignore(); + return; + } + // TODO: add Retry challenge here. + let ip = remote_addr.ip(); + // Apply rate-limit before spending TLS CPU. + // Loopback is exempt (for local-cluster / tests). + if !ip.is_loopback() && self.handshake_global_limiter.consume_tokens(1).is_err() { + add(&self.stats.handshake_rejected_global_limit); + incoming.ignore(); + return; + } + tokio::spawn( + ServerConnection { + incoming, + generation: self.generation, + events: self.events_tx.clone(), + stats: self.stats.clone(), + } + .run(), + ); + } + + /// Admission checks for a freshly handshaked inbound (we-accepted, + /// receive-only) connection. The split-direction model has no lex-pubkey + /// tiebreaker: we accept an inbound from any admitted peer regardless of + /// pubkey ordering, and install it into the receive-only `incoming` map. + fn maybe_admit_inbound(&mut self, peer: Pubkey, conn: Connection) { + if self.banlist.is_banned(&peer) { + close_codes::BANNED.close(&conn); + record_error(&Error::Banned(peer), &self.stats); + return; + } + if !self.allowlist.allow(&peer) { + close_codes::NOT_ADMITTED.close(&conn); + record_error(&Error::NotAdmitted(peer), &self.stats); + return; + } + let remote_addr = conn.remote_address(); + if self.insert_inbound(peer, conn.clone()).is_err() { + close_codes::TABLE_FULL.close(&conn); + record_error(&Error::TableFull, &self.stats); + return; + } + self.stats.record_connection_count(self.incoming_len()); + // The read loop reports [`InboundEvent::Closed`] when it exits so this + // loop can reap the table slot. 
+ tokio::spawn(read_datagram_loop( + conn, + peer, + remote_addr, + self.generation, + self.ingress.clone(), + self.allowlist.clone(), + self.banlist.clone(), + self.events_tx.clone(), + self.stats.clone(), + )); + } +} + +#[cfg(test)] +mod tests { + use { + crate::datagram_transport::{ + MAX_DATAGRAMS_PER_SECOND_PER_PEER, PEER_RATE_LIMIT_BURST, + allowlist::{AllowAll, StakedNodesAllowlist}, + endpoint::Datagram, + testutils::{ + drain_matching, make_runtime, recv_until, send_until_received, spawn_node, + }, + }, + bytes::Bytes, + solana_keypair::{Keypair, Signer}, + std::{collections::HashMap, sync::Arc, time::Duration}, + }; + + #[test] + fn staked_peer_is_admitted_unstaked_is_rejected() { + let rt = make_runtime(); + + let server_kp = Keypair::new(); + let a_kp = Keypair::new(); + let a_pk = a_kp.pubkey(); + let admit_map: HashMap<_, _> = std::iter::once((a_pk, 100u64)).collect(); + let server = spawn_node( + &rt, + Arc::new(StakedNodesAllowlist::new(admit_map)), + server_kp, + ); + + // Client A - admitted by the allowlist. + let client_a = spawn_node(&rt, Arc::new(AllowAll), a_kp); + // Client B - not admitted by the allowlist. + let client_b = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let payload_a = Bytes::from_static(b"hello-from-A"); + send_until_received( + &rt, + &client_a.endpoint, + server.pubkey(), + server.addr, + payload_a.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.peer_pubkey == a_pk && d.message == payload_a).then_some(()), + "server never received payload from admitted peer A", + ); + // send_until_received may have left retry duplicates of payload_a + // in the channel; drain them before asserting B's rejection. 
+ drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == payload_a + }); + + let payload_b = Bytes::from_static(b"hello-from-B"); + rt.block_on(async { + client_b + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: payload_b.clone(), + }) + .await + .expect("egress send B"); + }); + + // Server must close the handshake before any datagram from B is queued. + let bad = server.ingress_rx.recv_timeout(Duration::from_millis(800)); + assert!( + bad.is_err(), + "unstaked peer B's datagram should not reach server ingress, got {bad:?}" + ); + } + + #[test] + fn ban_evicts_existing_and_blocks_rehandshake() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + // No lex tiebreaker in the split-direction model: the client dials, the + // server accepts a receive-only inbound. Any distinct keypair works. + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + // Establish a connection by driving send-until-receive: the trigger + // packet is dropped; the retry through the now-Established slot lands. + let probe = Bytes::from_static(b"probe"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + probe.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == probe).then_some(()), + "first datagram never arrived", + ); + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == probe + }); + + // Ban the client at the server side. Eviction task should close the + // server-side connection; client's read loop sees ConnectionClosed and + // exits, dropping its cache entry on the way out. + server.banlist.ban(client.pubkey(), Duration::from_secs(5)); + + // Best-effort wait for eviction to flush. The eviction task is async; on + // a quiet system it observes the channel within a few ms. 
+ std::thread::sleep(Duration::from_millis(200)); + + // Now have the client send again. The send path dials a new connection + // (server-side cache no longer holds the old one); but the server is + // banning this pubkey, so handshake closes with BANNED and no datagram + // ever reaches ingress. + let again = Bytes::from_static(b"after-ban"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: again.clone(), + }) + .await + .unwrap(); + }); + + let bad = server.ingress_rx.recv_timeout(Duration::from_millis(1500)); + assert!( + bad.is_err(), + "banned client must not deliver datagrams to server; got {bad:?}" + ); + } + + #[test] + /// Two client instances sharing one identity (hot-spare promotion) each + /// dial the server. In the split-direction model with no handover, both + /// inbound connections *coexist* (capped at + /// `MAX_INBOUND_CONNECTIONS_PER_PEER`): neither displaces the other, and + /// nobody is soft-banned. Both clients deliver, and the first keeps + /// delivering after the second connects. + fn two_connections_same_identity_coexist() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let shared = Keypair::new(); + let server_pk = server.pubkey(); + let c1 = spawn_node(&rt, Arc::new(AllowAll), shared.insecure_clone()); + let c2 = spawn_node(&rt, Arc::new(AllowAll), shared.insecure_clone()); + + // C1 establishes its inbound to the server and delivers p1. + let p1 = Bytes::from_static(b"from-c1"); + send_until_received( + &rt, + &c1.endpoint, + server_pk, + server.addr, + p1.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p1).then_some(()), + "server did not receive c1's probe", + ); + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p1 + }); + + // C2 (same identity) brings up a *separate* inbound and delivers p2. 
+ let p2 = Bytes::from_static(b"from-c2"); + send_until_received( + &rt, + &c2.endpoint, + server_pk, + server.addr, + p2.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p2).then_some(()), + "server did not receive c2's probe", + ); + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p2 + }); + + // No handover means no soft-ban on either client's outgoing side. + std::thread::sleep(Duration::from_millis(300)); + assert!( + !c1.banlist.is_banned(&server_pk), + "no handover: c1 must not soft-ban the server" + ); + assert!( + !c2.banlist.is_banned(&server_pk), + "no handover: c2 must not soft-ban the server" + ); + + // C1's connection survived C2 connecting (not displaced): it can still + // deliver on the same outbound. + let p3 = Bytes::from_static(b"from-c1-again"); + send_until_received( + &rt, + &c1.endpoint, + server_pk, + server.addr, + p3.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p3).then_some(()), + "c1 must still deliver after c2's same-identity connection", + ); + } + + #[test] + /// Per-connection receive token bucket. A burst beyond + /// `PEER_RATE_LIMIT_BURST` is *dropped* silently - the + /// bucket itself is the throttle. The connection stays alive and the + /// peer is NOT banned (consensus traffic legitimately bursts above the + /// refill rate during catch-up). + fn burst_exceeding_rate_limit_drops_excess_without_banning() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + // Establish the connection: retry the probe until one lands (first one + // triggers the dial, gets dropped; followers ride the Established slot).
+ let probe = Bytes::from_static(b"probe"); + send_until_received( + &rt, + &client.endpoint, + server.pubkey(), + server.addr, + probe.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == probe).then_some(()), + "first datagram never arrived", + ); + // Drain probe retry duplicates so they don't inflate the post-burst + // delivery count below. + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == probe + }); + + // Blast a burst far in excess of the bucket capacity. Probe retries + // have already consumed some tokens; the (capacity)-th additional + // datagram trips the limiter. + let burst = (PEER_RATE_LIMIT_BURST as usize) * 4; + rt.block_on(async { + for i in 0..burst { + let payload = Bytes::from(format!("burst-{i:04}").into_bytes()); + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: payload, + }) + .await + .unwrap(); + } + }); + + // Let the receiver chew through whatever fits in the bucket. + std::thread::sleep(Duration::from_millis(500)); + + // The peer must NOT have been banned - drop-only semantics. + assert!( + !server.banlist.is_banned(&client.pubkey()), + "client pubkey should NOT be banned by RX rate limit (drop-only semantics)" + ); + + // Drain whatever made it through ingress. The bucket caps how many + // post-probe datagrams can be delivered within the first refill window. + let mut delivered = 0usize; + while server + .ingress_rx + .recv_timeout(Duration::from_millis(50)) + .is_ok() + { + delivered = delivered.saturating_add(1); + } + let cap = PEER_RATE_LIMIT_BURST as usize; + assert!( + delivered <= cap + 2, + "delivered {delivered} datagrams post-probe, exceeds bucket capacity {cap} + slop" + ); + + // After waiting long enough for the bucket to refill, the sender should + // be able to deliver fresh datagrams on the SAME connection (proves the + // connection wasn't torn down). 
The green corridor is the top + // `PEER_RATE_LIMIT_BURST` tokens of a `PEER_RATE_LIMIT_BURST_DOS`-capacity + // bucket, so after a `burst`-sized blast the bucket has to climb back + // above the watermark before it delivers again - roughly + // `(burst - BURST) / refill_rate` seconds. Wait that long, plus margin. + let recover_secs = ((burst as u64).saturating_sub(PEER_RATE_LIMIT_BURST) as f64 + / MAX_DATAGRAMS_PER_SECOND_PER_PEER) + .ceil() as u64; + std::thread::sleep(Duration::from_secs(recover_secs.saturating_add(2))); + let resume = Bytes::from_static(b"after-refill"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: server.pubkey(), + peer_address: server.addr, + message: resume.clone(), + }) + .await + .unwrap(); + }); + recv_until( + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == resume).then_some(()), + "post-refill datagram never arrived - connection may have been torn down", + ); + } +} diff --git a/votor/src/datagram_transport/stats.rs b/votor/src/datagram_transport/stats.rs new file mode 100644 index 00000000000..5bcb643e638 --- /dev/null +++ b/votor/src/datagram_transport/stats.rs @@ -0,0 +1,187 @@ +use { + crate::datagram_transport::error::Error, + quinn::{ConnectionError, SendDatagramError}, + solana_metrics::datapoint_info, + std::sync::atomic::{AtomicU64, Ordering}, +}; + +#[derive(Default)] +pub(crate) struct QuicDatagramStats { + // --- Positive path --- + /// High-water mark of live table entries over the reporting period. + pub(crate) peak_connections: AtomicU64, + /// Datagrams successfully delivered to the ingress channel. + pub(crate) datagrams_received: AtomicU64, + /// Datagrams successfully handed to quinn for transmission. + pub(crate) datagrams_sent: AtomicU64, + /// A live connection ended through an expected teardown path + /// (application/connection close, local close, reset, timeout) + /// This is normal churn, not likely to be a fault. 
+ pub(crate) connection_lost: AtomicU64, + /// Egress dropped because the connection for the peer is not ready. + pub(crate) egress_dropped_dial_in_progress: AtomicU64, + + // --- Connection lifecycle management --- + /// Connections closed because the local identity (TLS cert / pubkey) was + /// rotated. Each rotation evicts every cached connection in one burst. + pub(crate) connection_evicted_identity_rotated: AtomicU64, + /// Live inbound connection closed because the periodic allowlist recheck + /// found the peer no longer admitted (e.g. evicted at an epoch boundary). + pub(crate) connection_evicted_allowlist: AtomicU64, + /// The dialer evicted a cached outbound connection because the caller + /// supplied a new socket addr for the same pubkey (peer moved). + pub(crate) connection_evicted_peer_moved: AtomicU64, + + // --- Error buckets --- + /// A connection failed abnormally: dial setup error, protocol-level + /// connection fault, invalid peer identity, or a non-transient + /// `send_datagram` failure. All are local/config/protocol faults that + /// should not occur in prod for a well-behaved peer. + pub(crate) connect_failed: AtomicU64, + /// Handshake refused because the peer is not permitted: not in the + /// allowlist, or currently banned. + pub(crate) handshake_rejected_unauthorized: AtomicU64, + /// Handshake refused due to a resource limit: per-peer connection cap hit. + pub(crate) handshake_rejected_overload: AtomicU64, + /// Peer's incoming datagram exceeded the per-connection rate. + pub(crate) datagram_rate_limited: AtomicU64, + + // --- Drops on internal channels (distinct backpressure signals) --- + /// A received datagram was dropped because the ingress channel was full. + pub(crate) datagram_ingress_dropped_channel_full: AtomicU64, + + // --- DOS management --- + /// Inbound dropped before the handshake: global handshake rate cap hit + /// (many-IP flood or cluster-wide reconnection storm).
+ pub(crate) handshake_rejected_global_limit: AtomicU64, +} + +#[inline] +pub(crate) fn add(metric: &AtomicU64) { + metric.fetch_add(1, Ordering::Relaxed); +} + +impl QuicDatagramStats { + /// Updates the peak-occupancy after a successful connection install. + pub(crate) fn record_connection_count(&self, count: u64) { + self.peak_connections.fetch_max(count, Ordering::Relaxed); + } +} + +pub(crate) fn record_error(err: &Error, stats: &QuicDatagramStats) { + match err { + Error::EgressChannelClosed | Error::IngressChannelClosed => {} + Error::Connection( + ConnectionError::ApplicationClosed(_) + | ConnectionError::ConnectionClosed(_) + | ConnectionError::LocallyClosed + | ConnectionError::Reset + | ConnectionError::TimedOut, + ) + | Error::SendDatagram(SendDatagramError::ConnectionLost(_)) => add(&stats.connection_lost), + Error::Connect(_) + | Error::Connection( + ConnectionError::TransportError(_) + | ConnectionError::VersionMismatch + | ConnectionError::CidsExhausted, + ) + | Error::InvalidIdentity(_) + | Error::SendDatagram( + SendDatagramError::TooLarge + | SendDatagramError::UnsupportedByPeer + | SendDatagramError::Disabled, + ) => add(&stats.connect_failed), + Error::NotAdmitted(_) | Error::Banned(_) => add(&stats.handshake_rejected_unauthorized), + Error::TableFull => add(&stats.handshake_rejected_overload), + Error::IdentityRotated(_) => add(&stats.connection_evicted_identity_rotated), + Error::Endpoint(_) => {} // construction-time only; no runtime counter + } +} + +macro_rules! swap { + ($m:expr) => { + $m.swap(0, Ordering::Relaxed) as i64 + }; +} + +/// Re-baseline the peak high-water mark to the current occupancy and return +/// the peak observed over the period just ending. +#[inline] +fn take_peak(stats: &QuicDatagramStats, live_connections: u64) -> i64 { + stats + .peak_connections + .swap(live_connections, Ordering::Relaxed) + .max(live_connections) as i64 +} + +/// Emit and reset the outbound counters. 
+pub(crate) fn report_client(stats: &QuicDatagramStats, live_connections: u64) { + datapoint_info!( + "votor_datagram_client", + ("connections_peak", take_peak(stats, live_connections), i64), + ("datagrams_sent", swap!(stats.datagrams_sent), i64), + ("connect_failed", swap!(stats.connect_failed), i64), + ("connection_lost", swap!(stats.connection_lost), i64), + ( + "egress_dropped_dial_in_progress", + swap!(stats.egress_dropped_dial_in_progress), + i64 + ), + ( + "connection_evicted_peer_moved", + swap!(stats.connection_evicted_peer_moved), + i64 + ), + ( + "connection_evicted_identity_rotated", + swap!(stats.connection_evicted_identity_rotated), + i64 + ), + ); +} + +/// Emit and reset the inbound counters. +pub(crate) fn report_server(stats: &QuicDatagramStats, live_connections: u64) { + datapoint_info!( + "votor_datagram_server", + ("connections_peak", take_peak(stats, live_connections), i64), + ("datagrams_received", swap!(stats.datagrams_received), i64), + ("connect_failed", swap!(stats.connect_failed), i64), + ("connection_lost", swap!(stats.connection_lost), i64), + ( + "datagram_rate_limited", + swap!(stats.datagram_rate_limited), + i64 + ), + ( + "datagram_ingress_dropped_channel_full", + swap!(stats.datagram_ingress_dropped_channel_full), + i64 + ), + ( + "handshake_rejected_global_limit", + swap!(stats.handshake_rejected_global_limit), + i64 + ), + ( + "handshake_rejected_unauthorized", + swap!(stats.handshake_rejected_unauthorized), + i64 + ), + ( + "handshake_rejected_overload", + swap!(stats.handshake_rejected_overload), + i64 + ), + ( + "connection_evicted_allowlist", + swap!(stats.connection_evicted_allowlist), + i64 + ), + ( + "connection_evicted_identity_rotated", + swap!(stats.connection_evicted_identity_rotated), + i64 + ), + ); +} diff --git a/votor/src/datagram_transport/testutils.rs b/votor/src/datagram_transport/testutils.rs new file mode 100644 index 00000000000..27f0cc806cd --- /dev/null +++ b/votor/src/datagram_transport/testutils.rs @@ 
-0,0 +1,142 @@ +#![cfg(any(test, feature = "dev-context-only-utils"))] +#![allow(clippy::arithmetic_side_effects)] + +use { + crate::datagram_transport::{ + allowlist::Allowlist, + endpoint::{Datagram, QuicDatagramEndpoint}, + }, + bytes::Bytes, + crossbeam_channel::Receiver, + solana_keypair::{Keypair, Signer}, + solana_net_utils::{banlist::Banlist, sockets::bind_to_localhost_unique}, + solana_pubkey::Pubkey, + std::{net::SocketAddr, sync::Arc, time::Duration}, + tokio::runtime::Runtime, +}; + +pub const TEST_ALPN: &[u8] = b"votor-test"; + +pub struct TestNode { + pub keypair: Keypair, + pub addr: SocketAddr, + pub endpoint: QuicDatagramEndpoint, + pub ingress_rx: Receiver, + pub banlist: Arc>, +} + +impl TestNode { + pub fn pubkey(&self) -> Pubkey { + self.keypair.pubkey() + } +} + +pub fn make_runtime() -> Runtime { + tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("tokio multi-thread runtime") +} + +/// Spawn an endpoint with `allowlist` policy and a caller-supplied keypair. +/// Pass `Keypair::new()` when the identity is irrelevant; pass a specific one +/// when the test needs a particular pubkey. +pub fn spawn_node(rt: &Runtime, allowlist: Arc, keypair: Keypair) -> TestNode { + let socket = bind_to_localhost_unique().expect("bind UDP socket"); + let addr = socket.local_addr().expect("local addr"); + let (ingress_tx, ingress_rx) = crossbeam_channel::bounded(4096); + let banlist = Arc::new(Banlist::::default()); + let endpoint = QuicDatagramEndpoint::new( + rt.handle(), + &keypair, + socket, + TEST_ALPN, + ingress_tx, + allowlist, + banlist.clone(), + ) + .expect("QuicDatagramEndpoint::new"); + TestNode { + keypair, + addr, + endpoint, + ingress_rx, + banlist, + } +} + +/// Pull from `rx` until `cond` returns `Some(T)` or `timeout` elapses. Panics +/// with `msg` on timeout. 
+pub fn recv_until( + rx: &Receiver, + timeout: Duration, + mut cond: impl FnMut(&Datagram) -> Option, + msg: &str, +) -> T { + let deadline = std::time::Instant::now() + timeout; + while std::time::Instant::now() < deadline { + let remaining = deadline.saturating_duration_since(std::time::Instant::now()); + match rx.recv_timeout(remaining) { + Ok(item) => { + if let Some(t) = cond(&item) { + return t; + } + } + Err(_) => break, + } + } + panic!("{msg}"); +} + +/// Drive a send + receive round-trip, re-sending the payload every +/// `POLL_INTERVAL` until `cond` returns `Some(T)` or `timeout` elapses. +pub fn send_until_received( + rt: &Runtime, + endpoint: &QuicDatagramEndpoint, + target_pk: Pubkey, + target_addr: SocketAddr, + payload: Bytes, + rx: &Receiver, + timeout: Duration, + mut cond: impl FnMut(&Datagram) -> Option, + msg: &str, +) -> T { + const POLL_INTERVAL: Duration = Duration::from_millis(100); + let deadline = std::time::Instant::now() + timeout; + while std::time::Instant::now() < deadline { + rt.block_on(async { + let _ = endpoint.egress.try_send(Datagram { + peer_pubkey: target_pk, + peer_address: target_addr, + message: payload.clone(), + }); + }); + if let Ok(item) = rx.recv_timeout(POLL_INTERVAL) + && let Some(t) = cond(&item) + { + return t; + } + } + panic!("{msg}"); +} + +/// Drain any backlog from `rx` that matches `pred` within `timeout`. +/// Used after [`send_until_received`] when the test needs to assert +/// "no further packets" - by then `rx` may hold retry duplicates that +/// landed after the dial completed. 
+pub fn drain_matching( + rx: &Receiver, + timeout: Duration, + mut pred: impl FnMut(&Datagram) -> bool, +) { + let deadline = std::time::Instant::now() + timeout; + while std::time::Instant::now() < deadline { + let remaining = deadline.saturating_duration_since(std::time::Instant::now()); + match rx.recv_timeout(remaining.min(Duration::from_millis(50))) { + Ok(item) if pred(&item) => continue, + Ok(_) => break, + Err(_) => break, + } + } +} diff --git a/votor/src/datagram_transport/transport.rs b/votor/src/datagram_transport/transport.rs new file mode 100644 index 00000000000..332546344af --- /dev/null +++ b/votor/src/datagram_transport/transport.rs @@ -0,0 +1,155 @@ +//! Transport tuning constants for a votor-style workload. +use { + crate::datagram_transport::MAX_PEERS, + quinn::{ + AckFrequencyConfig, ClientConfig, IdleTimeout, ServerConfig, TransportConfig, VarInt, + congestion::{Controller, ControllerFactory}, + crypto::rustls::{QuicClientConfig, QuicServerConfig}, + }, + quinn_proto::RttEstimator, + rustls::pki_types::{CertificateDer, PrivateKeyDer}, + solana_keypair::{Keypair, Signer}, + solana_pubkey::Pubkey, + solana_tls_utils::{ + new_dummy_x509_certificate, tls_client_config_builder, tls_server_config_builder, + }, + std::{ + any::Any, + sync::Arc, + time::{Duration, Instant}, + }, +}; + +/// Identity material derived from a keypair: the ed25519 pubkey plus the +/// self-signed TLS cert/key that the endpoint presents to peers. +pub(crate) struct IdentitySnapshot { + pub pubkey: Pubkey, + pub cert: CertificateDer<'static>, + pub key: PrivateKeyDer<'static>, +} + +impl IdentitySnapshot { + pub fn from_keypair(keypair: &Keypair) -> Self { + let (cert, key) = new_dummy_x509_certificate(keypair); + Self { + pubkey: keypair.pubkey(), + cert, + key, + } + } +} + +/// Receive datagram buffer (per endpoint, total across all peers). +/// Provisions enough room for 1 full slot at max load. 
+const DATAGRAM_RECEIVE_BUFFER: usize = MAX_PEERS as usize * 8; +/// Send datagram buffer (per endpoint). Not too huge since we are +/// supposed to operate in realtime. +const DATAGRAM_SEND_BUFFER: usize = MAX_PEERS as usize * 2; +/// Close connections after this much time without feedback from peer. +/// This should be more than reasonable max time between PING (or data) +/// frames arriving from the peer. We also want to keep this rather short +/// to make sure we reinitiate connections that are truly stuck. +pub(crate) const MAX_IDLE_TIMEOUT: Duration = Duration::from_secs(5); +/// QUIC keep-alive heartbeat. Must be << `MAX_IDLE_TIMEOUT` to avoid +/// connections dying when no vote traffic is available, and > MAX_ACK_DELAY +/// to make sure we actually get ACKs for every keepalive we send. +const KEEP_ALIVE_INTERVAL: Duration = Duration::from_millis(600); +/// `max_ack_delay` we request the peer to use via the QUIC ACK Frequency +/// extension (RFC 9799). Loosens the peer's ACK cadence to cut +/// reverse-direction packet rate. Set this ~ 1 slot to avoid ACKs during +/// normal votor operation. +const MAX_ACK_DELAY: Duration = Duration::from_millis(400); +/// Initial MTU before path discovery. Floor of 1280 matches QUIC spec. +const INITIAL_MTU: u16 = 1280; +/// Minimum MTU after path discovery. 
+const MIN_MTU: u16 = 1280; + +/// Allow this many bytes to be in flight towards any other peer +const NOP_CONGESTION_WINDOW: u64 = 8 * 1024 * 1024; + +#[derive(Clone)] +struct NopCongestion; + +impl Controller for NopCongestion { + fn on_congestion_event(&mut self, _: Instant, _: Instant, _: bool, _: u64) {} + fn on_ack(&mut self, _: Instant, _: Instant, _: u64, _: bool, _: &RttEstimator) {} + fn on_mtu_update(&mut self, _: u16) {} + fn window(&self) -> u64 { + NOP_CONGESTION_WINDOW + } + fn initial_window(&self) -> u64 { + NOP_CONGESTION_WINDOW + } + fn clone_box(&self) -> Box { + Box::new(self.clone()) + } + fn into_any(self: Box) -> Box { + self + } +} + +impl ControllerFactory for NopCongestion { + fn build(self: Arc, _: Instant, _: u16) -> Box { + Box::new(NopCongestion) + } +} + +pub(crate) fn new_transport_config() -> TransportConfig { + let max_idle = + IdleTimeout::try_from(MAX_IDLE_TIMEOUT).expect("MAX_IDLE_TIMEOUT fits IdleTimeout"); + let mut ack_freq = AckFrequencyConfig::default(); + ack_freq.max_ack_delay(Some(MAX_ACK_DELAY)); + // prevent acks from being elicited by packet count + ack_freq.ack_eliciting_threshold(VarInt::from_u32(512)); + // disable reordering notifications + ack_freq.reordering_threshold(VarInt::from_u32(0)); + let mut c = TransportConfig::default(); + c.datagram_receive_buffer_size(Some(DATAGRAM_RECEIVE_BUFFER)) + .datagram_send_buffer_size(DATAGRAM_SEND_BUFFER) + .initial_mtu(INITIAL_MTU) + .min_mtu(MIN_MTU) + .mtu_discovery_config(None) + .keep_alive_interval(Some(KEEP_ALIVE_INTERVAL)) + .max_idle_timeout(Some(max_idle)) + .ack_frequency_config(Some(ack_freq)) + .congestion_controller_factory(Arc::new(NopCongestion)) + // Datagrams only - disable streams entirely. + .max_concurrent_bidi_streams(VarInt::from(0u8)) + .max_concurrent_uni_streams(VarInt::from(0u8)); + c +} + +/// Build the rustls + quinn server config. 
+pub(crate) fn new_server_config( + cert: CertificateDer<'static>, + key: PrivateKeyDer<'static>, + alpn: &[u8], +) -> ServerConfig { + let mut tls = tls_server_config_builder() + .with_single_cert(vec![cert], key) + .expect("rustls accepts our self-signed solana cert/key pair"); + tls.alpn_protocols = vec![alpn.to_vec()]; + let quic = QuicServerConfig::try_from(tls) + .expect("TLS 1.3-only config yields an initial cipher suite"); + let mut cfg = ServerConfig::with_crypto(Arc::new(quic)); + cfg.transport_config(Arc::new(new_transport_config())) + .migration(false); + cfg +} + +/// Build the rustls + quinn client config. +pub(crate) fn new_client_config( + cert: CertificateDer<'static>, + key: PrivateKeyDer<'static>, + alpn: &[u8], +) -> ClientConfig { + let mut tls = tls_client_config_builder() + .with_client_auth_cert(vec![cert], key) + .expect("rustls accepts our solana cert/key pair"); + tls.enable_early_data = true; + tls.alpn_protocols = vec![alpn.to_vec()]; + let quic = QuicClientConfig::try_from(tls).expect("TLS config should be valid"); + let mut cfg = ClientConfig::new(Arc::new(quic)); + cfg.transport_config(Arc::new(new_transport_config())); + cfg +} diff --git a/votor/src/lib.rs b/votor/src/lib.rs index e3cf0a52fa9..67be461795b 100644 --- a/votor/src/lib.rs +++ b/votor/src/lib.rs @@ -9,6 +9,7 @@ pub mod common; pub mod consensus_metrics; pub mod consensus_pool; mod consensus_pool_service; +pub mod datagram_transport; pub mod event; mod event_handler; pub mod generated_cert_types; diff --git a/votor/tests/datagram_transport.rs b/votor/tests/datagram_transport.rs new file mode 100644 index 00000000000..6a0ce0cc9dd --- /dev/null +++ b/votor/tests/datagram_transport.rs @@ -0,0 +1,367 @@ +//! End-to-end integration tests for the votor datagram endpoint. +//! +//! Covers three behaviors that need real quinn endpoints over loopback: +//! * `Dialing` placeholder cleanup on dial failure / identity mismatch. +//! 
* Caller-driven addr refresh while a dial is still in flight. +//! * Split-direction delivery at scale (no duplicate delivery). + +use { + agave_votor::datagram_transport::{ + allowlist::AllowAll, + endpoint::Datagram, + testutils::{TestNode, drain_matching, make_runtime, send_until_received, spawn_node}, + }, + bytes::Bytes, + solana_keypair::Keypair, + solana_net_utils::sockets::bind_to_localhost_unique, + solana_pubkey::Pubkey, + std::{collections::HashMap, net::SocketAddr, sync::Arc, time::Duration}, +}; + +// Dial to a blackhole address times out. On failure the `Dialing` placeholder is +// removed so a follow-up egress to the same pubkey at a working addr +// can spawn a fresh dial and reach the server. +#[test] +fn failed_connect_clears_placeholder() { + let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let s_pubkey = server.pubkey(); + + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + // Pretend gossip published a bogus addr for `s_pubkey`. The dial will + // fail; if the placeholder is not cleaned up, the follow-up below + // would see `Occupied(Dialing)` and drop forever. + let blackhole = bind_to_localhost_unique().expect("bind blackhole socket"); + let bogus_addr = blackhole.local_addr().expect("blackhole addr"); + + let p1 = Bytes::from_static(b"p1-blackhole"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: bogus_addr, + message: p1.clone(), + }) + .await + .unwrap(); + }); + + // Give the dial enough time to fail. 
+ std::thread::sleep(Duration::from_secs(10)); + + let p2 = Bytes::from_static(b"p2-real"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + server.addr, + p2.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p2).then_some(()), + "server did not receive the retry - placeholder not cleared after dial failure", + ); + + // p1 went to the blackhole; server should never see it. Drain p2 + // retry duplicates, then assert no foreign packet arrives. + drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p2 + }); + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?}; p1 should have been lost to the blackhole", + ); + drop(blackhole); +} + +/// Dial to a server whose attested identity differs from the pubkey +/// the caller targeted: the client closes the conn with INVALID_IDENTITY +/// and removes its `Dialing` placeholder. A follow-up egress with the +/// correct pubkey/addr must succeed. +#[test] +fn identity_mismatch_clears_placeholder() { + let rt = make_runtime(); + + // Two servers with distinct keypairs at distinct addrs. We send to + // (s2_pk, s1.addr) - quinn handshake succeeds, then the attested pk + // (s1's) does not match the caller-requested s2_pk → INVALID_IDENTITY. + let s1 = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let s2 = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let s2_pk = s2.pubkey(); + let s2_addr = s2.addr; + let bad = Bytes::from_static(b"to-wrong-addr"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s2_pk, + peer_address: s1.addr, + message: bad.clone(), + }) + .await + .unwrap(); + }); + + // Give the bogus dial time to complete its handshake, observe the + // identity mismatch, and clear the placeholder. 
+ std::thread::sleep(Duration::from_secs(1)); + + // Same pk, now at the real addr. Must succeed. + let good = Bytes::from_static(b"to-right-addr"); + send_until_received( + &rt, + &client.endpoint, + s2_pk, + s2_addr, + good.clone(), + &s2.ingress_rx, + Duration::from_secs(5), + |d| (d.message == good).then_some(()), + "S2 did not receive retry - placeholder not cleared after INVALID_IDENTITY", + ); + + // S1 must not see either datagram. The bogus one was closed before + // any data flowed; the good one targeted s2. + let stray = s1.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!(stray.is_err(), "S1 unexpectedly received {stray:?}",); +} + +// --------------------------------------------------------------------------- +// Caller-driven addr refresh: when a caller (e.g. votor via a gossip refresh) +// hands the endpoint a new SocketAddr for a pubkey already in the outgoing +// table, the dialer must evict its stale outbound connection and re-dial the +// new addr. Inbound (we-accepted) connections are receive-only and untouched by +// this path - a peer's NAT-mapped source addr can legitimately differ from its +// gossip-published one. +// --------------------------------------------------------------------------- + +#[test] +fn egress_during_dial_dropped_addr_recovers_after_timeout() { + // Mid-dial address change: while a dial to addr A1 is still in flight + // (blackhole'd, will time out), the caller queues an egress with a + // different addr A2 for the same pubkey. The state-machine drops it + // (placeholder is `Dialing`, not `Established`, so the addr-mismatch + // eviction path doesn't apply). Once the first dial times out and + // clears the placeholder, a fresh egress to A2 must succeed. 
+ let rt = make_runtime(); + let server = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + let s_pubkey = server.pubkey(); + let client = spawn_node(&rt, Arc::new(AllowAll), Keypair::new()); + + let blackhole = bind_to_localhost_unique().expect("blackhole socket"); + let blackhole_addr = blackhole.local_addr().expect("blackhole addr"); + + // 1. start dial to blackhole - server's pubkey, wrong addr. + let p1 = Bytes::from_static(b"dial-to-blackhole"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: blackhole_addr, + message: p1.clone(), + }) + .await + .unwrap(); + }); + // Give the dial a beat to install its `Dialing` placeholder. + std::thread::sleep(Duration::from_millis(200)); + + // 2. while `Dialing`, send to the REAL addr. Should be dropped (we do + // not buffer; one dial task per peer at a time). + let p2 = Bytes::from_static(b"during-dial-dropped"); + rt.block_on(async { + client + .endpoint + .egress + .send(Datagram { + peer_pubkey: s_pubkey, + peer_address: server.addr, + message: p2.clone(), + }) + .await + .unwrap(); + }); + // Confirm server does NOT see p2 within the dial-in-progress window. + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?} while dial-in-flight" + ); + + // 3. wait for the blackhole dial to time out, clearing the placeholder. + std::thread::sleep(Duration::from_secs(8)); + + // 4. retry the real addr - must succeed. + let p3 = Bytes::from_static(b"post-timeout-retry"); + send_until_received( + &rt, + &client.endpoint, + s_pubkey, + server.addr, + p3.clone(), + &server.ingress_rx, + Duration::from_secs(5), + |d| (d.message == p3).then_some(()), + "server did not receive post-timeout retry", + ); + + // p1 and p2 never reach the server: p1 went to the blackhole, p2 was + // dropped while `Dialing`. Drain p3 retry duplicates, then assert + // no foreign packet arrives. 
+ drain_matching(&server.ingress_rx, Duration::from_millis(200), |d| { + d.message == p3 + }); + let stray = server.ingress_rx.recv_timeout(Duration::from_millis(500)); + assert!( + stray.is_err(), + "server unexpectedly received {stray:?} after the retry landed", + ); + drop(blackhole); +} + +// --------------------------------------------------------------------------- +// Split-direction delivery at scale. +// --------------------------------------------------------------------------- + +#[test] +/// Verifies the split-direction model delivers each payload exactly once at +/// scale: every ordered pair (i -> j) carries i's traffic on exactly one +/// connection (i's outbound to j's inbound), so there are no duplicates. +/// +/// Setup: N=32 nodes all mutually-admitted, every node sends to every other +/// at the same time. +/// +/// Assertion: after warm-up settles, a follow-up "burst" of one unique +/// payload per (sender, receiver) pair must arrive exactly once at every +/// receiver. +fn split_direction_no_duplicates_after_settle() { + const N: usize = 32; + let rt = make_runtime(); + let nodes: Vec = (0..N) + .map(|_| spawn_node(&rt, Arc::new(AllowAll), Keypair::new())) + .collect(); + let pubkeys: Vec = nodes.iter().map(|n| n.pubkey()).collect(); + let addrs: Vec = nodes.iter().map(|n| n.addr).collect(); + + // Warm-up: every node concurrently sends one payload to every other - + // provokes the simultaneous-dial race for every pair. 
+ rt.block_on(async { + let mut tasks = Vec::new(); + for (i, n) in nodes.iter().enumerate() { + let egress = n.endpoint.egress.clone(); + let targets: Vec = (0..N) + .filter(|&j| j != i) + .map(|j| Datagram { + peer_pubkey: pubkeys[j], + peer_address: addrs[j], + message: Bytes::from_static(b"warmup"), + }) + .collect(); + tasks.push(tokio::spawn(async move { + for t in targets { + let _ = egress.send(t).await; + } + })); + } + for t in tasks { + let _ = t.await; + } + }); + + // Allow warm-up dials/handshakes to settle (localhost: microseconds each) + // plus task scheduling. Generous slack. + std::thread::sleep(Duration::from_secs(2)); + + // Drain warm-up ingress so it can't pollute the burst-phase counts. + for n in &nodes { + while n.ingress_rx.try_recv().is_ok() {} + } + + // Burst: one unique payload per ordered pair. With clean dedup each + // receiver sees each sender's payload exactly once. + rt.block_on(async { + let mut tasks = Vec::new(); + for (i, n) in nodes.iter().enumerate() { + let egress = n.endpoint.egress.clone(); + let targets: Vec = (0..N) + .filter(|&j| j != i) + .map(|j| { + let payload = Bytes::copy_from_slice(format!("burst-{i}-{j}").as_bytes()); + Datagram { + peer_pubkey: pubkeys[j], + peer_address: addrs[j], + message: payload, + } + }) + .collect(); + tasks.push(tokio::spawn(async move { + for t in targets { + let _ = egress.send(t).await; + } + })); + } + for t in tasks { + let _ = t.await; + } + }); + + std::thread::sleep(Duration::from_secs(1)); + + let mut duplicates = Vec::new(); + let mut losses = Vec::new(); + for (j, n) in nodes.iter().enumerate() { + let mut counts: HashMap = HashMap::new(); + while let Ok(d) = n.ingress_rx.try_recv() { + let s = String::from_utf8_lossy(&d.message).into_owned(); + *counts.entry(s).or_insert(0) += 1; + } + for i in 0..N { + if i == j { + continue; + } + let key = format!("burst-{i}-{j}"); + let c = counts.get(&key).copied().unwrap_or(0); + match c { + 1 => {} + 0 => losses.push(format!("node 
{j}: lost '{key}'")), + _ => duplicates.push(format!("node {j}: got {c} copies of '{key}'")), + } + } + } + + let total_pairs = N * (N - 1); + assert!( + duplicates.is_empty(), + "{}/{} pairs duplicated (split-direction dedup failed):\n{}", + duplicates.len(), + total_pairs, + duplicates + .iter() + .take(10) + .cloned() + .collect::>() + .join("\n") + ); + assert!( + losses.is_empty(), + "{}/{} pairs lost their burst datagram (warm-up did not establish a connection):\n{}", + losses.len(), + total_pairs, + losses + .iter() + .take(10) + .cloned() + .collect::>() + .join("\n") + ); +} From eff5a573fa3caa65cfc534fba464e0ad4a27b461 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:48:09 +0000 Subject: [PATCH 2/6] votor: add datagram endpoint plumbing --- votor/src/datagram_endpoint.rs | 40 ++++++++++++++++++++++++++++++++++ votor/src/lib.rs | 1 + 2 files changed, 41 insertions(+) create mode 100644 votor/src/datagram_endpoint.rs diff --git a/votor/src/datagram_endpoint.rs b/votor/src/datagram_endpoint.rs new file mode 100644 index 00000000000..c7675eda851 --- /dev/null +++ b/votor/src/datagram_endpoint.rs @@ -0,0 +1,40 @@ +//! Votor wrapper for [`crate::datagram_transport::QuicDatagramEndpoint`]. + +use { + crate::datagram_transport::allowlist::StakedNodesAllowlist, + solana_pubkey::Pubkey, + solana_runtime::bank_forks::SharableBanks, + std::{collections::HashMap, sync::Arc}, +}; + +/// ALPN identifier negotiated on every alpenglow QUIC handshake. Pass this as +/// the `alpn_protocol_id` when constructing the votor +/// [`crate::datagram_transport::endpoint::QuicDatagramEndpoint`]. +pub const ALPENGLOW_ALPN: &[u8] = b"alpenglow-v1"; + +/// Build the allowlist map (pubkey → epoch stake) for validators with +/// positive stake in the working bank's current epoch. 
This is the +/// canonical allowlist; callers seed `StakedNodesAllowlist` from +/// this at construction and then drive subsequent epoch-boundary +/// refreshes from [`crate::staked_validators_cache::StakedValidatorsCache`]. +/// +/// Takes a [`SharableBanks`] so the working bank is read lock-free (no +/// `BankForks` `RwLock` acquisition). +pub fn current_admit_set(banks: &SharableBanks) -> HashMap { + let bank = banks.working(); + let epoch = bank.epoch(); + bank.epoch_staked_nodes(epoch) + .map(|m| { + m.iter() + .filter(|(_, stake)| **stake > 0) + .map(|(pk, stake)| (*pk, *stake)) + .collect() + }) + .unwrap_or_default() +} + +/// Build a fresh [`StakedNodesAllowlist`] seeded with the current +/// epoch's staked-set. +pub fn build_allowlist(banks: &SharableBanks) -> Arc { + Arc::new(StakedNodesAllowlist::new(current_admit_set(banks))) +} diff --git a/votor/src/lib.rs b/votor/src/lib.rs index 67be461795b..46697f318ed 100644 --- a/votor/src/lib.rs +++ b/votor/src/lib.rs @@ -9,6 +9,7 @@ pub mod common; pub mod consensus_metrics; pub mod consensus_pool; mod consensus_pool_service; +pub mod datagram_endpoint; pub mod datagram_transport; pub mod event; mod event_handler; From 366b2a98fc2a05208368e0dff157f2798a3ab6c6 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:49:08 +0000 Subject: [PATCH 3/6] core: support datagram BLS sigverify input --- core/src/bls_sigverify/bls_cert_sigverify.rs | 11 +- core/src/bls_sigverify/bls_sigverifier.rs | 136 ++++++++++++++++--- core/src/bls_sigverify/bls_vote_sigverify.rs | 9 +- core/src/tvu.rs | 9 +- 4 files changed, 134 insertions(+), 31 deletions(-) diff --git a/core/src/bls_sigverify/bls_cert_sigverify.rs b/core/src/bls_sigverify/bls_cert_sigverify.rs index 63aa9554024..71c42318ebd 100644 --- a/core/src/bls_sigverify/bls_cert_sigverify.rs +++ b/core/src/bls_sigverify/bls_cert_sigverify.rs @@ -1,5 +1,9 @@ use { - super::{bls_sigverifier::BAN_TIMEOUT, errors::SigVerifyCertError, stats::SigVerifyCertStats}, + super::{ + 
bls_sigverifier::{BAN_TIMEOUT, SigVerifierBanlist}, + errors::SigVerifyCertError, + stats::SigVerifyCertStats, + }, crate::bls_sigverify::{bls_sigverifier::NUM_SLOTS_FOR_VERIFY, utils::send_certs_to_pool}, agave_bls_cert_verify::cert_verify::Error as BlsCertVerifyError, agave_votor_messages::{ @@ -16,7 +20,6 @@ use { solana_measure::measure::Measure, solana_pubkey::Pubkey, solana_runtime::bank::Bank, - solana_streamer::nonblocking::simple_qos::SimpleQosBanlist, std::{collections::HashSet, num::NonZeroU64}, thiserror::Error, }; @@ -53,7 +56,7 @@ pub(super) fn verify_and_send_certificates( certs: Vec, root_bank: &Bank, channel_to_pool: &Sender, - banlist: &SimpleQosBanlist, + banlist: &SigVerifierBanlist, thread_pool: &ThreadPool, ) -> Result { for cert in certs.iter().map(|cert_payload| &cert_payload.cert) { @@ -95,7 +98,7 @@ fn verify_certs( root_bank: &Bank, verified_certs_set: &mut HashSet, stats: &mut SigVerifyCertStats, - banlist: &SimpleQosBanlist, + banlist: &SigVerifierBanlist, thread_pool: &ThreadPool, ) -> SigVerifiedBatch { let verified = thread_pool.install(|| { diff --git a/core/src/bls_sigverify/bls_sigverifier.rs b/core/src/bls_sigverify/bls_sigverifier.rs index fb620beb366..e7e82d4e229 100644 --- a/core/src/bls_sigverify/bls_sigverifier.rs +++ b/core/src/bls_sigverify/bls_sigverifier.rs @@ -11,6 +11,7 @@ use { block_creation_loop::rewards::{certs_builder::wants_vote, msg_types::AddVoteMessage}, cluster_info_vote_listener::VerifiedVoterSlotsSender, }, + agave_votor::datagram_transport::endpoint::Datagram, agave_votor::{ consensus_metrics::ConsensusMetricsEventSender, generated_cert_types::GeneratedCertTypes, }, @@ -26,9 +27,11 @@ use { solana_gossip::cluster_info::ClusterInfo, solana_ledger::leader_schedule_cache::LeaderScheduleCache, solana_measure::measure_us, + solana_net_utils::banlist::Banlist, + solana_perf::packet::{BytesPacket, Meta, PacketBatch}, solana_pubkey::Pubkey, solana_runtime::{bank::Bank, bank_forks::SharableBanks}, - 
solana_streamer::{nonblocking::simple_qos::SimpleQosBanlist, packet::PacketBatch}, + solana_streamer::nonblocking::simple_qos::SimpleQosBanlist, std::{ collections::HashSet, sync::{ @@ -50,9 +53,55 @@ pub(super) const NUM_SLOTS_FOR_VERIFY: Slot = 90_000; /// We ban the sender for 2 days which roughly corresponds to an epoch pub(super) const BAN_TIMEOUT: Duration = Duration::from_hours(48); +#[derive(Clone)] +pub(crate) enum SigVerifierBanlist { + Stream(Arc), + Datagram(Arc>), +} + +impl SigVerifierBanlist { + pub(crate) fn ban(&self, pubkey: Pubkey, timeout: Duration) -> bool { + match self { + Self::Stream(banlist) => banlist.ban(pubkey, timeout), + Self::Datagram(banlist) => banlist.ban(pubkey, timeout), + } + } + + #[cfg(test)] + fn is_banned(&self, pubkey: &Pubkey) -> bool { + match self { + Self::Stream(banlist) => banlist.is_banned(pubkey), + Self::Datagram(banlist) => banlist.is_banned(pubkey), + } + } +} + +impl From> for SigVerifierBanlist { + fn from(banlist: Arc) -> Self { + Self::Stream(banlist) + } +} + +impl From>> for SigVerifierBanlist { + fn from(banlist: Arc>) -> Self { + Self::Datagram(banlist) + } +} + +pub(crate) enum SigVerifierPacketReceiver { + Stream(Receiver), + Datagram(Receiver), +} + +impl From> for SigVerifierPacketReceiver { + fn from(receiver: Receiver) -> Self { + Self::Datagram(receiver) + } +} + pub(crate) struct SigVerifierContext { pub(crate) migration_status: Arc, - pub(crate) banlist: Arc, + pub(crate) banlist: SigVerifierBanlist, pub(crate) sharable_banks: SharableBanks, pub(crate) cluster_info: Arc, pub(crate) leader_schedule: Arc, @@ -61,7 +110,7 @@ pub(crate) struct SigVerifierContext { } pub(crate) struct SigVerifierChannels { - pub(crate) packet_receiver: Receiver, + pub(crate) packet_receiver: SigVerifierPacketReceiver, pub(crate) channel_to_repair: VerifiedVoterSlotsSender, pub(crate) channel_to_reward: Sender, pub(crate) channel_to_pool: Sender, @@ -84,7 +133,7 @@ pub(crate) fn spawn_service( struct SigVerifier { 
migration_status: Arc, - banlist: Arc, + banlist: SigVerifierBanlist, channels: SigVerifierChannels, /// Container to look up root banks from. sharable_banks: SharableBanks, @@ -294,31 +343,78 @@ impl SigVerifier { } } -/// Receives a `Vec` from the `receiver` while adhering to the `soft_receive_cap` limit. +fn datagram_to_batch(datagram: Datagram) -> PacketBatch { + let Datagram { + peer_pubkey, + peer_address, + message, + } = datagram; + let mut meta = Meta::default(); + meta.size = message.len(); + meta.set_socket_addr(&peer_address); + meta.set_remote_pubkey(peer_pubkey); + PacketBatch::Single(BytesPacket::new(message, meta)) +} + +/// Receives packet batches from the configured BLS receiver while adhering to +/// the `soft_receive_cap` limit. /// /// Returns `Err(())` if the channel disconnected. fn recv_batches( + receiver: &SigVerifierPacketReceiver, + soft_receive_cap: usize, +) -> Result, ()> { + match receiver { + SigVerifierPacketReceiver::Stream(receiver) => { + recv_stream_batches(receiver, soft_receive_cap) + } + SigVerifierPacketReceiver::Datagram(receiver) => { + recv_datagram_batches(receiver, soft_receive_cap) + } + } +} + +fn recv_stream_batches( receiver: &Receiver, soft_receive_cap: usize, ) -> Result, ()> { let batch = match receiver.recv_timeout(Duration::from_secs(1)) { Ok(b) => b, Err(e) => match e { - RecvTimeoutError::Timeout => { - return Ok(vec![]); - } - RecvTimeoutError::Disconnected => { - return Err(()); - } + RecvTimeoutError::Timeout => return Ok(vec![]), + RecvTimeoutError::Disconnected => return Err(()), }, }; let mut batches = Vec::with_capacity(soft_receive_cap); batches.push(batch); while batches.len() < soft_receive_cap { match receiver.try_recv() { - Ok(b) => { - batches.push(b); - } + Ok(b) => batches.push(b), + Err(e) => match e { + TryRecvError::Empty => return Ok(batches), + TryRecvError::Disconnected => return Err(()), + }, + } + } + Ok(batches) +} + +fn recv_datagram_batches( + receiver: &Receiver, + 
soft_receive_cap: usize, +) -> Result, ()> { + let batch = match receiver.recv_timeout(Duration::from_secs(1)) { + Ok(b) => datagram_to_batch(b), + Err(e) => match e { + RecvTimeoutError::Timeout => return Ok(vec![]), + RecvTimeoutError::Disconnected => return Err(()), + }, + }; + let mut batches = Vec::with_capacity(soft_receive_cap); + batches.push(batch); + while batches.len() < soft_receive_cap { + match receiver.try_recv() { + Ok(b) => batches.push(datagram_to_batch(b)), Err(e) => match e { TryRecvError::Empty => return Ok(batches), TryRecvError::Disconnected => return Err(()), @@ -371,7 +467,7 @@ mod tests { struct TestContext { verifier: SigVerifier, validator_keypairs: Vec, - banlist: Arc, + banlist: SigVerifierBanlist, _packet_sender: Sender, repair_receiver: VerifiedVoterSlotsReceiver, @@ -427,7 +523,7 @@ mod tests { let verifier = SigVerifier::new( SigVerifierContext { migration_status: Arc::new(MigrationStatus::default()), - banlist: banlist.clone(), + banlist: banlist.clone().into(), sharable_banks, cluster_info, leader_schedule, @@ -435,7 +531,7 @@ mod tests { generated_cert_types: generated_cert_types.clone(), }, SigVerifierChannels { - packet_receiver, + packet_receiver: SigVerifierPacketReceiver::Stream(packet_receiver), channel_to_repair, channel_to_reward, channel_to_pool, @@ -445,7 +541,7 @@ mod tests { Self { validator_keypairs, verifier, - banlist, + banlist: banlist.into(), _packet_sender: packet_sender, repair_receiver, _reward_receiver: reward_receiver, @@ -1385,7 +1481,7 @@ mod tests { let mut sig_verifier = SigVerifier::new( SigVerifierContext { migration_status: Arc::new(MigrationStatus::default()), - banlist: new_test_banlist(), + banlist: new_test_banlist().into(), sharable_banks, cluster_info, leader_schedule, @@ -1393,7 +1489,7 @@ mod tests { generated_cert_types: Arc::new(GeneratedCertTypes::default()), }, SigVerifierChannels { - packet_receiver, + packet_receiver: SigVerifierPacketReceiver::Stream(packet_receiver), 
channel_to_repair: votes_for_repair_sender, channel_to_reward: reward_votes_sender, channel_to_pool: message_sender, diff --git a/core/src/bls_sigverify/bls_vote_sigverify.rs b/core/src/bls_sigverify/bls_vote_sigverify.rs index 6974e7867cf..d43aea8d39d 100644 --- a/core/src/bls_sigverify/bls_vote_sigverify.rs +++ b/core/src/bls_sigverify/bls_vote_sigverify.rs @@ -5,7 +5,9 @@ use { crate::{ block_creation_loop::rewards::msg_types::AddVoteMessage, bls_sigverify::{ - bls_sigverifier::{BAN_TIMEOUT, NUM_SLOTS_FOR_VERIFY, SigVerifierChannels}, + bls_sigverifier::{ + BAN_TIMEOUT, NUM_SLOTS_FOR_VERIFY, SigVerifierBanlist, SigVerifierChannels, + }, utils::{ send_votes_to_metrics, send_votes_to_pool, send_votes_to_repair, send_votes_to_rewards, @@ -32,7 +34,6 @@ use { solana_measure::{measure::Measure, measure_us}, solana_pubkey::Pubkey, solana_runtime::bank::Bank, - solana_streamer::nonblocking::simple_qos::SimpleQosBanlist, std::{collections::HashMap, sync::Arc}, }; @@ -76,7 +77,7 @@ pub(super) fn verify_and_send_votes( root_bank: &Bank, cluster_info: &ClusterInfo, leader_schedule: &LeaderScheduleCache, - banlist: &SimpleQosBanlist, + banlist: &SigVerifierBanlist, thread_pool: &ThreadPool, channels: &SigVerifierChannels, ) -> Result { @@ -180,7 +181,7 @@ fn verify_votes( root_bank: &Bank, votes_to_verify: Vec, stats: &mut SigVerifyVoteStats, - banlist: &SimpleQosBanlist, + banlist: &SigVerifierBanlist, thread_pool: &ThreadPool, ) -> Vec { // Filter votes too far in the future. 
diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 0469a562ea8..3eae2e685a1 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -6,7 +6,10 @@ use { admin_rpc_post_init::{KeyUpdaterType, KeyUpdaters}, banking_trace::BankingTracer, block_creation_loop::{ReplayHighestFrozen, rewards::msg_types::AddVoteMessage}, - bls_sigverify::bls_sigverifier::{self, SigVerifierChannels, SigVerifierContext}, + bls_sigverify::bls_sigverifier::{ + self, SigVerifierBanlist, SigVerifierChannels, SigVerifierContext, + SigVerifierPacketReceiver, + }, cluster_info_vote_listener::{ DuplicateConfirmedSlotsReceiver, GossipVerifiedVoteHashReceiver, VerifiedVoterSlotsReceiver, VerifiedVoterSlotsSender, VoteTracker, @@ -323,7 +326,7 @@ impl Tvu { exit.clone(), SigVerifierContext { migration_status: migration_status.clone(), - banlist, + banlist: SigVerifierBanlist::Stream(banlist), sharable_banks, cluster_info: cluster_info.clone(), leader_schedule: leader_schedule_cache.clone(), @@ -331,7 +334,7 @@ impl Tvu { generated_cert_types: generated_cert_types.clone(), }, SigVerifierChannels { - packet_receiver: bls_packet_receiver, + packet_receiver: SigVerifierPacketReceiver::Stream(bls_packet_receiver), channel_to_repair: verified_voter_slots_sender, channel_to_reward: reward_votes_sender, channel_to_pool: consensus_message_sender.clone(), From 910a03769aa9b973c88cc4f97a8dc5f623b46e29 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:49:34 +0000 Subject: [PATCH 4/6] votor: carry pubkeys with alpenglow peers --- votor/src/staked_validators_cache.rs | 96 +++++++++++++++++----------- votor/src/voting_service.rs | 11 ++-- 2 files changed, 65 insertions(+), 42 deletions(-) diff --git a/votor/src/staked_validators_cache.rs b/votor/src/staked_validators_cache.rs index 0912b78690c..7c6a0066af0 100644 --- a/votor/src/staked_validators_cache.rs +++ b/votor/src/staked_validators_cache.rs @@ -4,18 +4,18 @@ use { solana_clock::{Epoch, Slot}, solana_gossip::cluster_info::ClusterInfo, 
solana_pubkey::Pubkey, - solana_runtime::bank_forks::BankForks, + solana_runtime::bank_forks::SharableBanks, std::{ collections::HashMap, net::SocketAddr, - sync::{Arc, RwLock}, + sync::Arc, time::{Duration, Instant}, }, }; struct StakedValidatorsCacheEntry { - /// Alpenglow Sockets associated with the staked validators - alpenglow_sockets: Vec, + /// (Pubkey, Alpenglow socket) pairs for the staked validators. + peers: Vec<(Pubkey, SocketAddr)>, /// The time at which this entry was created creation_time: Instant, @@ -35,8 +35,8 @@ pub struct StakedValidatorsCache { /// Time to live for cache entries ttl: Duration, - /// Bank forks - bank_forks: Arc>, + /// Lock-free handle to the root/working banks. + sharable_banks: SharableBanks, /// Whether to include the running validator's socket address in cache entries include_self: bool, @@ -50,7 +50,7 @@ pub struct StakedValidatorsCache { impl StakedValidatorsCache { pub fn new( - bank_forks: Arc>, + sharable_banks: SharableBanks, ttl: Duration, target_cache_size: usize, include_self: bool, @@ -59,7 +59,7 @@ impl StakedValidatorsCache { Self { cache: LruCache::new(target_cache_size), ttl, - bank_forks, + sharable_banks, include_self, alpenglow_port_override, alpenglow_port_override_last_modified: Instant::now(), @@ -68,10 +68,8 @@ impl StakedValidatorsCache { #[inline] fn cur_epoch(&self, slot: Slot) -> Epoch { - self.bank_forks - .read() - .unwrap() - .working_bank() + self.sharable_banks + .working() .epoch_schedule() .get_epoch(slot) } @@ -82,10 +80,7 @@ impl StakedValidatorsCache { cluster_info: &ClusterInfo, update_time: Instant, ) { - let banks = { - let bank_forks = self.bank_forks.read().unwrap(); - [bank_forks.root_bank(), bank_forks.working_bank()] - }; + let banks = [self.sharable_banks.root(), self.sharable_banks.working()]; let epoch_staked_nodes = banks .iter() @@ -126,7 +121,7 @@ impl StakedValidatorsCache { nodes.dedup_by_key(|node| node.alpenglow_socket); nodes.sort_unstable_by_key(|a| a.stake); - let mut 
alpenglow_sockets = Vec::with_capacity(nodes.len()); + let mut peers = Vec::with_capacity(nodes.len()); let override_map = self .alpenglow_port_override .as_ref() @@ -142,12 +137,12 @@ impl StakedValidatorsCache { } else { alpenglow_socket }; - alpenglow_sockets.push(socket); + peers.push((node.pubkey, socket)); } self.cache.put( epoch, StakedValidatorsCacheEntry { - alpenglow_sockets, + peers, creation_time: update_time, }, ); @@ -158,7 +153,7 @@ impl StakedValidatorsCache { slot: Slot, cluster_info: &ClusterInfo, access_time: Instant, - ) -> (&[SocketAddr], bool) { + ) -> (&[(Pubkey, SocketAddr)], bool) { // Check if self.alpenglow_port_override has a different last_modified. // Immediately refresh the cache if it does. if let Some(alpenglow_port_override) = &self.alpenglow_port_override { @@ -183,7 +178,7 @@ impl StakedValidatorsCache { epoch: Epoch, cluster_info: &ClusterInfo, access_time: Instant, - ) -> (&[SocketAddr], bool) { + ) -> (&[(Pubkey, SocketAddr)], bool) { // For a given epoch, if we either: // // (1) have a cache entry that has expired @@ -203,10 +198,7 @@ impl StakedValidatorsCache { ( // Unwrapping is fine here, since update_cache guarantees that we push a cache entry to // self.cache[epoch]. 
- self.cache - .get(&epoch) - .map(|v| &*v.alpenglow_sockets) - .unwrap(), + self.cache.get(&epoch).map(|v| &*v.peers).unwrap(), refresh_cache, ) } @@ -370,7 +362,13 @@ mod tests { create_bank_forks_and_cluster_info(num_nodes, num_zero_stake_nodes, slot_num); // Create our staked validators cache - let mut svc = StakedValidatorsCache::new(bank_forks, Duration::from_secs(5), 5, true, None); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + true, + None, + ); let now = Instant::now(); @@ -445,7 +443,13 @@ mod tests { let (bank_forks, cluster_info, _) = create_bank_forks_and_cluster_info(50, 7, base_slot); // Create our staked validators cache - let mut svc = StakedValidatorsCache::new(bank_forks, Duration::from_secs(5), 5, true, None); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + true, + None, + ); assert_eq!(0, svc.len()); assert!(svc.is_empty()); @@ -517,7 +521,13 @@ mod tests { create_bank_forks_and_cluster_info(num_nodes, num_zero_stake_nodes, slot_num); // Create our staked validators cache - let mut svc = StakedValidatorsCache::new(bank_forks, Duration::from_secs(5), 5, true, None); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + true, + None, + ); let now = Instant::now(); @@ -546,23 +556,33 @@ mod tests { .unwrap(); // Create our staked validators cache - set include_self to true - let mut svc = - StakedValidatorsCache::new(bank_forks.clone(), Duration::from_secs(5), 5, true, None); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + true, + None, + ); let (sockets, _) = svc.get_staked_validators_by_slot(slot_num, &cluster_info, Instant::now()); assert_eq!(sockets.len(), num_nodes); - assert!(sockets.contains(&my_socket_addr)); + assert!(sockets.iter().any(|(_, 
socket)| socket == &my_socket_addr)); // Create our staked validators cache - set include_self to false - let mut svc = - StakedValidatorsCache::new(bank_forks, Duration::from_secs(5), 5, false, None); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + false, + None, + ); let (sockets, _) = svc.get_staked_validators_by_slot(slot_num, &cluster_info, Instant::now()); // We should have num_nodes - 1 sockets, since we exclude our own socket address. assert_eq!(sockets.len(), num_nodes.checked_sub(1).unwrap()); - assert!(!sockets.contains(&my_socket_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &my_socket_addr)); } #[test] @@ -576,7 +596,7 @@ mod tests { // Create our staked validators cache - set include_self to false let mut svc = StakedValidatorsCache::new( - bank_forks, + bank_forks.read().unwrap().sharable_banks(), Duration::from_secs(5), 5, false, @@ -585,14 +605,14 @@ mod tests { // Nothing in the override, so we should get the original socket addresses. let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); - assert!(!sockets.contains(&blackhole_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &blackhole_addr)); // Add an override for pubkey_B, and check that we get the overridden socket address. alpenglow_port_override.update_override(HashMap::from([(pubkey_b, blackhole_addr)])); let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); // Sort sockets to ensure the blackhole address is at index 0. 
- let mut sockets: Vec<_> = sockets.to_vec(); + let mut sockets: Vec<_> = sockets.iter().map(|(_, socket)| *socket).collect(); sockets.sort(); assert_eq!(sockets[0], blackhole_addr); assert_ne!(sockets[1], blackhole_addr); @@ -601,6 +621,6 @@ mod tests { alpenglow_port_override.clear(); let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); assert_eq!(sockets.len(), 2); - assert!(!sockets.contains(&blackhole_addr)); + assert!(!sockets.iter().any(|(_, socket)| socket == &blackhole_addr)); } } diff --git a/votor/src/voting_service.rs b/votor/src/voting_service.rs index 09cf5db9fa9..fc6dd5ce61b 100644 --- a/votor/src/voting_service.rs +++ b/votor/src/voting_service.rs @@ -138,7 +138,7 @@ impl VotingService { .name("solVotorVoteSvc".to_string()) .spawn(move || { let mut staked_validators_cache = StakedValidatorsCache::new( - bank_forks.clone(), + bank_forks.read().unwrap().sharable_banks(), Duration::from_secs(STAKED_VALIDATORS_CACHE_TTL_S), STAKED_VALIDATORS_CACHE_NUM_EPOCH_TARGET, false, @@ -178,11 +178,14 @@ impl VotingService { } }; - let (staked_validator_alpenglow_sockets, _) = staked_validators_cache - .get_staked_validators_by_slot(slot, cluster_info, Instant::now()); + let (staked_validator_peers, _) = staked_validators_cache.get_staked_validators_by_slot( + slot, + cluster_info, + Instant::now(), + ); let sockets = additional_listeners .iter() - .chain(staked_validator_alpenglow_sockets.iter()); + .chain(staked_validator_peers.iter().map(|(_, socket)| socket)); // We use send_message in a loop right now because we worry that sending packets too fast // will cause a packet spike and overwhelm the network. 
If we later find out that this is From 43ba1b61f1126cfd6d47e31c86d00d0a66786d34 Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:50:12 +0000 Subject: [PATCH 5/6] votor: add datagram voting transport --- core/src/tvu.rs | 6 +- local-cluster/tests/local_cluster.rs | 14 +- votor/src/staked_validators_cache.rs | 88 ++++++++++++- votor/src/voting_service.rs | 186 ++++++++++++++++++++++----- 4 files changed, 257 insertions(+), 37 deletions(-) diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 3eae2e685a1..9cb6557c08f 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -37,7 +37,9 @@ use { generated_cert_types::GeneratedCertTypes, vote_history::VoteHistory, vote_history_storage::VoteHistoryStorage, - voting_service::{VotingService as BLSVotingService, VotingServiceOverride}, + voting_service::{ + VotingService as BLSVotingService, VotingServiceOverride, VotingTransport, + }, votor::{Votor, VotorConfig}, }, agave_votor_messages::consensus_message::Block, @@ -598,7 +600,7 @@ impl Tvu { bls_receiver, cluster_info.clone(), vote_history_storage, - bls_connection_cache, + VotingTransport::Stream(bls_connection_cache), bank_forks.clone(), voting_service_test_override, ); diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 94ab61b8af7..2baac35dc37 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -4,7 +4,9 @@ use { SnapshotArchiveKind, SnapshotInterval, paths as snapshot_paths, snapshot_archive_info::SnapshotArchiveInfoGetter, snapshot_config::SnapshotConfig, }, - agave_votor::voting_service::{AlpenglowPortOverride, VotingServiceOverride}, + agave_votor::voting_service::{ + AdditionalListener, AlpenglowPortOverride, VotingServiceOverride, + }, agave_votor_messages::migration::MIGRATION_SLOT_OFFSET, assert_matches::assert_matches, crossbeam_channel::{Receiver, bounded}, @@ -5990,7 +5992,10 @@ fn test_alpenglow_imbalanced_stakes_catchup() { let mut validator_config = 
ValidatorConfig::default_for_test(); validator_config.fixed_leader_schedule = Some(leader_schedule); validator_config.voting_service_test_override = Some(VotingServiceOverride { - additional_listeners: vec![vote_listener_addr.local_addr().unwrap()], + additional_listeners: vec![AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: vote_listener_addr.local_addr().unwrap(), + }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); @@ -6177,7 +6182,10 @@ fn test_alpenglow_migration( let vote_listener_addr = vote_listener_socket.try_clone().unwrap(); let mut validator_config = ValidatorConfig::default_for_test(); validator_config.voting_service_test_override = Some(VotingServiceOverride { - additional_listeners: vec![vote_listener_addr.local_addr().unwrap()], + additional_listeners: vec![AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: vote_listener_addr.local_addr().unwrap(), + }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); diff --git a/votor/src/staked_validators_cache.rs b/votor/src/staked_validators_cache.rs index 7c6a0066af0..e92a9dd8370 100644 --- a/votor/src/staked_validators_cache.rs +++ b/votor/src/staked_validators_cache.rs @@ -1,5 +1,8 @@ +#[cfg(any(test, feature = "dev-context-only-utils"))] +use std::collections::HashSet; use { - crate::voting_service::AlpenglowPortOverride, + crate::datagram_transport::allowlist::StakedNodesAllowlist, + crate::{datagram_endpoint, voting_service::AlpenglowPortOverride}, lazy_lru::LruCache, solana_clock::{Epoch, Slot}, solana_gossip::cluster_info::ClusterInfo, @@ -46,15 +49,25 @@ pub struct StakedValidatorsCache { /// timestamp of the last alpenglow port override we read alpenglow_port_override_last_modified: Instant, + + /// Allowlist for the votor datagram endpoint. + allowlist: Option>, + + /// Extra pubkeys that should always be allowed even when not in the staked set. 
+ #[cfg(any(test, feature = "dev-context-only-utils"))] + extra_admit: HashSet, } impl StakedValidatorsCache { + #[allow(clippy::too_many_arguments)] pub fn new( sharable_banks: SharableBanks, ttl: Duration, target_cache_size: usize, include_self: bool, alpenglow_port_override: Option, + allowlist: Option>, + #[cfg(any(test, feature = "dev-context-only-utils"))] extra_admit: HashSet, ) -> Self { Self { cache: LruCache::new(target_cache_size), @@ -63,9 +76,27 @@ impl StakedValidatorsCache { include_self, alpenglow_port_override, alpenglow_port_override_last_modified: Instant::now(), + allowlist, + #[cfg(any(test, feature = "dev-context-only-utils"))] + extra_admit, } } + fn refresh_allowlist(&mut self) { + let Some(allowlist) = self.allowlist.as_ref() else { + return; + }; + + let admit_set = datagram_endpoint::current_admit_set(&self.sharable_banks); + #[cfg(any(test, feature = "dev-context-only-utils"))] + let admit_set = { + let mut admit_set = admit_set; + admit_set.extend(self.extra_admit.iter().map(|pubkey| (*pubkey, 0))); + admit_set + }; + allowlist.swap(admit_set); + } + #[inline] fn cur_epoch(&self, slot: Slot) -> Epoch { self.sharable_banks @@ -80,6 +111,8 @@ impl StakedValidatorsCache { cluster_info: &ClusterInfo, update_time: Instant, ) { + self.refresh_allowlist(); + let banks = [self.sharable_banks.root(), self.sharable_banks.working()]; let epoch_staked_nodes = banks @@ -217,7 +250,7 @@ impl StakedValidatorsCache { #[cfg(test)] mod tests { use { - super::StakedValidatorsCache, + super::{HashSet, StakedValidatorsCache}, crate::voting_service::AlpenglowPortOverride, rand::Rng, solana_gossip::{ @@ -368,6 +401,8 @@ mod tests { 5, true, None, + None, + HashSet::new(), ); let now = Instant::now(); @@ -449,6 +484,8 @@ mod tests { 5, true, None, + None, + HashSet::new(), ); assert_eq!(0, svc.len()); @@ -527,6 +564,8 @@ mod tests { 5, true, None, + None, + HashSet::new(), ); let now = Instant::now(); @@ -562,6 +601,8 @@ mod tests { 5, true, None, + None, + 
HashSet::new(), ); let (sockets, _) = @@ -576,6 +617,8 @@ mod tests { 5, false, None, + None, + HashSet::new(), ); let (sockets, _) = @@ -601,6 +644,8 @@ mod tests { 5, false, Some(alpenglow_port_override.clone()), + None, + HashSet::new(), ); // Nothing in the override, so we should get the original socket addresses. let (sockets, _) = svc.get_staked_validators_by_slot(0, &cluster_info, Instant::now()); @@ -623,4 +668,43 @@ mod tests { assert_eq!(sockets.len(), 2); assert!(!sockets.iter().any(|(_, socket)| socket == &blackhole_addr)); } + + #[test] + fn test_allowlist_populated_from_staked_nodes() { + use crate::datagram_transport::allowlist::{Allowlist, StakedNodesAllowlist}; + + let slot_num = 325_000_000_u64; + let num_nodes = 10_usize; + let num_zero_stake_nodes = 3_usize; + let (bank_forks, cluster_info, pubkeys) = + create_bank_forks_and_cluster_info(num_nodes, num_zero_stake_nodes, slot_num); + + let allowlist = Arc::new(StakedNodesAllowlist::default()); + let mut svc = StakedValidatorsCache::new( + bank_forks.read().unwrap().sharable_banks(), + Duration::from_secs(5), + 5, + true, + None, + Some(allowlist.clone()), + HashSet::new(), + ); + + assert!(allowlist.is_empty()); + let _ = svc.get_staked_validators_by_slot(slot_num, &cluster_info, Instant::now()); + + for (index, pubkey) in pubkeys.iter().enumerate() { + if index < num_zero_stake_nodes { + assert!( + !allowlist.allow(pubkey), + "zero-stake node {index} must not be admitted" + ); + } else { + assert!( + allowlist.allow(pubkey), + "staked node {index} must be admitted" + ); + } + } + } } diff --git a/votor/src/voting_service.rs b/votor/src/voting_service.rs index fc6dd5ce61b..7b32b053fa7 100644 --- a/votor/src/voting_service.rs +++ b/votor/src/voting_service.rs @@ -1,4 +1,7 @@ +#[cfg(any(test, feature = "dev-context-only-utils"))] +use std::collections::HashSet; use { + crate::datagram_transport::{allowlist::StakedNodesAllowlist, endpoint::Datagram}, crate::{ 
staked_validators_cache::StakedValidatorsCache, vote_history_storage::{SavedVoteHistoryVersions, VoteHistoryStorage}, @@ -7,6 +10,7 @@ use { certificate::Certificate, consensus_message::{ConsensusMessage, VoteMessage}, }, + bytes::Bytes, crossbeam_channel::Receiver, solana_client::connection_cache::ConnectionCache, solana_clock::Slot, @@ -23,6 +27,7 @@ use { thread::{self, Builder, JoinHandle}, time::{Duration, Instant}, }, + tokio::sync::mpsc, }; const STAKED_VALIDATORS_CACHE_TTL_S: u64 = 5; @@ -111,9 +116,33 @@ impl AlpenglowPortOverride { } } +#[derive(Clone, Debug)] +pub struct AdditionalListener { + pub pubkey: Pubkey, + pub addr: SocketAddr, +} + +#[derive(Clone)] +pub enum VotingTransport { + Stream(Arc), + Datagram { + egress: mpsc::Sender, + allowlist: Option>, + }, +} + +impl VotingTransport { + fn allowlist(&self) -> Option> { + match self { + Self::Stream(_) => None, + Self::Datagram { allowlist, .. } => allowlist.clone(), + } + } +} + #[derive(Clone)] pub struct VotingServiceOverride { - pub additional_listeners: Vec, + pub additional_listeners: Vec, pub alpenglow_port_override: AlpenglowPortOverride, } @@ -122,7 +151,7 @@ impl VotingService { bls_receiver: Receiver, cluster_info: Arc, vote_history_storage: Arc, - connection_cache: Arc, + transport: VotingTransport, bank_forks: Arc>, test_override: Option, ) -> Self { @@ -134,6 +163,13 @@ impl VotingService { }) => (additional_listeners, Some(alpenglow_port_override)), }; + let allowlist = transport.allowlist(); + #[cfg(any(test, feature = "dev-context-only-utils"))] + let extra_admit: HashSet = additional_listeners + .iter() + .map(|listener| listener.pubkey) + .collect(); + let thread_hdl = Builder::new() .name("solVotorVoteSvc".to_string()) .spawn(move || { @@ -143,6 +179,9 @@ impl VotingService { STAKED_VALIDATORS_CACHE_NUM_EPOCH_TARGET, false, alpenglow_port_override, + allowlist, + #[cfg(any(test, feature = "dev-context-only-utils"))] + extra_admit, ); info!("AlpenglowVotingService has started"); 
@@ -151,7 +190,7 @@ impl VotingService { &cluster_info, vote_history_storage.as_ref(), bls_op, - &connection_cache, + &transport, &additional_listeners, &mut staked_validators_cache, ); @@ -166,8 +205,8 @@ impl VotingService { slot: Slot, cluster_info: &ClusterInfo, message: &ConsensusMessage, - connection_cache: &ConnectionCache, - additional_listeners: &[SocketAddr], + transport: &VotingTransport, + additional_listeners: &[AdditionalListener], staked_validators_cache: &mut StakedValidatorsCache, ) { let buf = match wincode::serialize(message) { @@ -183,16 +222,45 @@ impl VotingService { cluster_info, Instant::now(), ); - let sockets = additional_listeners - .iter() - .chain(staked_validator_peers.iter().map(|(_, socket)| socket)); - - // We use send_message in a loop right now because we worry that sending packets too fast - // will cause a packet spike and overwhelm the network. If we later find out that this is - // not an issue, we can optimize this by using multi_targret_send or similar methods. - for socket in sockets { - if let Err(e) = send_message(buf.clone(), socket, connection_cache) { - warn!("Failed to send alpenglow message to {socket}: {e:?}"); + match transport { + VotingTransport::Stream(connection_cache) => { + let sockets = additional_listeners + .iter() + .map(|listener| &listener.addr) + .chain(staked_validator_peers.iter().map(|(_, socket)| socket)); + + // We use send_message in a loop right now because we worry that sending packets + // too fast will cause a packet spike and overwhelm the network. If we later find + // out that this is not an issue, we can optimize this by using multi_target_send + // or a similar API. + for socket in sockets { + if let Err(e) = send_message(buf.clone(), socket, connection_cache) { + warn!("Failed to send alpenglow message to {socket}: {e:?}"); + } + } + } + VotingTransport::Datagram { egress, .. 
} => { + let buf = Bytes::from(buf); + let peers = additional_listeners + .iter() + .map(|listener| (listener.pubkey, listener.addr)) + .chain(staked_validator_peers.iter().copied()); + for (peer_pubkey, peer_address) in peers { + match egress.try_send(Datagram { + peer_pubkey, + peer_address, + message: buf.clone(), + }) { + Ok(()) => {} + Err(mpsc::error::TrySendError::Full(_)) => { + warn!("alpenglow egress channel full; dropping vote/cert"); + } + Err(mpsc::error::TrySendError::Closed(_)) => { + warn!("alpenglow egress channel closed; endpoint shutting down"); + return; + } + } + } } } } @@ -201,8 +269,8 @@ impl VotingService { cluster_info: &ClusterInfo, vote_history_storage: &dyn VoteHistoryStorage, bls_op: BLSOp, - connection_cache: &ConnectionCache, - additional_listeners: &[SocketAddr], + transport: &VotingTransport, + additional_listeners: &[AdditionalListener], staked_validators_cache: &mut StakedValidatorsCache, ) { match bls_op { @@ -223,7 +291,7 @@ impl VotingService { slot, cluster_info, &msg, - connection_cache, + transport, additional_listeners, staked_validators_cache, ); @@ -236,7 +304,7 @@ impl VotingService { slot, cluster_info, &message, - connection_cache, + transport, additional_listeners, staked_validators_cache, ); @@ -280,17 +348,15 @@ mod tests { quic::{QuicStreamerConfig, SpawnServerResult, spawn_stake_weighted_qos_server}, streamer::StakedNodes, }, - std::{ - net::SocketAddr, - sync::{Arc, RwLock}, - }, + std::sync::{Arc, RwLock}, test_case::test_case, tokio_util::sync::CancellationToken, }; fn create_voting_service( bls_receiver: Receiver, - listener: SocketAddr, + listener: AdditionalListener, + transport: VotingTransport, ) -> (VotingService, Vec) { // Create 10 node validatorvotekeypairs vec let validator_keypairs = (0..10) @@ -316,10 +382,7 @@ mod tests { bls_receiver, Arc::new(cluster_info), Arc::new(NullVoteHistoryStorage::default()), - Arc::new(ConnectionCache::new_quic( - "TestAlpenglowConnectionCache", - 10, - )), + 
transport, bank_forks, Some(VotingServiceOverride { additional_listeners: vec![listener], @@ -362,8 +425,15 @@ mod tests { let socket = bind_to_localhost_unique().unwrap(); let listener_addr = socket.local_addr().unwrap(); - // Create VotingService with the listener address - let (_, validator_keypairs) = create_voting_service(bls_receiver, listener_addr); + let listener = AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: listener_addr, + }; + let transport = VotingTransport::Stream(Arc::new(ConnectionCache::new_quic( + "TestAlpenglowConnectionCache", + 10, + ))); + let (_, validator_keypairs) = create_voting_service(bls_receiver, listener, transport); // Send a BLS message via the VotingService assert!(bls_sender.send(bls_op).is_ok()); @@ -411,4 +481,60 @@ mod tests { cancel.cancel(); quic_server_thread.join().unwrap(); } + + #[test_case(BLSOp::PushVote { + vote: Arc::new(VoteMessage { + vote: Vote::new_skip_vote(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + rank: 1, + }), + saved_vote_history: SavedVoteHistoryVersions::Current(SavedVoteHistory::default()), + }, ConsensusMessage::Vote(VoteMessage { + vote: Vote::new_skip_vote(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + rank: 1, + }))] + #[test_case(BLSOp::PushCertificates { + certificates: vec![Arc::new(Certificate { + cert_type: CertificateType::Skip(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + bitmap: Vec::new(), + })], + }, ConsensusMessage::Certificate(Certificate { + cert_type: CertificateType::Skip(5), + signature: BLSSignature([0; BLS_SIGNATURE_AFFINE_SIZE]), + bitmap: Vec::new(), + }))] + fn test_send_datagram_message(bls_op: BLSOp, expected_message: ConsensusMessage) { + let (bls_sender, bls_receiver) = bounded(1024); + let (egress, mut egress_receiver) = mpsc::channel(1024); + let listener = AdditionalListener { + pubkey: Pubkey::new_unique(), + addr: "127.0.0.1:12345".parse().unwrap(), + }; + let transport = VotingTransport::Datagram 
{ + egress, + allowlist: None, + }; + let (_voting_service, _validator_keypairs) = + create_voting_service(bls_receiver, listener.clone(), transport); + + assert!(bls_sender.send(bls_op).is_ok()); + + let deadline = Instant::now() + Duration::from_secs(5); + let datagram = loop { + match egress_receiver.try_recv() { + Ok(datagram) => break datagram, + Err(mpsc::error::TryRecvError::Empty) if Instant::now() < deadline => { + std::thread::sleep(Duration::from_millis(10)); + } + Err(err) => panic!("failed to receive datagram: {err:?}"), + } + }; + assert_eq!(datagram.peer_pubkey, listener.pubkey); + assert_eq!(datagram.peer_address, listener.addr); + let received_message = wincode::deserialize::(&datagram.message) + .expect("deserialize datagram message"); + assert_eq!(received_message, expected_message); + } } From 07a1c1e1bcf19e91006db610396844cc5734abac Mon Sep 17 00:00:00 2001 From: greg Date: Tue, 9 Jun 2026 02:52:02 +0000 Subject: [PATCH 6/6] votor: switch BLS messaging to datagrams --- Cargo.lock | 2 + core/src/admin_rpc_post_init.rs | 7 +- core/src/bls_sigverify/bls_sigverifier.rs | 6 + core/src/tvu.rs | 146 ++++++++------------- core/src/validator.rs | 91 +++++++++---- local-cluster/Cargo.toml | 4 +- local-cluster/src/cluster_tests.rs | 152 ++++++++++------------ local-cluster/src/local_cluster.rs | 10 +- local-cluster/tests/local_cluster.rs | 65 +++++---- validator/src/admin_rpc_service.rs | 3 +- 10 files changed, 247 insertions(+), 239 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9962caa61fb..cec1635b096 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8958,7 +8958,9 @@ dependencies = [ "static_assertions", "strum", "tempfile", + "tokio", "tokio-util 0.7.18", + "wincode", ] [[package]] diff --git a/core/src/admin_rpc_post_init.rs b/core/src/admin_rpc_post_init.rs index 79259fc625c..93f5f5ed194 100644 --- a/core/src/admin_rpc_post_init.rs +++ b/core/src/admin_rpc_post_init.rs @@ -30,10 +30,9 @@ pub enum KeyUpdaterType { Forward, /// For the RPC 
service RpcService, - /// BLS all-to-all streamer key updater - Bls, - /// BLS all-to-all connection cache key updater - BlsConnectionCache, + /// Votor QUIC datagram endpoint key updater (single endpoint multiplexes + /// inbound consensus messages and outbound votes/certs). + VotorDatagram, } /// Responsible for managing the updaters for identity key change diff --git a/core/src/bls_sigverify/bls_sigverifier.rs b/core/src/bls_sigverify/bls_sigverifier.rs index e7e82d4e229..3818c82b780 100644 --- a/core/src/bls_sigverify/bls_sigverifier.rs +++ b/core/src/bls_sigverify/bls_sigverifier.rs @@ -93,6 +93,12 @@ pub(crate) enum SigVerifierPacketReceiver { Datagram(Receiver), } +impl From> for SigVerifierPacketReceiver { + fn from(receiver: Receiver) -> Self { + Self::Stream(receiver) + } +} + impl From> for SigVerifierPacketReceiver { fn from(receiver: Receiver) -> Self { Self::Datagram(receiver) diff --git a/core/src/tvu.rs b/core/src/tvu.rs index 9cb6557c08f..4cf59005fbc 100644 --- a/core/src/tvu.rs +++ b/core/src/tvu.rs @@ -3,7 +3,7 @@ use { crate::{ - admin_rpc_post_init::{KeyUpdaterType, KeyUpdaters}, + admin_rpc_post_init::KeyUpdaters, banking_trace::BankingTracer, block_creation_loop::{ReplayHighestFrozen, rewards::msg_types::AddVoteMessage}, bls_sigverify::bls_sigverifier::{ @@ -31,6 +31,7 @@ use { warm_quic_cache_service::WarmQuicCacheService, window_service::{WindowService, WindowServiceChannels}, }, + agave_votor::datagram_transport::endpoint::Datagram, agave_votor::{ consensus_metrics::MAX_IN_FLIGHT_CONSENSUS_EVENTS, event::{LatestSwitchRequest, LeaderWindowInfo, VotorEventReceiver, VotorEventSender}, @@ -60,6 +61,7 @@ use { leader_schedule_cache::LeaderScheduleCache, shred::filter::TurbineMode, }, + solana_net_utils::banlist::Banlist, solana_poh::{poh_controller::PohController, poh_recorder::PohRecorder}, solana_pubkey::Pubkey, solana_rpc::{ @@ -67,7 +69,6 @@ use { rpc_subscriptions::RpcSubscriptions, slot_status_notifier::SlotStatusNotifier, }, 
solana_runtime::{ - bank::MAX_ALPENGLOW_VOTE_ACCOUNTS, bank_forks::BankForks, bank_forks_controller::{BankForksCommandReceiver, BankForksController}, commitment::BlockCommitmentCache, @@ -76,12 +77,7 @@ use { validated_block_finalization::ValidatedBlockFinalizationCert, vote_sender_types::ReplayVoteSender, }, - solana_streamer::{ - evicting_sender::EvictingSender, - nonblocking::simple_qos::SimpleQosConfig, - quic::{QuicStreamerConfig, SpawnServerResult, spawn_simple_qos_server}, - streamer::StakedNodes, - }, + solana_streamer::evicting_sender::EvictingSender, solana_turbine::{XdpSender as TurbineXdpSender, retransmit_stage::RetransmitStage}, std::{ collections::HashSet, @@ -90,7 +86,6 @@ use { sync::{Arc, RwLock, atomic::AtomicBool}, thread::{self, JoinHandle}, }, - tokio_util::sync::CancellationToken, }; /// Sets the upper bound on the number of batches stored in the retransmit @@ -122,7 +117,7 @@ pub struct Tvu { warm_quic_cache_service: Option, drop_bank_service: DropBankService, duplicate_shred_listener: DuplicateShredListener, - bls_sigverify_threads: Option<(JoinHandle<()>, JoinHandle<()>)>, + bls_sigverify_threads: Option>, votor: Votor, commitment_service: AggregateCommitmentService, } @@ -184,13 +179,20 @@ pub struct AlpenglowInitializationState { pub votor_event_sender: VotorEventSender, pub votor_event_receiver: VotorEventReceiver, - // For BLS streamer setup - pub cancel: CancellationToken, - pub staked_nodes: Arc>, pub key_notifiers: Arc>, - // For BLS voting service - pub bls_connection_cache: Arc, + // Votor QUIC datagram transport handles. Egress goes to the BLS + // voting service; ingress feeds the BLS sigverifier; banlist is shared + // with the sigverifier so external triggers (e.g. signature failure) + // can soft-ban peers cluster-wide. `votor_allowlist` is owned by the + // voting service's StakedValidatorsCache, which republishes the + // staked-pubkey set on every epoch boundary. 
+ pub votor_egress: + tokio::sync::mpsc::Sender, + pub votor_ingress: Receiver, + pub votor_banlist: Arc>, + pub votor_allowlist: + Option>, pub voting_service_test_override: Option, } @@ -269,10 +271,11 @@ impl Tvu { bank_forks_controller_receiver, votor_event_sender, votor_event_receiver, - cancel, - staked_nodes, - key_notifiers, - bls_connection_cache, + key_notifiers: _key_notifiers, + votor_egress, + votor_ingress, + votor_banlist, + votor_allowlist, voting_service_test_override, highest_finalized, } = votor_init; @@ -284,51 +287,23 @@ impl Tvu { bounded(MAX_IN_FLIGHT_CONSENSUS_EVENTS); let generated_cert_types = Arc::new(GeneratedCertTypes::default()); - // The BLS socket is currently only available on Testnet and Development clusters. - // Closer to release we will enable this for all clusters. - let bls_sigverify_threads = if let Some(bls_socket) = bls_socket { - let (bls_packet_sender, bls_packet_receiver) = bounded(MAX_ALPENGLOW_PACKET_NUM); - - let ( - SpawnServerResult { - endpoints: _, - thread: bls_streamer_t, - key_updater: bls_key_updater, - }, - banlist, - ) = { - let quic_server_params = QuicStreamerConfig { - num_threads: NonZeroUsize::new(4.min(num_cpus::get())).unwrap(), - ..Default::default() - }; - let qos_config = SimpleQosConfig { - max_streams_per_second: 30, - // Cap by # of active validators (some overhead for epoch boundaries) - max_staked_connections: MAX_ALPENGLOW_VOTE_ACCOUNTS * 2, - // Two staked connection per validator to account for hotspares - max_connections_per_peer: 2, - }; - spawn_simple_qos_server( - "solQuicBLS", - "quic_streamer_bls", - vec![bls_socket.into()], - &cluster_info.keypair(), - bls_packet_sender, - staked_nodes, - quic_server_params, - qos_config, - cancel, - ) - .unwrap() - }; - - // sigverifier + // BLS sigverifier — fed by the votor datagram endpoint constructed + // in validator.rs. The endpoint task itself runs there; here we + // only consume its ingress + banlist. 
+ // + // The bls_socket is currently only available on Testnet and + // Development clusters; outside that window the endpoint is not + // constructed and ingress will simply never fire. The `_` on + // bls_socket below is intentional — we don't actually need it + // here anymore (validator.rs already consumed it). + let _ = bls_socket; + let bls_sigverify_threads = { let sharable_banks = bank_forks.read().unwrap().sharable_banks(); let bls_sigverifier_t = bls_sigverifier::spawn_service( exit.clone(), SigVerifierContext { migration_status: migration_status.clone(), - banlist: SigVerifierBanlist::Stream(banlist), + banlist: SigVerifierBanlist::Datagram(votor_banlist), sharable_banks, cluster_info: cluster_info.clone(), leader_schedule: leader_schedule_cache.clone(), @@ -336,20 +311,14 @@ impl Tvu { generated_cert_types: generated_cert_types.clone(), }, SigVerifierChannels { - packet_receiver: SigVerifierPacketReceiver::Stream(bls_packet_receiver), + packet_receiver: SigVerifierPacketReceiver::Datagram(votor_ingress), channel_to_repair: verified_voter_slots_sender, channel_to_reward: reward_votes_sender, channel_to_pool: consensus_message_sender.clone(), channel_to_metrics: consensus_metrics_sender.clone(), }, ); - - let mut key_notifiers = key_notifiers.write().unwrap(); - key_notifiers.add(KeyUpdaterType::Bls, bls_key_updater); - - Some((bls_streamer_t, bls_sigverifier_t)) - } else { - None + Some(bls_sigverifier_t) }; let (fetch_sender, fetch_receiver) = EvictingSender::new_bounded(SHRED_FETCH_CHANNEL_SIZE); @@ -600,7 +569,10 @@ impl Tvu { bls_receiver, cluster_info.clone(), vote_history_storage, - VotingTransport::Stream(bls_connection_cache), + VotingTransport::Datagram { + egress: votor_egress, + allowlist: votor_allowlist, + }, bank_forks.clone(), voting_service_test_override, ); @@ -676,8 +648,7 @@ impl Tvu { } self.drop_bank_service.join()?; self.duplicate_shred_listener.join()?; - if let Some((streamer, sigverifier)) = self.bls_sigverify_threads { - 
streamer.join()?; + if let Some(sigverifier) = self.bls_sigverify_threads { sigverifier.join()?; } self.votor.join()?; @@ -736,10 +707,7 @@ pub mod tests { solana_runtime::{bank::Bank, bank_forks_controller::BankForksControllerHandle}, solana_signer::Signer, solana_tpu_client::tpu_client::{DEFAULT_TPU_CONNECTION_POOL_SIZE, DEFAULT_VOTE_USE_QUIC}, - std::{ - sync::atomic::{AtomicU64, Ordering}, - time::Duration, - }, + std::sync::atomic::{AtomicU64, Ordering}, }; #[test] @@ -804,10 +772,12 @@ pub mod tests { DEFAULT_TPU_CONNECTION_POOL_SIZE, ) }; - let bls_connection_cache = ConnectionCache::new_quic_for_tests( - "connection_cache_bls_quic", - DEFAULT_TPU_CONNECTION_POOL_SIZE, - ); + // Stub the votor datagram channels — the test runs without an + // actual alpenglow endpoint. The egress sink and ingress source + // are never connected to each other; ingress simply never fires. + let (votor_egress, _votor_egress_rx) = tokio::sync::mpsc::channel(1024); + let (_votor_ingress_tx, votor_ingress) = bounded(1024); + let votor_banlist = Arc::new(Banlist::::default()); let replay_highest_frozen = Arc::new(ReplayHighestFrozen::default()); let (leader_window_info_sender, _leader_window_info_receiver) = unbounded(); let (optimistic_parent_sender, optimistic_parent_receiver) = unbounded(); @@ -820,20 +790,7 @@ pub mod tests { ))); let (votor_event_sender, votor_event_receiver): (VotorEventSender, VotorEventReceiver) = unbounded(); - let staked_nodes = Arc::new(RwLock::new(StakedNodes::default())); let key_notifiers = Arc::new(RwLock::new(KeyUpdaters::default())); - let cancel = CancellationToken::new(); - thread::spawn({ - let cancel = cancel.clone(); - let exit = exit.clone(); - move || loop { - if exit.load(Ordering::Relaxed) { - cancel.cancel(); - break; - } - thread::sleep(Duration::from_secs(1)); - } - }); let (bank_forks_controller, bank_forks_controller_receiver) = BankForksControllerHandle::new(); let bank_forks_controller = Arc::new(bank_forks_controller); @@ -903,10 
+860,11 @@ pub mod tests { highest_parent_ready, votor_event_sender, votor_event_receiver, - cancel, - staked_nodes, key_notifiers, - bls_connection_cache: Arc::new(bls_connection_cache), + votor_egress, + votor_ingress, + votor_banlist, + votor_allowlist: None, voting_service_test_override: None, highest_finalized: Arc::new(RwLock::new(None)), bank_forks_controller, diff --git a/core/src/validator.rs b/core/src/validator.rs index 160eb923341..4c4a4e06b51 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -1187,30 +1187,7 @@ impl Validator { )) }; - let bls_connection_cache = Arc::new(ConnectionCache::new_with_client_options( - "connection_cache_bls_quic", - // BLS consensus messaging is extremely low throughput (5 PPS). Even during standstill operations - // we wouldn't expect more than a 100 PPS. 1 connection is enough. - 1, /* connection_pool_size */ - Some(node.sockets.quic_alpenglow_client), - Some(( - &identity_keypair, - node.info - .alpenglow() - .ok_or_else(|| { - ValidatorError::Other(String::from( - "Invalid QUIC address for Alpenglow BLS", - )) - })? - .ip(), - )), - Some((&staked_nodes, &identity_keypair.pubkey())), - )); let key_notifiers = Arc::new(RwLock::new(KeyUpdaters::default())); - key_notifiers.write().unwrap().add( - KeyUpdaterType::BlsConnectionCache, - bls_connection_cache.clone(), - ); // test-validator crate may start the validator in a tokio runtime // context which forces us to use the same runtime because a nested @@ -1227,6 +1204,67 @@ impl Validator { .unwrap() }); + // Votor QUIC datagram endpoint. One UDP socket multiplexes + // outbound votes/certs (egress) and inbound consensus messages + // (ingress) per the lex-pubkey direction rule. Only constructed + // on Testnet/Development clusters; absent → stub channels are + // installed downstream so the BLS sigverifier task spawns but + // never receives anything. 
+ let alpenglow_socket = if matches!( + genesis_config.cluster_type, + ClusterType::Testnet | ClusterType::Development, + ) { + node.sockets.alpenglow.take() + } else { + None + }; + let (votor_egress, votor_ingress, votor_banlist, votor_allowlist) = if let Some(socket) = + alpenglow_socket + { + let votor_rt_handle = tpu_client_next_runtime + .as_ref() + .map(TokioRuntime::handle) + .unwrap_or_else(|| current_runtime_handle.as_ref().unwrap()); + let (ingress_tx, ingress_rx) = + crossbeam_channel::bounded(crate::tvu::MAX_ALPENGLOW_PACKET_NUM); + + let banlist = Arc::new(solana_net_utils::banlist::Banlist::::default()); + let allowlist = agave_votor::datagram_endpoint::build_allowlist( + &bank_forks.read().unwrap().sharable_banks(), + ); + + let endpoint = agave_votor::datagram_transport::endpoint::QuicDatagramEndpoint::new( + votor_rt_handle, + &identity_keypair, + socket, + agave_votor::datagram_endpoint::ALPENGLOW_ALPN, + ingress_tx, + allowlist.clone(), + banlist.clone(), + ) + .map_err(|e| ValidatorError::Other(format!("alpenglow endpoint: {e:?}")))?; + key_notifiers + .write() + .unwrap() + .add(KeyUpdaterType::VotorDatagram, endpoint.key_updater.clone()); + let egress = endpoint.egress.clone(); + votor_rt_handle.spawn({ + let cancel = cancel.clone(); + async move { + cancel.cancelled().await; + endpoint.close(); + } + }); + (egress, ingress_rx, banlist, Some(allowlist)) + } else { + let (egress, _) = tokio::sync::mpsc::channel(1); + let (_, ingress) = crossbeam_channel::bounded::< + agave_votor::datagram_transport::endpoint::Datagram, + >(1); + let banlist = Arc::new(solana_net_utils::banlist::Banlist::::default()); + (egress, ingress, banlist, None) + }; + let rpc_override_health_check = Arc::new(AtomicBool::new(config.rpc_config.disable_health_check)); let ( @@ -1647,10 +1685,11 @@ impl Validator { bank_forks_controller_receiver, votor_event_sender: votor_event_sender.clone(), votor_event_receiver, - cancel: cancel.clone(), - staked_nodes: 
staked_nodes.clone(), key_notifiers: key_notifiers.clone(), - bls_connection_cache, + votor_egress, + votor_ingress, + votor_banlist, + votor_allowlist, voting_service_test_override: config.voting_service_test_override.clone(), highest_finalized, }, diff --git a/local-cluster/Cargo.toml b/local-cluster/Cargo.toml index cf9a9ab1c49..f5dea0d238a 100644 --- a/local-cluster/Cargo.toml +++ b/local-cluster/Cargo.toml @@ -23,7 +23,7 @@ dev-context-only-utils = [ agave-feature-set = { workspace = true } agave-logger = { workspace = true } agave-snapshots = { workspace = true } -agave-votor = { workspace = true } +agave-votor = { workspace = true, features = ["dev-context-only-utils"] } agave-votor-messages = { workspace = true } crossbeam-channel = { workspace = true } itertools = { workspace = true } @@ -81,7 +81,9 @@ solana-vote-program = { workspace = true } static_assertions = { workspace = true } strum = { workspace = true, features = ["derive"] } tempfile = { workspace = true } +tokio = { workspace = true, features = ["sync", "rt", "rt-multi-thread"] } tokio-util = { workspace = true } +wincode = { workspace = true, features = ["alloc"] } [dev-dependencies] assert_matches = { workspace = true } diff --git a/local-cluster/src/cluster_tests.rs b/local-cluster/src/cluster_tests.rs index 89c6cab12ba..5554f5b0de9 100644 --- a/local-cluster/src/cluster_tests.rs +++ b/local-cluster/src/cluster_tests.rs @@ -5,6 +5,10 @@ use log::*; use { crate::{cluster::QuicTpuClient, local_cluster::LocalCluster}, + agave_votor::datagram_transport::{ + allowlist::AllowAll, + endpoint::{Datagram, QuicDatagramEndpoint}, + }, agave_votor_messages::consensus_message::ConsensusMessage, crossbeam_channel::bounded, rand::{Rng, rng}, @@ -26,16 +30,11 @@ use { solana_hash::Hash, solana_keypair::Keypair, solana_ledger::blockstore::Blockstore, - solana_net_utils::SocketAddrSpace, + solana_net_utils::{SocketAddrSpace, banlist::Banlist}, solana_poh_config::PohConfig, solana_pubkey::Pubkey, 
solana_rpc_client::rpc_client::RpcClient, solana_signer::Signer, - solana_streamer::{ - nonblocking::simple_qos::SimpleQosConfig, - quic::{QuicStreamerConfig, spawn_simple_qos_server}, - streamer::StakedNodes, - }, solana_system_transaction as system_transaction, solana_time_utils::timestamp, solana_tpu_client::tpu_client::{TpuClient, TpuClientConfig, TpuSenderError}, @@ -55,7 +54,6 @@ use { thread::{JoinHandle, sleep}, time::{Duration, Instant}, }, - tokio_util::sync::CancellationToken, }; #[cfg(feature = "dev-context-only-utils")] use { @@ -435,41 +433,44 @@ pub fn check_for_new_processed( ); } -/// Start a QUIC streamer to listen for votes and certificates. -/// Returns a cancellation token, the server thread handle, and a receiver for packet batches. -pub fn start_quic_streamer_to_listen_for_votes_and_certs( +/// Spawn a votor datagram endpoint to sniff vote / cert +/// traffic for a local-cluster test. Validators in the cluster dial +/// this listener as an `AdditionalListener`; the lex-pubkey rule +/// requires `listener_keypair.pubkey() > max(validator_pubkeys)` and +/// the validators' admission must include `listener_keypair.pubkey()` +/// — both ensured by the caller (the test). +/// +/// Returns the endpoint (kept alive by the caller so its task does +/// not get dropped), the ingress receiver (one item per received +/// datagram), and the tokio runtime the endpoint was created on; the +/// runtime must outlive the endpoint. 
+pub fn start_datagram_listener_for_votes_and_certs( vote_listener_socket: UdpSocket, - validator_keys: &[Arc], - node_stakes: &[u64], + listener_keypair: Keypair, ) -> ( - CancellationToken, - JoinHandle<()>, - crossbeam_channel::Receiver, + QuicDatagramEndpoint, + crossbeam_channel::Receiver, + tokio::runtime::Runtime, ) { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .worker_threads(2) + .thread_name("solAlpenglowListen") + .build() + .expect("tokio runtime"); let (sender, receiver) = bounded(1024); - let cancel = CancellationToken::new(); - let stakes = validator_keys - .iter() - .zip(node_stakes) - .map(|(keypair, stake)| (keypair.pubkey(), *stake)) - .collect(); - let staked_nodes: Arc> = Arc::new(RwLock::new(StakedNodes::new( - Arc::new(stakes), - HashMap::::default(), // overrides - ))); - let (result, _banlist) = spawn_simple_qos_server( - "solAlpenglowTest", - "alpenglow_local_cluster_test", - [vote_listener_socket.into()], - &Keypair::new(), + let banlist = Arc::new(Banlist::::default()); + let endpoint = QuicDatagramEndpoint::new( + rt.handle(), + &listener_keypair, + vote_listener_socket, + agave_votor::datagram_endpoint::ALPENGLOW_ALPN, sender, - staked_nodes, - QuicStreamerConfig::default(), - SimpleQosConfig::default(), - cancel.clone(), + Arc::new(AllowAll), + banlist, ) - .unwrap(); - (cancel, result.thread, receiver) + .expect("alpenglow datagram listener"); + (endpoint, receiver, rt) } /// Check that all nodes in the cluster are producing notarized votes. 
@@ -478,8 +479,7 @@ pub fn check_for_new_notarized_votes( contact_infos: &[ContactInfo], test_name: &str, vote_listener_socket: UdpSocket, - validator_keys: &[Arc], - node_stakes: &[u64], + listener_keypair: Keypair, ) { let loop_start = Instant::now(); let loop_timeout = Duration::from_secs(180); @@ -500,11 +500,8 @@ pub fn check_for_new_notarized_votes( let contact_infos_owned: Vec = contact_infos.to_vec(); let test_name_owned = test_name.to_string(); - let (cancel, quic_server_thread, receiver) = start_quic_streamer_to_listen_for_votes_and_certs( - vote_listener_socket, - validator_keys, - node_stakes, - ); + let (_endpoint, receiver, _rt) = + start_datagram_listener_for_votes_and_certs(vote_listener_socket, listener_keypair); // Now start vote listener and wait for new notarized votes. let vote_listener = std::thread::spawn({ @@ -519,52 +516,45 @@ pub fn check_for_new_notarized_votes( move || { while !done { assert!(loop_start.elapsed() < loop_timeout); - let Ok(packet_batch) = receiver.recv_timeout(Duration::from_millis(100)) else { + let Ok(dg) = receiver.recv_timeout(Duration::from_millis(100)) else { continue; }; - for packet in packet_batch.iter() { - let Ok(ConsensusMessage::Vote(vote_message)) = packet.deserialize_slice(..) - else { - continue; - }; - let vote = vote_message.vote; - if !vote.is_notarization() { - continue; - } - let rank = vote_message.rank; - if rank >= contact_infos_owned.len() as u16 { - warn!( - "Received vote with rank {} which is greater than number of nodes {}", - rank, - contact_infos_owned.len() - ); - continue; - } - let slot = vote.slot(); - if slot <= last_notarized[rank as usize] { - continue; - } - last_notarized[rank as usize] = slot; - num_new_notarized_votes[rank as usize] += 1; - done = num_new_notarized_votes.iter().all(|&x| x > num_new_votes); - if done || last_print.elapsed().as_secs() > 3 { - info!( - "{test_name_owned} waiting for {num_new_votes} new notarized votes.. 
\ - observed: {num_new_notarized_votes:?}" - ); - last_print = Instant::now(); - } + let Ok(ConsensusMessage::Vote(vote_message)) = + wincode::deserialize::(&dg.message) + else { + continue; + }; + let vote = vote_message.vote; + if !vote.is_notarization() { + continue; + } + let rank = vote_message.rank; + if rank >= contact_infos_owned.len() as u16 { + warn!( + "Received vote with rank {} which is greater than number of nodes {}", + rank, + contact_infos_owned.len() + ); + continue; + } + let slot = vote.slot(); + if slot <= last_notarized[rank as usize] { + continue; } - if done { - cancel.cancel(); + last_notarized[rank as usize] = slot; + num_new_notarized_votes[rank as usize] += 1; + done = num_new_notarized_votes.iter().all(|&x| x > num_new_votes); + if done || last_print.elapsed().as_secs() > 3 { + info!( + "{test_name_owned} waiting for {num_new_votes} new notarized votes.. \ + observed: {num_new_notarized_votes:?}" + ); + last_print = Instant::now(); } } } }); vote_listener.join().expect("Vote listener thread panicked"); - quic_server_thread - .join() - .expect("QUIC server thread panicked"); } pub fn check_no_new_roots( diff --git a/local-cluster/src/local_cluster.rs b/local-cluster/src/local_cluster.rs index e0bf90f1628..195f13970c4 100644 --- a/local-cluster/src/local_cluster.rs +++ b/local-cluster/src/local_cluster.rs @@ -911,9 +911,8 @@ impl LocalCluster { num_new_notarized_votes: usize, test_name: &str, socket_addr_space: SocketAddrSpace, - vote_listener_addr: std::net::UdpSocket, - validator_keys: &[Arc], - node_stakes: &[u64], + vote_listener_socket: std::net::UdpSocket, + listener_keypair: Keypair, ) { let alive_node_contact_infos = self.discover_nodes(socket_addr_space, test_name); info!("{test_name} looking for new notarized votes on all nodes"); @@ -921,9 +920,8 @@ impl LocalCluster { num_new_notarized_votes, &alive_node_contact_infos, test_name, - vote_listener_addr, - validator_keys, - node_stakes, + vote_listener_socket, + listener_keypair, 
); info!("{test_name} done waiting for notarized votes"); } diff --git a/local-cluster/tests/local_cluster.rs b/local-cluster/tests/local_cluster.rs index 2baac35dc37..b7145729f35 100644 --- a/local-cluster/tests/local_cluster.rs +++ b/local-cluster/tests/local_cluster.rs @@ -5961,6 +5961,21 @@ fn test_restart_node_alpenglow() { /// We start 2 nodes, where the first node A holds 90% of the stake. /// We let A run by itself, and ensure that B can join and rejoin the network /// through repair. +/// Generate a fresh keypair whose pubkey is strictly greater than every +/// pubkey in `lower_bounds`. Alpenglow tests that attach a sniffer +/// endpoint via `AdditionalListener` need this so the lex-pubkey rule +/// routes every validator's dial of the sniffer correctly (lower +/// dials, higher listens). +fn keypair_above_all(lower_bounds: &[Pubkey]) -> Keypair { + let upper = lower_bounds.iter().max().copied().unwrap_or_default(); + loop { + let k = Keypair::new(); + if k.pubkey() > upper { + return k; + } + } +} + /// Verifies that a low-stake Alpenglow validator can fall behind, rejoin through repair, /// and resume notarized voting in an imbalanced-stake network. #[test] @@ -5986,6 +6001,17 @@ fn test_alpenglow_imbalanced_stakes_catchup() { leader_schedule: Arc::new(leader_schedule), }; + // Collect node pubkeys early so we can derive a listener identity + // whose pubkey is strictly greater than every validator's — the + // lex-pubkey rule then routes every validator's dial of the + // listener correctly (validator < listener). 
+ let node_pubkeys = validator_keys + .iter() + .map(|key| key.node_keypair.pubkey()) + .collect::>(); + let listener_keypair = keypair_above_all(&node_pubkeys); + let listener_pubkey = listener_keypair.pubkey(); + // Create our UDP socket to listen to votes let vote_listener_addr = bind_to_localhost_unique().unwrap(); @@ -5993,19 +6019,13 @@ fn test_alpenglow_imbalanced_stakes_catchup() { validator_config.fixed_leader_schedule = Some(leader_schedule); validator_config.voting_service_test_override = Some(VotingServiceOverride { additional_listeners: vec![AdditionalListener { - pubkey: Pubkey::new_unique(), + pubkey: listener_pubkey, addr: vote_listener_addr.local_addr().unwrap(), }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); - // Collect node pubkeys - let node_pubkeys = validator_keys - .iter() - .map(|key| key.node_keypair.pubkey()) - .collect::>(); - // Cluster config let mut cluster_config = ClusterConfig { mint_lamports: DEFAULT_MINT_LAMPORTS + node_stakes.iter().sum::(), @@ -6049,18 +6069,12 @@ fn test_alpenglow_imbalanced_stakes_catchup() { info!("restarting node B"); cluster.restart_node(&node_pubkeys[1], b_info, SocketAddrSpace::Unspecified); - // Ensure all nodes are voting - let validator_node_keypairs: Vec<_> = validator_keys - .iter() - .map(|k| k.node_keypair.clone()) - .collect(); cluster.check_for_new_notarized_votes( 16, "test_alpenglow_imbalanced_stakes_catchup", SocketAddrSpace::Unspecified, vote_listener_addr, - &validator_node_keypairs, - &node_stakes, + listener_keypair, ); } @@ -6178,20 +6192,26 @@ fn test_alpenglow_migration( ) { agave_logger::setup_with_default(AG_DEBUG_LOG_FILTER); + let (leader_schedule, keys) = create_custom_leader_schedule_with_random_keys(leader_schedule); + + // Listener pubkey must exceed every validator's pubkey so the + // lex-pubkey rule routes each validator's dial to the listener + // correctly. 
+ let validator_pubkeys: Vec = keys.iter().map(|k| k.node_keypair.pubkey()).collect(); + let listener_keypair = keypair_above_all(&validator_pubkeys); + let vote_listener_socket = bind_to_localhost_unique().unwrap(); let vote_listener_addr = vote_listener_socket.try_clone().unwrap(); let mut validator_config = ValidatorConfig::default_for_test(); validator_config.voting_service_test_override = Some(VotingServiceOverride { additional_listeners: vec![AdditionalListener { - pubkey: Pubkey::new_unique(), + pubkey: listener_keypair.pubkey(), addr: vote_listener_addr.local_addr().unwrap(), }], alpenglow_port_override: AlpenglowPortOverride::default(), }); validator_config.wait_for_supermajority = Some(0); - let (leader_schedule, keys) = create_custom_leader_schedule_with_random_keys(leader_schedule); - validator_config.fixed_leader_schedule = Some(FixedSchedule { leader_schedule: Arc::new(leader_schedule), }); @@ -6223,12 +6243,8 @@ fn test_alpenglow_migration( // Create local cluster with alpenglow accounts but feature not activated let cluster = LocalCluster::new(&mut cluster_config, SocketAddrSpace::Unspecified); - let validator_keys: Vec> = cluster - .validators - .values() - .map(|v| v.info.keypair.clone()) - .collect(); - + // Send feature activation transaction + info!("Sending feature activation transaction"); let client = RpcClient::new_socket_with_commitment( cluster.entry_point_info.rpc().unwrap(), CommitmentConfig::processed(), @@ -6269,8 +6285,7 @@ fn test_alpenglow_migration( test_name, SocketAddrSpace::Unspecified, vote_listener_addr, - &validator_keys, - &node_stakes, + listener_keypair, ); // Additionally ensure that roots are being made diff --git a/validator/src/admin_rpc_service.rs b/validator/src/admin_rpc_service.rs index 90b1d334966..aaa804aba63 100644 --- a/validator/src/admin_rpc_service.rs +++ b/validator/src/admin_rpc_service.rs @@ -1372,8 +1372,7 @@ mod tests { KeyUpdaterType::TpuVote, KeyUpdaterType::Forward, KeyUpdaterType::RpcService, - 
KeyUpdaterType::Bls, - KeyUpdaterType::BlsConnectionCache, + KeyUpdaterType::VotorDatagram, ]) ); let mut io = MetaIoHandler::default();