diff --git a/.changelog/dry-ducks-write.md b/.changelog/dry-ducks-write.md new file mode 100644 index 0000000000..de44e1b6ef --- /dev/null +++ b/.changelog/dry-ducks-write.md @@ -0,0 +1,5 @@ +--- +reth-network: minor +--- + +Added reason label to backed_off_peers metric. The metric now tracks backed off peers by reason (too_many_peers, graceful_close, connection_error) to improve observability. diff --git a/crates/net/network/src/manager.rs b/crates/net/network/src/manager.rs index 7387a5b64a..6bfbb9d3e1 100644 --- a/crates/net/network/src/manager.rs +++ b/crates/net/network/src/manager.rs @@ -25,11 +25,11 @@ use crate::{ listener::ConnectionListener, message::{NewBlockMessage, PeerMessage}, metrics::{ - ClosedSessionsMetrics, DisconnectMetrics, NetworkMetrics, PendingSessionFailureMetrics, - NETWORK_POOL_TRANSACTIONS_SCOPE, + BackedOffPeersMetrics, ClosedSessionsMetrics, DisconnectMetrics, NetworkMetrics, + PendingSessionFailureMetrics, NETWORK_POOL_TRANSACTIONS_SCOPE, }, network::{NetworkHandle, NetworkHandleMessage}, - peers::PeersManager, + peers::{BackoffReason, PeersManager}, poll_nested_stream_with_budget, protocol::IntoRlpxSubProtocol, required_block_filter::RequiredBlockFilter, @@ -146,6 +146,8 @@ pub struct NetworkManager { closed_sessions_metrics: ClosedSessionsMetrics, /// Pending session failure metrics, split by direction. pending_session_failure_metrics: PendingSessionFailureMetrics, + /// Backed off peers metrics, split by reason. 
+ backed_off_peers_metrics: BackedOffPeersMetrics, } impl NetworkManager { @@ -363,6 +365,7 @@ impl NetworkManager { disconnect_metrics: Default::default(), closed_sessions_metrics: Default::default(), pending_session_failure_metrics: Default::default(), + backed_off_peers_metrics: Default::default(), }) } @@ -869,10 +872,15 @@ impl NetworkManager { &peer_id, err, ); + self.backed_off_peers_metrics.increment_for_reason( + BackoffReason::from_disconnect(err.as_disconnected()), + ); err.as_disconnected() } else { // Gracefully disconnected self.swarm.state_mut().peers_mut().on_active_session_gracefully_closed(peer_id); + self.backed_off_peers_metrics + .increment_for_reason(BackoffReason::GracefulClose); None }; self.closed_sessions_metrics.active.increment(1); @@ -914,9 +922,6 @@ impl NetworkManager { self.metrics .incoming_connections .set(self.swarm.state().peers().num_inbound_connections() as f64); - self.metrics - .backed_off_peers - .set(self.swarm.state().peers().num_backed_off_peers() as f64); } SwarmEvent::OutgoingPendingSessionClosed { remote_addr, peer_id, error } => { trace!( @@ -934,6 +939,9 @@ impl NetworkManager { err, ); self.pending_session_failure_metrics.outbound.increment(1); + self.backed_off_peers_metrics.increment_for_reason( + BackoffReason::from_disconnect(err.as_disconnected()), + ); if let Some(reason) = err.as_disconnected() { self.disconnect_metrics.increment(reason); } @@ -945,7 +953,6 @@ impl NetworkManager { } self.closed_sessions_metrics.outgoing_pending.increment(1); self.update_pending_connection_metrics(); - self.metrics .backed_off_peers .set(self.swarm.state().peers().num_backed_off_peers() as f64); @@ -965,6 +972,7 @@ impl NetworkManager { &error, ); + self.backed_off_peers_metrics.increment_for_reason(BackoffReason::ConnectionError); self.metrics .backed_off_peers .set(self.swarm.state().peers().num_backed_off_peers() as f64); diff --git a/crates/net/network/src/metrics.rs b/crates/net/network/src/metrics.rs index 
1a15866346..71fb1d5d08 100644 --- a/crates/net/network/src/metrics.rs +++ b/crates/net/network/src/metrics.rs @@ -2,7 +2,7 @@ use metrics::Histogram; use reth_eth_wire::DisconnectReason; use reth_ethereum_primitives::TxType; use reth_metrics::{ - metrics::{self, Counter, Gauge}, + metrics::{Counter, Gauge}, Metrics, }; @@ -110,6 +110,29 @@ impl Default for PendingSessionFailureMetrics { } } +/// Metrics for backed off peers, split by reason. +#[derive(Metrics)] +#[metrics(scope = "network.backed_off_peers")] +pub struct BackedOffPeersMetrics { + /// Peers backed off because they reported too many peers. + pub too_many_peers: Counter, + /// Peers backed off after a graceful session close. + pub graceful_close: Counter, + /// Peers backed off due to connection or protocol errors. + pub connection_error: Counter, +} + +impl BackedOffPeersMetrics { + /// Increments the counter for the given backoff reason. + pub fn increment_for_reason(&self, reason: crate::peers::BackoffReason) { + match reason { + crate::peers::BackoffReason::TooManyPeers => self.too_many_peers.increment(1), + crate::peers::BackoffReason::GracefulClose => self.graceful_close.increment(1), + crate::peers::BackoffReason::ConnectionError => self.connection_error.increment(1), + } + } +} + /// Metrics for `SessionManager` #[derive(Metrics)] #[metrics(scope = "network")] diff --git a/crates/net/network/src/peers.rs b/crates/net/network/src/peers.rs index 801c4795c7..049f15e907 100644 --- a/crates/net/network/src/peers.rs +++ b/crates/net/network/src/peers.rs @@ -1260,6 +1260,27 @@ impl Display for InboundConnectionError { } } +/// The reason a peer was backed off. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackoffReason { + /// The remote peer responded with `TooManyPeers` (0x04). + TooManyPeers, + /// The session was gracefully closed and we're backing off briefly. + GracefulClose, + /// A connection or protocol-level error occurred. 
+ ConnectionError, +} + +impl BackoffReason { + /// Derives the backoff reason from an optional [`DisconnectReason`]. + pub const fn from_disconnect(reason: Option<DisconnectReason>) -> Self { + match reason { + Some(DisconnectReason::TooManyPeers) => Self::TooManyPeers, + _ => Self::ConnectionError, + } + } +} + #[cfg(test)] mod tests { use alloy_primitives::B512;