From 3648483512044997d97b8020b2a2d2790e081d02 Mon Sep 17 00:00:00 2001 From: joshieDo <93316087+joshieDo@users.noreply.github.com> Date: Fri, 23 Jan 2026 19:59:10 +0000 Subject: [PATCH] feat(rocksdb): add WAL size tracking metric and Grafana dashboard (#21295) Co-authored-by: Amp --- crates/cli/commands/src/db/stats.rs | 10 +++ crates/node/metrics/src/server.rs | 5 ++ crates/storage/provider/src/providers/mod.rs | 3 +- .../provider/src/providers/rocksdb/mod.rs | 3 +- .../src/providers/rocksdb/provider.rs | 62 +++++++++++++++++++ .../provider/src/providers/rocksdb_stub.rs | 24 +++++++ etc/grafana/dashboards/overview.json | 38 ++++++++++-- 7 files changed, 139 insertions(+), 6 deletions(-) diff --git a/crates/cli/commands/src/db/stats.rs b/crates/cli/commands/src/db/stats.rs index 0b73727a60..62c8af1f40 100644 --- a/crates/cli/commands/src/db/stats.rs +++ b/crates/cli/commands/src/db/stats.rs @@ -205,6 +205,16 @@ impl Command { .add_cell(Cell::new(human_bytes(total_size as f64))) .add_cell(Cell::new(human_bytes(total_pending as f64))); table.add_row(row); + + let wal_size = tool.provider_factory.rocksdb_provider().wal_size_bytes(); + let mut row = Row::new(); + row.add_cell(Cell::new("WAL")) + .add_cell(Cell::new("")) + .add_cell(Cell::new("")) + .add_cell(Cell::new("")) + .add_cell(Cell::new(human_bytes(wal_size as f64))) + .add_cell(Cell::new("")); + table.add_row(row); } table diff --git a/crates/node/metrics/src/server.rs b/crates/node/metrics/src/server.rs index ea24e6572e..9ef68cf303 100644 --- a/crates/node/metrics/src/server.rs +++ b/crates/node/metrics/src/server.rs @@ -257,6 +257,11 @@ fn describe_rocksdb_metrics() { Unit::Bytes, "The size of memtables for a RocksDB table" ); + describe_gauge!( + "rocksdb.wal_size", + Unit::Bytes, + "The total size of WAL (Write-Ahead Log) files. 
Important: this is not included in table_size or sst_size metrics" + ); } #[cfg(all(feature = "jemalloc", unix))] diff --git a/crates/storage/provider/src/providers/mod.rs b/crates/storage/provider/src/providers/mod.rs index c477ccbb98..91aff23fe9 100644 --- a/crates/storage/provider/src/providers/mod.rs +++ b/crates/storage/provider/src/providers/mod.rs @@ -39,7 +39,8 @@ pub use consistent::ConsistentProvider; pub(crate) mod rocksdb; pub use rocksdb::{ - RocksDBBatch, RocksDBBuilder, RocksDBProvider, RocksDBRawIter, RocksDBTableStats, RocksTx, + RocksDBBatch, RocksDBBuilder, RocksDBProvider, RocksDBRawIter, RocksDBStats, RocksDBTableStats, + RocksTx, }; /// Helper trait to bound [`NodeTypes`] so that combined with database they satisfy diff --git a/crates/storage/provider/src/providers/rocksdb/mod.rs b/crates/storage/provider/src/providers/rocksdb/mod.rs index efab03e2af..219a3ebfbe 100644 --- a/crates/storage/provider/src/providers/rocksdb/mod.rs +++ b/crates/storage/provider/src/providers/rocksdb/mod.rs @@ -6,5 +6,6 @@ mod provider; pub(crate) use provider::{PendingRocksDBBatches, RocksDBWriteCtx}; pub use provider::{ - RocksDBBatch, RocksDBBuilder, RocksDBProvider, RocksDBRawIter, RocksDBTableStats, RocksTx, + RocksDBBatch, RocksDBBuilder, RocksDBProvider, RocksDBRawIter, RocksDBStats, RocksDBTableStats, + RocksTx, }; diff --git a/crates/storage/provider/src/providers/rocksdb/provider.rs b/crates/storage/provider/src/providers/rocksdb/provider.rs index 0cc85f43c4..06e5837a91 100644 --- a/crates/storage/provider/src/providers/rocksdb/provider.rs +++ b/crates/storage/provider/src/providers/rocksdb/provider.rs @@ -57,6 +57,19 @@ pub struct RocksDBTableStats { pub pending_compaction_bytes: u64, } +/// Database-level statistics for `RocksDB`. +/// +/// Contains both per-table statistics and DB-level metrics like WAL size. +#[derive(Debug, Clone)] +pub struct RocksDBStats { + /// Statistics for each table (column family). 
+ pub tables: Vec<RocksDBTableStats>, + /// Total size of WAL (Write-Ahead Log) files in bytes. + /// + /// WAL is shared across all tables and not included in per-table metrics. + pub wal_size_bytes: u64, +} + /// Context for `RocksDB` block writes. #[derive(Clone)] pub(crate) struct RocksDBWriteCtx { @@ -457,6 +470,31 @@ impl RocksDBProviderInner { } } + /// Returns the path to the database directory. + fn path(&self) -> &Path { + match self { + Self::ReadWrite { db, .. } => db.path(), + Self::ReadOnly { db, .. } => db.path(), + } + } + + /// Returns the total size of WAL (Write-Ahead Log) files in bytes. + /// + /// WAL files have a `.log` extension in the `RocksDB` directory. + fn wal_size_bytes(&self) -> u64 { + let path = self.path(); + + match std::fs::read_dir(path) { + Ok(entries) => entries + .filter_map(|e| e.ok()) + .filter(|e| e.path().extension().is_some_and(|ext| ext == "log")) + .filter_map(|e| e.metadata().ok()) + .map(|m| m.len()) + .sum(), + Err(_) => 0, + } + } + /// Returns statistics for all column families in the database. fn table_stats(&self) -> Vec<RocksDBTableStats> { let mut stats = Vec::new(); @@ -515,6 +553,11 @@ impl RocksDBProviderInner { stats } + + /// Returns database-level statistics including per-table stats and WAL size. + fn db_stats(&self) -> RocksDBStats { + RocksDBStats { tables: self.table_stats(), wal_size_bytes: self.wal_size_bytes() } + } } impl fmt::Debug for RocksDBProviderInner { @@ -595,6 +638,9 @@ impl DatabaseMetrics for RocksDBProvider { )); } + // WAL size (DB-level, shared across all tables) + metrics.push(("rocksdb.wal_size", self.wal_size_bytes() as f64, vec![])); + metrics } } @@ -838,6 +884,22 @@ impl RocksDBProvider { self.0.table_stats() } + /// Returns the total size of WAL (Write-Ahead Log) files in bytes. + /// + /// This scans the `RocksDB` directory for `.log` files and sums their sizes. + /// WAL files can be significant (e.g., 2.7GB observed) and are not included + /// in `table_size`, `sst_size`, or `memtable_size` metrics.
+ pub fn wal_size_bytes(&self) -> u64 { + self.0.wal_size_bytes() + } + + /// Returns database-level statistics including per-table stats and WAL size. + /// + /// This combines [`Self::table_stats`] and [`Self::wal_size_bytes`] into a single struct. + pub fn db_stats(&self) -> RocksDBStats { + self.0.db_stats() + } + /// Flushes pending writes for the specified tables to disk. /// /// This performs a flush of: diff --git a/crates/storage/provider/src/providers/rocksdb_stub.rs b/crates/storage/provider/src/providers/rocksdb_stub.rs index 31c38103e3..822bafd7e8 100644 --- a/crates/storage/provider/src/providers/rocksdb_stub.rs +++ b/crates/storage/provider/src/providers/rocksdb_stub.rs @@ -32,6 +32,15 @@ pub struct RocksDBTableStats { pub pending_compaction_bytes: u64, } +/// Database-level statistics for `RocksDB` - stub. +#[derive(Debug, Clone)] +pub struct RocksDBStats { + /// Statistics for each table (column family). + pub tables: Vec<RocksDBTableStats>, + /// Total size of WAL (Write-Ahead Log) files in bytes. + pub wal_size_bytes: u64, +} + /// Context for `RocksDB` block writes (stub). #[derive(Debug, Clone)] #[allow(dead_code)] @@ -89,6 +98,21 @@ impl RocksDBProvider { Ok(()) } + /// Returns the total size of WAL (Write-Ahead Log) files in bytes (stub implementation). + /// + /// Returns 0 since there is no `RocksDB` when the feature is disabled. + pub const fn wal_size_bytes(&self) -> u64 { + 0 + } + + /// Returns database-level statistics including per-table stats and WAL size (stub + /// implementation). + /// + /// Returns empty stats since there is no `RocksDB` when the feature is disabled. + pub const fn db_stats(&self) -> RocksDBStats { + RocksDBStats { tables: Vec::new(), wal_size_bytes: 0 } + } + /// Flushes all pending writes to disk (stub implementation). /// /// This is a no-op since there is no `RocksDB` when the feature is disabled.
diff --git a/etc/grafana/dashboards/overview.json b/etc/grafana/dashboards/overview.json index b81d90c622..62dfcab965 100644 --- a/etc/grafana/dashboards/overview.json +++ b/etc/grafana/dashboards/overview.json @@ -828,7 +828,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(reth_rocksdb_table_size{$instance_label=\"$instance\"}) or vector(0)", + "expr": "(sum(reth_rocksdb_table_size{$instance_label=\"$instance\"}) or vector(0)) + (sum(reth_rocksdb_wal_size{$instance_label=\"$instance\"}) or vector(0))", "hide": false, "instant": false, "legendFormat": "RocksDB", @@ -841,7 +841,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum(reth_db_table_size{$instance_label=\"$instance\"}) + sum(reth_db_freelist{$instance_label=\"$instance\"} * reth_db_page_size{$instance_label=\"$instance\"}) + sum(reth_static_files_segment_size{$instance_label=\"$instance\"}) + (sum(reth_rocksdb_table_size{$instance_label=\"$instance\"}) or vector(0))", + "expr": "sum(reth_db_table_size{$instance_label=\"$instance\"}) + sum(reth_db_freelist{$instance_label=\"$instance\"} * reth_db_page_size{$instance_label=\"$instance\"}) + sum(reth_static_files_segment_size{$instance_label=\"$instance\"}) + (sum(reth_rocksdb_table_size{$instance_label=\"$instance\"}) or vector(0)) + (sum(reth_rocksdb_wal_size{$instance_label=\"$instance\"}) or vector(0))", "hide": false, "instant": false, "legendFormat": "Total", @@ -6771,6 +6771,17 @@ "legendFormat": "{{table}}", "range": true, "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "reth_rocksdb_wal_size{$instance_label=\"$instance\"}", + "legendFormat": "WAL", + "range": true, + "refId": "B" } ], "title": "RocksDB Tables Size", @@ -7091,7 +7102,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (job) ( reth_rocksdb_table_size{$instance_label=\"$instance\"} )", + "expr": "sum by (job) ( 
reth_rocksdb_table_size{$instance_label=\"$instance\"} ) + (sum by (job) ( reth_rocksdb_wal_size{$instance_label=\"$instance\"} ) or vector(0))", "legendFormat": "__auto", "range": true, "refId": "A" @@ -12441,6 +12452,18 @@ "legendFormat": "__auto", "range": true, "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "label_replace(reth_rocksdb_wal_size{$instance_label=\"$instance\"}, \"table\", \"WAL\", \"\", \"\")", + "format": "table", + "legendFormat": "__auto", + "range": true, + "refId": "C" } ], "transformations": [ @@ -12464,6 +12487,12 @@ ], "operation": "aggregate" }, + "Value #C": { + "aggregations": [ + "lastNotNull" + ], + "operation": "aggregate" + }, "table": { "aggregations": [], "operation": "groupby" @@ -12489,7 +12518,8 @@ "renameByName": { "table": "Table", "Value #A (lastNotNull)": "SST Size", - "Value #B (lastNotNull)": "Memtable Size" + "Value #B (lastNotNull)": "Memtable Size", + "Value #C (lastNotNull)": "WAL Size" } } }