diff --git a/bin/reth/src/db/diff.rs b/bin/reth/src/db/diff.rs
new file mode 100644
index 0000000000..b7527b0ab1
--- /dev/null
+++ b/bin/reth/src/db/diff.rs
@@ -0,0 +1,410 @@
+use std::{
+    collections::HashMap,
+    fmt::Debug,
+    fs::{self, File},
+    hash::Hash,
+    io::Write,
+    path::{Path, PathBuf},
+};
+
+use crate::{
+    args::DatabaseArgs,
+    dirs::{DataDirPath, PlatformPath},
+    utils::DbTool,
+};
+use clap::Parser;
+
+use reth_db::{
+    cursor::DbCursorRO, database::Database, open_db_read_only, table::Table, transaction::DbTx,
+    AccountChangeSet, AccountHistory, AccountsTrie, BlockBodyIndices, BlockOmmers,
+    BlockWithdrawals, Bytecodes, CanonicalHeaders, DatabaseEnvRO, HashedAccount, HashedStorage,
+    HeaderNumbers, HeaderTD, Headers, PlainAccountState, PlainStorageState, PruneCheckpoints,
+    Receipts, StorageChangeSet, StorageHistory, StoragesTrie, SyncStage, SyncStageProgress, Tables,
+    TransactionBlock, Transactions, TxHashNumber, TxSenders,
+};
+use tracing::info;
+
+#[derive(Parser, Debug)]
+/// The arguments for the `reth db diff` command
+pub struct Command {
+    /// The path to the data dir for all reth files and subdirectories.
+    #[arg(long, verbatim_doc_comment)]
+    secondary_datadir: PlatformPath<DataDirPath>,
+
+    /// Arguments for the second database
+    #[clap(flatten)]
+    second_db: DatabaseArgs,
+
+    /// The table name to diff. If not specified, all tables are diffed.
+    #[arg(long, verbatim_doc_comment)]
+    table: Option<Tables>,
+
+    /// The output directory for the diff report.
+    #[arg(long, verbatim_doc_comment)]
+    output: PlatformPath,
+}
+
+impl Command {
+    /// Execute the `db diff` command.
+    ///
+    /// This first opens the `db/` folder from the secondary datadir, where the second database is
+    /// opened read-only.
+    ///
+    /// The tool will then iterate through all key-value pairs for the primary and secondary
+    /// databases. The value for each key will be compared with its corresponding value in the
+    /// other database. If the values are different, a discrepancy will be recorded in-memory. If
+    /// one key is present in one database but not the other, this will be recorded as an "extra
+    /// element" for that database.
+    ///
+    /// The discrepancies and extra elements, along with a brief summary of the diff results, are
+    /// then written to a file in the output directory.
+    pub fn execute(self, tool: &DbTool<'_, DatabaseEnvRO>) -> eyre::Result<()> {
+        // open second db
+        let second_db_path: PathBuf = self.secondary_datadir.join("db").into();
+        let second_db = open_db_read_only(&second_db_path, self.second_db.log_level)?;
+
+        let tables = match self.table {
+            Some(table) => vec![table],
+            None => Tables::ALL.to_vec(),
+        };
+
+        for table in tables {
+            let primary_tx = tool.db.tx()?;
+            let secondary_tx = second_db.tx()?;
+
+            let output_dir = self.output.clone();
+            match table {
+                Tables::CanonicalHeaders => {
+                    find_diffs::<CanonicalHeaders>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::HeaderTD => find_diffs::<HeaderTD>(primary_tx, secondary_tx, output_dir)?,
+                Tables::HeaderNumbers => {
+                    find_diffs::<HeaderNumbers>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::Headers => find_diffs::<Headers>(primary_tx, secondary_tx, output_dir)?,
+                Tables::BlockBodyIndices => {
+                    find_diffs::<BlockBodyIndices>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::BlockOmmers => {
+                    find_diffs::<BlockOmmers>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::BlockWithdrawals => {
+                    find_diffs::<BlockWithdrawals>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::TransactionBlock => {
+                    find_diffs::<TransactionBlock>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::Transactions => {
+                    find_diffs::<Transactions>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::TxHashNumber => {
+                    find_diffs::<TxHashNumber>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::Receipts => find_diffs::<Receipts>(primary_tx, secondary_tx, output_dir)?,
+                Tables::PlainAccountState => {
+                    find_diffs::<PlainAccountState>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::PlainStorageState => {
+                    find_diffs::<PlainStorageState>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::Bytecodes => find_diffs::<Bytecodes>(primary_tx, secondary_tx, output_dir)?,
+                Tables::AccountHistory => {
+                    find_diffs::<AccountHistory>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::StorageHistory => {
+                    find_diffs::<StorageHistory>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::AccountChangeSet => {
+                    find_diffs::<AccountChangeSet>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::StorageChangeSet => {
+                    find_diffs::<StorageChangeSet>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::HashedAccount => {
+                    find_diffs::<HashedAccount>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::HashedStorage => {
+                    find_diffs::<HashedStorage>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::AccountsTrie => {
+                    find_diffs::<AccountsTrie>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::StoragesTrie => {
+                    find_diffs::<StoragesTrie>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::TxSenders => find_diffs::<TxSenders>(primary_tx, secondary_tx, output_dir)?,
+                Tables::SyncStage => find_diffs::<SyncStage>(primary_tx, secondary_tx, output_dir)?,
+                Tables::SyncStageProgress => {
+                    find_diffs::<SyncStageProgress>(primary_tx, secondary_tx, output_dir)?
+                }
+                Tables::PruneCheckpoints => {
+                    find_diffs::<PruneCheckpoints>(primary_tx, secondary_tx, output_dir)?
+                }
+            };
+        }
+
+        Ok(())
+    }
+}
+
+/// Find diffs for a table, then analyze the result.
+fn find_diffs<'a, T: Table>(
+    primary_tx: impl DbTx<'a>,
+    secondary_tx: impl DbTx<'a>,
+    output_dir: impl AsRef<Path>,
+) -> eyre::Result<()>
+where
+    T::Key: Hash,
+    T::Value: PartialEq,
+{
+    let table_name = T::NAME;
+
+    info!("Analyzing table {table_name}...");
+    let result = find_diffs_advanced::<T>(&primary_tx, &secondary_tx)?;
+    info!("Done analyzing table {table_name}!");
+
+    // Pretty info summary header: newline then header
+    info!("");
+    info!("Diff results for {table_name}:");
+
+    // create directory and open file
+    fs::create_dir_all(output_dir.as_ref())?;
+    let file_name = format!("{table_name}.txt");
+    let mut file = File::create(output_dir.as_ref().join(file_name.clone()))?;
+
+    // analyze the result and print some stats
+    let discrepancies = result.discrepancies.len();
+    let extra_elements = result.extra_elements.len();
+
+    // Make a pretty summary header for the table
+    writeln!(file, "Diff results for {table_name}")?;
+
+    if discrepancies > 0 {
+        // write to file
+        writeln!(file, "Found {discrepancies} discrepancies in table {table_name}")?;
+
+        // also print to info
+        info!("Found {discrepancies} discrepancies in table {table_name}");
+    } else {
+        // write to file
+        writeln!(file, "No discrepancies found in table {table_name}")?;
+
+        // also print to info
+        info!("No discrepancies found in table {table_name}");
+    }
+
+    if extra_elements > 0 {
+        // write to file
+        writeln!(file, "Found {extra_elements} extra elements in table {table_name}")?;
+
+        // also print to info
+        info!("Found {extra_elements} extra elements in table {table_name}");
+    } else {
+        writeln!(file, "No extra elements found in table {table_name}")?;
+
+        // also print to info
+        info!("No extra elements found in table {table_name}");
+    }
+
+    info!("Writing diff results for {table_name} to {file_name}...");
+
+    if discrepancies > 0 {
+        writeln!(file, "Discrepancies:")?;
+    }
+
+    for discrepancy in result.discrepancies.values() {
+        writeln!(file, "{discrepancy:?}")?;
+    }
+
+    if extra_elements > 0 {
+        writeln!(file, "Extra elements:")?;
+    }
+
+    for extra_element in result.extra_elements.values() {
+        writeln!(file, "{extra_element:?}")?;
+    }
+
+    let full_file_name = output_dir.as_ref().join(file_name);
+    info!("Done writing diff results for {table_name} to {}", full_file_name.display());
+    Ok(())
+}
+
+/// This diff algorithm is slightly different: it walks _each_ table, cross-checking for each
+/// element in the other table.
+fn find_diffs_advanced<'a, T: Table>(
+    primary_tx: &impl DbTx<'a>,
+    secondary_tx: &impl DbTx<'a>,
+) -> eyre::Result<TableDiffResult<T>>
+where
+    T::Value: PartialEq,
+    T::Key: Hash,
+{
+    // initialize the zipped walker
+    let mut primary_zip_cursor =
+        primary_tx.cursor_read::<T>().expect("Was not able to obtain a cursor.");
+    let primary_walker = primary_zip_cursor.walk(None)?;
+
+    let mut secondary_zip_cursor =
+        secondary_tx.cursor_read::<T>().expect("Was not able to obtain a cursor.");
+    let secondary_walker = secondary_zip_cursor.walk(None)?;
+    let zipped_cursor = primary_walker.zip(secondary_walker);
+
+    // initialize the cursors for seeking when we are cross checking elements
+    let mut primary_cursor =
+        primary_tx.cursor_read::<T>().expect("Was not able to obtain a cursor.");
+
+    let mut secondary_cursor =
+        secondary_tx.cursor_read::<T>().expect("Was not able to obtain a cursor.");
+
+    let mut result = TableDiffResult::<T>::default();
+
+    // This loop walks both tables at the same time, cross-checking for each element in the other
+    // table. If the keys are different, it looks each key up in the other table; if the keys are
+    // the same, it compares the values.
+    for (primary_entry, secondary_entry) in zipped_cursor {
+        let (primary_key, primary_value) = primary_entry?;
+        let (secondary_key, secondary_value) = secondary_entry?;
+
+        if primary_key != secondary_key {
+            // if the keys are different, we need to check if the key is in the other table
+            let crossed_secondary =
+                secondary_cursor.seek_exact(primary_key.clone())?.map(|(_, value)| value);
+            result.try_push_discrepancy(
+                primary_key.clone(),
+                Some(primary_value),
+                crossed_secondary,
+            );
+
+            // now do the same for the primary table
+            let crossed_primary =
+                primary_cursor.seek_exact(secondary_key.clone())?.map(|(_, value)| value);
+            result.try_push_discrepancy(
+                secondary_key.clone(),
+                crossed_primary,
+                Some(secondary_value),
+            );
+        } else {
+            // the keys are the same, so we need to compare the values
+            result.try_push_discrepancy(primary_key, Some(primary_value), Some(secondary_value));
+        }
+    }
+
+    Ok(result)
+}
+
+/// A table element that has the same key in both databases, but different values
+#[derive(Debug)]
+struct TableDiffElement<T: Table> {
+    /// The key for the element
+    key: T::Key,
+
+    /// The element from the first table
+    #[allow(dead_code)]
+    first: T::Value,
+
+    /// The element from the second table
+    #[allow(dead_code)]
+    second: T::Value,
+}
+
+/// The diff result for an entire table. If the tables had the same number of elements, there will
+/// be no extra elements.
+struct TableDiffResult<T: Table>
+where
+    T::Key: Hash,
+{
+    /// All elements of the database that are different
+    discrepancies: HashMap<T::Key, TableDiffElement<T>>,
+
+    /// Any extra elements, and the table they are in
+    extra_elements: HashMap<T::Key, ExtraTableElement<T>>,
+}
+
+impl<T> Default for TableDiffResult<T>
+where
+    T: Table,
+    T::Key: Hash,
+{
+    fn default() -> Self {
+        Self { discrepancies: HashMap::new(), extra_elements: HashMap::new() }
+    }
+}
+
+impl<T: Table> TableDiffResult<T>
+where
+    T::Key: Hash,
+{
+    /// Push a diff result into the discrepancies set.
+    fn push_discrepancy(&mut self, discrepancy: TableDiffElement<T>) {
+        self.discrepancies.insert(discrepancy.key.clone(), discrepancy);
+    }
+
+    /// Push an extra element into the extra elements set.
+    fn push_extra_element(&mut self, element: ExtraTableElement<T>) {
+        self.extra_elements.insert(element.key().clone(), element);
+    }
+}
+
+impl<T> TableDiffResult<T>
+where
+    T: Table,
+    T::Key: Hash,
+    T::Value: PartialEq,
+{
+    /// Try to push a diff result into the discrepancy set, only pushing if the given elements are
+    /// different, and the discrepancy does not exist anywhere already.
+    fn try_push_discrepancy(
+        &mut self,
+        key: T::Key,
+        first: Option<T::Value>,
+        second: Option<T::Value>,
+    ) {
+        // do not bother comparing if the key is already in the discrepancies map
+        if self.discrepancies.contains_key(&key) {
+            return
+        }
+
+        // do not bother comparing if the key is already in the extra elements map
+        if self.extra_elements.contains_key(&key) {
+            return
+        }
+
+        match (first, second) {
+            (Some(first), Some(second)) => {
+                if first != second {
+                    self.push_discrepancy(TableDiffElement { key, first, second });
+                }
+            }
+            (Some(first), None) => {
+                self.push_extra_element(ExtraTableElement::First { key, value: first });
+            }
+            (None, Some(second)) => {
+                self.push_extra_element(ExtraTableElement::Second { key, value: second });
+            }
+            (None, None) => {}
+        }
+    }
+}
+
+/// A single extra element from a table
+#[derive(Debug)]
+enum ExtraTableElement<T: Table> {
+    /// The extra element that is in the first table
+    #[allow(dead_code)]
+    First { key: T::Key, value: T::Value },
+
+    /// The extra element that is in the second table
+    #[allow(dead_code)]
+    Second { key: T::Key, value: T::Value },
+}
+
+impl<T: Table> ExtraTableElement<T> {
+    /// Return the key for the extra element
+    fn key(&self) -> &T::Key {
+        match self {
+            Self::First { key, .. } => key,
+            Self::Second { key, .. } => key,
+        }
+    }
+}
diff --git a/bin/reth/src/db/mod.rs b/bin/reth/src/db/mod.rs
index 04b51a1546..a56ff9d9a1 100644
--- a/bin/reth/src/db/mod.rs
+++ b/bin/reth/src/db/mod.rs
@@ -18,6 +18,7 @@ use reth_primitives::ChainSpec;
 use std::sync::Arc;
 
 mod clear;
+mod diff;
 mod get;
 mod list;
 /// DB List TUI
@@ -68,6 +69,8 @@ pub enum Subcommands {
     Stats,
     /// Lists the contents of a table
     List(list::Command),
+    /// Create a diff between two database tables or two entire databases.
+    Diff(diff::Command),
     /// Gets the content of a table for the given key
     Get(get::Command),
     /// Deletes all database entries
@@ -165,6 +168,11 @@ impl Command {
                 let tool = DbTool::new(&db, self.chain.clone())?;
                 command.execute(&tool)?;
             }
+            Subcommands::Diff(command) => {
+                let db = open_db_read_only(&db_path, self.db.log_level)?;
+                let tool = DbTool::new(&db, self.chain.clone())?;
+                command.execute(&tool)?;
+            }
             Subcommands::Get(command) => {
                 let db = open_db_read_only(&db_path, self.db.log_level)?;
                 let tool = DbTool::new(&db, self.chain.clone())?;
diff --git a/crates/primitives/src/prune/part.rs b/crates/primitives/src/prune/part.rs
index caa176b86a..f47ea03d1b 100644
--- a/crates/primitives/src/prune/part.rs
+++ b/crates/primitives/src/prune/part.rs
@@ -2,7 +2,7 @@ use reth_codecs::{main_codec, Compact};
 
 /// Part of the data that can be pruned.
 #[main_codec]
-#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)]
+#[derive(Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd, Hash)]
 pub enum PrunePart {
     /// Prune part responsible for the `TxSenders` table.
     SenderRecovery,
diff --git a/crates/storage/db/src/tables/models/accounts.rs b/crates/storage/db/src/tables/models/accounts.rs
index c1a50e95bc..c82b474090 100644
--- a/crates/storage/db/src/tables/models/accounts.rs
+++ b/crates/storage/db/src/tables/models/accounts.rs
@@ -64,7 +64,9 @@ impl Compact for AccountBeforeTx {
 /// [`StorageChangeSet`](crate::tables::StorageChangeSet)
 ///
 /// Since it's used as a key, it isn't compressed when encoding it.
-#[derive(Debug, Default, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Ord, PartialOrd)]
+#[derive(
+    Debug, Default, Copy, Clone, PartialEq, Eq, Serialize, Deserialize, Ord, PartialOrd, Hash,
+)]
 pub struct BlockNumberAddress(pub (BlockNumber, Address));
 
 impl BlockNumberAddress {
diff --git a/crates/storage/db/src/tables/models/sharded_key.rs b/crates/storage/db/src/tables/models/sharded_key.rs
index a38c3af3a3..5dedd349eb 100644
--- a/crates/storage/db/src/tables/models/sharded_key.rs
+++ b/crates/storage/db/src/tables/models/sharded_key.rs
@@ -1,5 +1,7 @@
 //! Sharded key
 
+use std::hash::Hash;
+
 use crate::{
     table::{Decode, Encode},
     DatabaseError,
@@ -74,3 +76,13 @@ where
         Ok(ShardedKey::new(key, highest_tx_number))
     }
 }
+
+impl<T> Hash for ShardedKey<T>
+where
+    T: Hash,
+{
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.key.hash(state);
+        self.highest_block_number.hash(state);
+    }
+}
diff --git a/crates/storage/db/src/tables/models/storage_sharded_key.rs b/crates/storage/db/src/tables/models/storage_sharded_key.rs
index 984933d1f1..15e6736599 100644
--- a/crates/storage/db/src/tables/models/storage_sharded_key.rs
+++ b/crates/storage/db/src/tables/models/storage_sharded_key.rs
@@ -19,7 +19,9 @@ pub const NUM_OF_INDICES_IN_SHARD: usize = 2_000;
 /// `Address | Storagekey | 200` -> data is from transition 0 to 200.
 ///
 /// `Address | StorageKey | 300` -> data is from transition 201 to 300.
-#[derive(Debug, Default, Clone, Eq, Ord, PartialOrd, PartialEq, AsRef, Serialize, Deserialize)]
+#[derive(
+    Debug, Default, Clone, Eq, Ord, PartialOrd, PartialEq, AsRef, Serialize, Deserialize, Hash,
+)]
 pub struct StorageShardedKey {
     /// Storage account address.
     pub address: H160,
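
As a companion to the diff above, here is a minimal, self-contained sketch of the cross-checking idea behind `find_diffs_advanced`, rewritten over plain in-memory maps instead of database cursors. It is illustrative only: the map-based setup, the function name `diff_maps`, and the tuple return type are assumptions made for this example and are not part of the PR.

    use std::collections::BTreeMap;

    /// Walk two ordered maps in lockstep; when the keys line up, compare the values, and when
    /// they diverge, cross-check each key against the other map (mirroring the zipped-cursor
    /// loop in `find_diffs_advanced`). Returns (discrepancies, keys missing from one side).
    fn diff_maps<K: Ord + Clone, V: PartialEq + Clone>(
        first: &BTreeMap<K, V>,
        second: &BTreeMap<K, V>,
    ) -> (Vec<(K, V, V)>, Vec<K>) {
        let mut discrepancies = Vec::new();
        let mut extra_keys = Vec::new();

        for ((k1, v1), (k2, v2)) in first.iter().zip(second.iter()) {
            if k1 == k2 {
                if v1 != v2 {
                    discrepancies.push((k1.clone(), v1.clone(), v2.clone()));
                }
                continue
            }
            // keys diverged: look the first map's key up in the second map, and vice versa
            match second.get(k1) {
                Some(v) if v != v1 => discrepancies.push((k1.clone(), v1.clone(), v.clone())),
                None => extra_keys.push(k1.clone()),
                _ => {}
            }
            match first.get(k2) {
                Some(v) if v != v2 => discrepancies.push((k2.clone(), v.clone(), v2.clone())),
                None => extra_keys.push(k2.clone()),
                _ => {}
            }
        }

        (discrepancies, extra_keys)
    }

Like the PR's loop, this only visits as many entries as the shorter side contains, since `Iterator::zip` stops when either iterator is exhausted; entries past the end of the shorter table are therefore never examined directly, only via the cross-check when a key mismatch occurs earlier in the walk.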