mirror of
https://github.com/paradigmxyz/reth.git
synced 2026-04-30 03:01:58 -04:00
Compare commits
90 Commits
devnet4
...
feat/use-h
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
94d7607278 | ||
|
|
752d40291a | ||
|
|
de3b17309e | ||
|
|
8a8e4b5edd | ||
|
|
e14adaf79f | ||
|
|
ae09a5e036 | ||
|
|
6ae2d53af2 | ||
|
|
bd10b69257 | ||
|
|
b7e190a544 | ||
|
|
f460459b59 | ||
|
|
fdfb8f8694 | ||
|
|
65a3033b87 | ||
|
|
c2e1679934 | ||
|
|
9d552797b5 | ||
|
|
a5da200c49 | ||
|
|
21a6f1ab60 | ||
|
|
afc718b7d9 | ||
|
|
10baffc181 | ||
|
|
66713e22ad | ||
|
|
549e3214d5 | ||
|
|
21a5ace5d9 | ||
|
|
89544f6727 | ||
|
|
fe2f4f759d | ||
|
|
d967866e9b | ||
|
|
3c3fef6b5f | ||
|
|
3d0cc1dc01 | ||
|
|
c0ed20e44b | ||
|
|
41d90c3c11 | ||
|
|
63f0b36c49 | ||
|
|
d826180b5f | ||
|
|
6d0e37f4fb | ||
|
|
76f884e47d | ||
|
|
4cc1df513a | ||
|
|
94e23e4ad2 | ||
|
|
c0e2ce10a9 | ||
|
|
cd36789d7e | ||
|
|
acce33340b | ||
|
|
478d8f7845 | ||
|
|
606d6a2f37 | ||
|
|
cc772c4170 | ||
|
|
8c6944897f | ||
|
|
88fb4a0a4e | ||
|
|
846556af52 | ||
|
|
7ce2b55415 | ||
|
|
751af221fa | ||
|
|
23c1ca137d | ||
|
|
c65f282907 | ||
|
|
82f82f4515 | ||
|
|
8d9ade2b1d | ||
|
|
dd6e3932f2 | ||
|
|
a7eaad001b | ||
|
|
670df03a47 | ||
|
|
8dc8d94a02 | ||
|
|
2914667f1a | ||
|
|
6784d2d3ab | ||
|
|
5dd48ba5f0 | ||
|
|
75f9aba1ad | ||
|
|
a314e615ea | ||
|
|
fa9379052b | ||
|
|
aa71ba6621 | ||
|
|
80497b5e38 | ||
|
|
cc3cd92bb2 | ||
|
|
992e22230d | ||
|
|
717aa92b8d | ||
|
|
e5355bfbef | ||
|
|
c60a9af94b | ||
|
|
6ad0565b0f | ||
|
|
fc59562d46 | ||
|
|
0df85f0029 | ||
|
|
a693857608 | ||
|
|
6c685d0ed3 | ||
|
|
04bd81f74d | ||
|
|
877ab61330 | ||
|
|
3e77741fc7 | ||
|
|
dea1d3520e | ||
|
|
34aff1e945 | ||
|
|
e84c6fdb7b | ||
|
|
0ba99298d4 | ||
|
|
b153f66ed4 | ||
|
|
7d66c98afb | ||
|
|
bef6030e4b | ||
|
|
9bebc00957 | ||
|
|
4ca24ef2a9 | ||
|
|
0507327a5e | ||
|
|
c065b3990d | ||
|
|
efd6f06069 | ||
|
|
570411e189 | ||
|
|
96003e1bf6 | ||
|
|
ca17c74be8 | ||
|
|
68335cf015 |
@@ -134,4 +134,4 @@ arbitrary = [
|
||||
]
|
||||
|
||||
rocksdb = ["reth-db-common/rocksdb", "reth-stages/rocksdb", "reth-provider/rocksdb", "reth-prune/rocksdb"]
|
||||
edge = ["rocksdb"]
|
||||
edge = ["rocksdb", "reth-db-common/edge", "reth-provider/edge"]
|
||||
|
||||
@@ -16,58 +16,93 @@ const LOG_INTERVAL: Duration = Duration::from_secs(5);
|
||||
pub struct Command {
|
||||
/// The account address to check storage for
|
||||
address: Address,
|
||||
|
||||
/// Use hashed state tables (HashedStorages) instead of plain state
|
||||
#[arg(long)]
|
||||
hashed: bool,
|
||||
}
|
||||
|
||||
impl Command {
|
||||
/// Execute `db account-storage` command
|
||||
pub fn execute<N: NodeTypesWithDB>(self, tool: &DbTool<N>) -> eyre::Result<()> {
|
||||
let address = self.address;
|
||||
let (slot_count, plain_size) = tool.provider_factory.db_ref().view(|tx| {
|
||||
let mut cursor = tx.cursor_dup_read::<tables::PlainStorageState>()?;
|
||||
let use_hashed = self.hashed;
|
||||
let hashed_address = keccak256(address);
|
||||
|
||||
let (slot_count, storage_size) = tool.provider_factory.db_ref().view(|tx| {
|
||||
let mut count = 0usize;
|
||||
let mut total_value_bytes = 0usize;
|
||||
let mut last_log = Instant::now();
|
||||
|
||||
// Walk all storage entries for this address
|
||||
let walker = cursor.walk_dup(Some(address), None)?;
|
||||
for entry in walker {
|
||||
let (_, storage_entry) = entry?;
|
||||
count += 1;
|
||||
// StorageEntry encodes as: 32 bytes (key/subkey uncompressed) + compressed U256
|
||||
let mut buf = Vec::new();
|
||||
let entry_len = storage_entry.to_compact(&mut buf);
|
||||
total_value_bytes += entry_len;
|
||||
if use_hashed {
|
||||
let mut cursor = tx.cursor_dup_read::<tables::HashedStorages>()?;
|
||||
let walker = cursor.walk_dup(Some(hashed_address), None)?;
|
||||
for entry in walker {
|
||||
let (_, storage_entry) = entry?;
|
||||
count += 1;
|
||||
let mut buf = Vec::new();
|
||||
let entry_len = storage_entry.to_compact(&mut buf);
|
||||
total_value_bytes += entry_len;
|
||||
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
address = %address,
|
||||
slots = count,
|
||||
key = %storage_entry.key,
|
||||
"Processing storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
hashed_address = %hashed_address,
|
||||
slots = count,
|
||||
key = %storage_entry.key,
|
||||
"Processing hashed storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
}
|
||||
}
|
||||
// HashedStorages uses 32-byte B256 key
|
||||
let total_size = if count > 0 { 32 + total_value_bytes } else { 0 };
|
||||
Ok::<_, eyre::Report>((count, total_size))
|
||||
} else {
|
||||
let mut cursor = tx.cursor_dup_read::<tables::PlainStorageState>()?;
|
||||
let walker = cursor.walk_dup(Some(address), None)?;
|
||||
for entry in walker {
|
||||
let (_, storage_entry) = entry?;
|
||||
count += 1;
|
||||
let mut buf = Vec::new();
|
||||
let entry_len = storage_entry.to_compact(&mut buf);
|
||||
total_value_bytes += entry_len;
|
||||
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
address = %address,
|
||||
slots = count,
|
||||
key = %storage_entry.key,
|
||||
"Processing storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
}
|
||||
}
|
||||
// PlainStorageState uses 20-byte Address key
|
||||
let total_size = if count > 0 { 20 + total_value_bytes } else { 0 };
|
||||
Ok::<_, eyre::Report>((count, total_size))
|
||||
}
|
||||
|
||||
// Add 20 bytes for the Address key (stored once per account in dupsort)
|
||||
let total_size = if count > 0 { 20 + total_value_bytes } else { 0 };
|
||||
|
||||
Ok::<_, eyre::Report>((count, total_size))
|
||||
})??;
|
||||
|
||||
// Estimate hashed storage size: 32-byte B256 key instead of 20-byte Address
|
||||
let hashed_size_estimate = if slot_count > 0 { plain_size + 12 } else { 0 };
|
||||
let total_estimate = plain_size + hashed_size_estimate;
|
||||
|
||||
let hashed_address = keccak256(address);
|
||||
let state_source = if use_hashed { "hashed" } else { "plain" };
|
||||
|
||||
println!("Account: {address}");
|
||||
println!("Hashed address: {hashed_address}");
|
||||
println!("State source: {state_source}");
|
||||
println!("Storage slots: {slot_count}");
|
||||
println!("Plain storage size: {} (estimated)", human_bytes(plain_size as f64));
|
||||
println!("Hashed storage size: {} (estimated)", human_bytes(hashed_size_estimate as f64));
|
||||
println!("Total estimated size: {}", human_bytes(total_estimate as f64));
|
||||
println!("Storage size: {} (estimated)", human_bytes(storage_size as f64));
|
||||
|
||||
if !use_hashed {
|
||||
// When querying plain state, also estimate what hashed would be
|
||||
let hashed_size_estimate = if slot_count > 0 { storage_size + 12 } else { 0 };
|
||||
let total_estimate = storage_size + hashed_size_estimate;
|
||||
println!(
|
||||
"Hashed storage size: {} (estimated)",
|
||||
human_bytes(hashed_size_estimate as f64)
|
||||
);
|
||||
println!("Total estimated size: {}", human_bytes(total_estimate as f64));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -88,5 +123,17 @@ mod tests {
|
||||
cmd.address,
|
||||
"0xd8dA6BF26964aF9D7eEd9e03E53415D37aA96045".parse::<Address>().unwrap()
|
||||
);
|
||||
assert!(!cmd.hashed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_hashed_flag() {
|
||||
let cmd = Command::try_parse_from([
|
||||
"account-storage",
|
||||
"0xd8dA6BF26964aF9D7eEd9e03E53415D37aA96045",
|
||||
"--hashed",
|
||||
])
|
||||
.unwrap();
|
||||
assert!(cmd.hashed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -121,6 +121,7 @@ impl Command {
|
||||
account_history_in_rocksdb: _,
|
||||
account_changesets_in_static_files: _,
|
||||
storage_changesets_in_static_files: _,
|
||||
use_hashed_state: _,
|
||||
} = settings.unwrap_or_else(StorageSettings::v1);
|
||||
|
||||
// Update the setting based on the key
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use alloy_primitives::{Address, BlockNumber, B256, U256};
|
||||
use alloy_primitives::{keccak256, Address, BlockNumber, B256, U256};
|
||||
use clap::Parser;
|
||||
use parking_lot::Mutex;
|
||||
use reth_db_api::{
|
||||
@@ -39,6 +39,10 @@ pub struct Command {
|
||||
/// Output format (table, json, csv)
|
||||
#[arg(long, short, default_value = "table")]
|
||||
format: OutputFormat,
|
||||
|
||||
/// Use hashed state tables (HashedStorages) instead of plain state
|
||||
#[arg(long)]
|
||||
hashed: bool,
|
||||
}
|
||||
|
||||
impl Command {
|
||||
@@ -63,35 +67,66 @@ impl Command {
|
||||
address: Address,
|
||||
limit: usize,
|
||||
) -> eyre::Result<()> {
|
||||
let entries = tool.provider_factory.db_ref().view(|tx| {
|
||||
// Get account info
|
||||
let account = tx.get::<tables::PlainAccountState>(address)?;
|
||||
let use_hashed = self.hashed;
|
||||
let hashed_address = keccak256(address);
|
||||
|
||||
let entries = tool.provider_factory.db_ref().view(|tx| {
|
||||
let account = if use_hashed {
|
||||
tx.get::<tables::HashedAccounts>(hashed_address)?
|
||||
} else {
|
||||
tx.get::<tables::PlainAccountState>(address)?
|
||||
};
|
||||
|
||||
// Get storage entries
|
||||
let mut cursor = tx.cursor_dup_read::<tables::PlainStorageState>()?;
|
||||
let mut entries = Vec::new();
|
||||
let mut last_log = Instant::now();
|
||||
|
||||
let walker = cursor.walk_dup(Some(address), None)?;
|
||||
for (idx, entry) in walker.enumerate() {
|
||||
let (_, storage_entry) = entry?;
|
||||
if use_hashed {
|
||||
let mut cursor = tx.cursor_dup_read::<tables::HashedStorages>()?;
|
||||
let walker = cursor.walk_dup(Some(hashed_address), None)?;
|
||||
for (idx, entry) in walker.enumerate() {
|
||||
let (_, storage_entry) = entry?;
|
||||
|
||||
if storage_entry.value != U256::ZERO {
|
||||
entries.push((storage_entry.key, storage_entry.value));
|
||||
if storage_entry.value != U256::ZERO {
|
||||
entries.push((storage_entry.key, storage_entry.value));
|
||||
}
|
||||
|
||||
if entries.len() >= limit {
|
||||
break;
|
||||
}
|
||||
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
hashed_address = %hashed_address,
|
||||
slots_scanned = idx,
|
||||
"Scanning hashed storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let mut cursor = tx.cursor_dup_read::<tables::PlainStorageState>()?;
|
||||
let walker = cursor.walk_dup(Some(address), None)?;
|
||||
for (idx, entry) in walker.enumerate() {
|
||||
let (_, storage_entry) = entry?;
|
||||
|
||||
if entries.len() >= limit {
|
||||
break;
|
||||
}
|
||||
if storage_entry.value != U256::ZERO {
|
||||
entries.push((storage_entry.key, storage_entry.value));
|
||||
}
|
||||
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
address = %address,
|
||||
slots_scanned = idx,
|
||||
"Scanning storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
if entries.len() >= limit {
|
||||
break;
|
||||
}
|
||||
|
||||
if last_log.elapsed() >= LOG_INTERVAL {
|
||||
info!(
|
||||
target: "reth::cli",
|
||||
address = %address,
|
||||
slots_scanned = idx,
|
||||
"Scanning storage slots"
|
||||
);
|
||||
last_log = Instant::now();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,7 +135,7 @@ impl Command {
|
||||
|
||||
let (account, storage_entries) = entries;
|
||||
|
||||
self.print_results(address, None, account, &storage_entries);
|
||||
self.print_results(address, None, account, &storage_entries, use_hashed);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -176,7 +211,7 @@ impl Command {
|
||||
}
|
||||
}
|
||||
|
||||
self.print_results(address, Some(block), account, &entries);
|
||||
self.print_results(address, Some(block), account, &entries, false);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -318,15 +353,22 @@ impl Command {
|
||||
block: Option<BlockNumber>,
|
||||
account: Option<reth_primitives_traits::Account>,
|
||||
storage: &[(alloy_primitives::B256, U256)],
|
||||
use_hashed: bool,
|
||||
) {
|
||||
let state_source = if use_hashed { "hashed" } else { "plain" };
|
||||
|
||||
match self.format {
|
||||
OutputFormat::Table => {
|
||||
println!("Account: {address}");
|
||||
if use_hashed {
|
||||
println!("Hashed address: {}", keccak256(address));
|
||||
}
|
||||
if let Some(b) = block {
|
||||
println!("Block: {b}");
|
||||
} else {
|
||||
println!("Block: latest");
|
||||
}
|
||||
println!("State source: {state_source}");
|
||||
println!();
|
||||
|
||||
if let Some(acc) = account {
|
||||
@@ -340,9 +382,10 @@ impl Command {
|
||||
}
|
||||
|
||||
println!();
|
||||
let slot_header = if use_hashed { "Hashed Slot" } else { "Slot" };
|
||||
println!("Storage ({} slots):", storage.len());
|
||||
println!("{:-<130}", "");
|
||||
println!("{:<66} | {:<64}", "Slot", "Value");
|
||||
println!("{:<66} | {:<64}", slot_header, "Value");
|
||||
println!("{:-<130}", "");
|
||||
for (key, value) in storage {
|
||||
println!("{key} | {value:#066x}");
|
||||
@@ -351,7 +394,9 @@ impl Command {
|
||||
OutputFormat::Json => {
|
||||
let output = serde_json::json!({
|
||||
"address": address.to_string(),
|
||||
"hashed_address": if use_hashed { Some(keccak256(address).to_string()) } else { None },
|
||||
"block": block,
|
||||
"state_source": state_source,
|
||||
"account": account.map(|a| serde_json::json!({
|
||||
"nonce": a.nonce,
|
||||
"balance": a.balance.to_string(),
|
||||
@@ -409,5 +454,17 @@ mod tests {
|
||||
let cmd = Command::try_parse_from(["state", "0xd8dA6BF26964aF9D7eEd9e03E53415D37aA96045"])
|
||||
.unwrap();
|
||||
assert_eq!(cmd.block, None);
|
||||
assert!(!cmd.hashed);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_state_args_hashed() {
|
||||
let cmd = Command::try_parse_from([
|
||||
"state",
|
||||
"0xd8dA6BF26964aF9D7eEd9e03E53415D37aA96045",
|
||||
"--hashed",
|
||||
])
|
||||
.unwrap();
|
||||
assert!(cmd.hashed);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -59,7 +59,10 @@ impl AccountHashingStage {
|
||||
///
|
||||
/// Proceeds to go to the `BlockTransitionIndex` end, go back `transitions` and change the
|
||||
/// account state in the `AccountChangeSets` table.
|
||||
pub fn seed<Tx: DbTx + DbTxMut + 'static, N: reth_provider::providers::ProviderNodeTypes>(
|
||||
pub fn seed<
|
||||
Tx: DbTx + DbTxMut + Sync + 'static,
|
||||
N: reth_provider::providers::ProviderNodeTypes,
|
||||
>(
|
||||
provider: &reth_provider::DatabaseProvider<Tx, N>,
|
||||
opts: SeedOpts,
|
||||
) -> Result<Vec<(alloy_primitives::Address, Account)>, StageError>
|
||||
|
||||
@@ -21,6 +21,13 @@ use tracing::info;
|
||||
/// Stage is indexing history the storage changesets generated in
|
||||
/// [`ExecutionStage`][crate::stages::ExecutionStage]. For more information
|
||||
/// on index sharding take a look at [`tables::StoragesHistory`].
|
||||
///
|
||||
/// # Hashed State Compatibility
|
||||
///
|
||||
/// This stage uses storage keys directly from changesets without additional hashing.
|
||||
/// When `use_hashed_state` is enabled, changesets already contain hashed storage slots,
|
||||
/// so the history index will use hashed keys. When disabled (default), changesets contain
|
||||
/// plain slots, so the history index will use plain keys.
|
||||
#[derive(Debug)]
|
||||
pub struct IndexStorageHistoryStage {
|
||||
/// Number of blocks after which the control
|
||||
|
||||
@@ -34,6 +34,12 @@ pub struct StorageSettings {
|
||||
/// Whether this node should read and write storage changesets from static files.
|
||||
#[serde(default)]
|
||||
pub storage_changesets_in_static_files: bool,
|
||||
/// Whether to use hashed state tables instead of plain state tables.
|
||||
///
|
||||
/// When enabled, `PlainAccountState` and `PlainStorageState` tables are not used.
|
||||
/// State is read/written directly from/to `HashedAccounts` and `HashedStorages`.
|
||||
#[serde(default)]
|
||||
pub use_hashed_state: bool,
|
||||
}
|
||||
|
||||
impl StorageSettings {
|
||||
@@ -61,6 +67,7 @@ impl StorageSettings {
|
||||
storages_history_in_rocksdb: true,
|
||||
transaction_hash_numbers_in_rocksdb: true,
|
||||
account_history_in_rocksdb: true,
|
||||
use_hashed_state: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -78,6 +85,7 @@ impl StorageSettings {
|
||||
account_history_in_rocksdb: false,
|
||||
account_changesets_in_static_files: false,
|
||||
storage_changesets_in_static_files: false,
|
||||
use_hashed_state: false,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -123,10 +131,27 @@ impl StorageSettings {
|
||||
self
|
||||
}
|
||||
|
||||
/// Sets the `use_hashed_state` flag to the provided value.
|
||||
pub const fn with_use_hashed_state(mut self, value: bool) -> Self {
|
||||
self.use_hashed_state = value;
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns `true` if any tables are configured to be stored in `RocksDB`.
|
||||
pub const fn any_in_rocksdb(&self) -> bool {
|
||||
self.transaction_hash_numbers_in_rocksdb ||
|
||||
self.account_history_in_rocksdb ||
|
||||
self.storages_history_in_rocksdb
|
||||
}
|
||||
|
||||
/// Returns `true` if all v2 storage features are enabled.
|
||||
pub const fn is_v2(&self) -> bool {
|
||||
self.receipts_in_static_files &&
|
||||
self.transaction_senders_in_static_files &&
|
||||
self.account_changesets_in_static_files &&
|
||||
self.storage_changesets_in_static_files &&
|
||||
self.storages_history_in_rocksdb &&
|
||||
self.transaction_hash_numbers_in_rocksdb &&
|
||||
self.account_history_in_rocksdb
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,29 @@ use crate::{
|
||||
};
|
||||
use std::fmt::Debug;
|
||||
|
||||
/// Source of arena hint value after floor was applied.
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
|
||||
pub enum ArenaHintSource {
|
||||
/// Raw estimate was used (no floor applied)
|
||||
#[default]
|
||||
Estimated = 0,
|
||||
/// Floor was applied (estimate was below minimum)
|
||||
Floored = 1,
|
||||
}
|
||||
|
||||
/// Estimation stats for a single table's arena hint.
|
||||
///
|
||||
/// Used for tracking whether arena hint estimation is working or always hitting floor.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct ArenaHintEstimationStats {
|
||||
/// Raw calculated estimate before floor
|
||||
pub estimated: usize,
|
||||
/// Final value used after floor
|
||||
pub actual: usize,
|
||||
/// Source of the final value
|
||||
pub source: ArenaHintSource,
|
||||
}
|
||||
|
||||
/// Helper adapter type for accessing [`DbTx`] cursor.
|
||||
pub type CursorTy<TX, T> = <TX as DbTx>::Cursor<T>;
|
||||
|
||||
@@ -76,4 +99,58 @@ pub trait DbTxMut: Send {
|
||||
fn cursor_write<T: Table>(&self) -> Result<Self::CursorMut<T>, DatabaseError>;
|
||||
/// `DupCursor` mut.
|
||||
fn cursor_dup_write<T: DupSort>(&self) -> Result<Self::DupCursorMut<T>, DatabaseError>;
|
||||
|
||||
/// Enables parallel writes mode, allowing multiple threads to write to different tables
|
||||
/// simultaneously. Must be called before any parallel cursor operations.
|
||||
fn enable_parallel_writes(&self) -> Result<(), DatabaseError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns whether parallel writes mode is currently enabled.
|
||||
fn is_parallel_writes_enabled(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
/// Commits all sub-transactions created during parallel writes.
|
||||
fn commit_subtxns(&self) -> Result<(), DatabaseError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Commits all sub-transactions and records arena stats as Prometheus metrics.
|
||||
///
|
||||
/// This is the preferred method when metrics are enabled, as it collects per-table
|
||||
/// arena allocation statistics for observability.
|
||||
fn commit_subtxns_with_metrics(&self) -> Result<(), DatabaseError> {
|
||||
self.commit_subtxns()
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode only for the specified tables.
|
||||
///
|
||||
/// This creates subtransactions only for the listed tables, allowing parallel
|
||||
/// writes to those tables while other tables continue using the main transaction.
|
||||
fn enable_parallel_writes_for_tables(&self, tables: &[&str]) -> Result<(), DatabaseError> {
|
||||
let hints: Vec<_> = tables.iter().map(|&t| (t, 0usize)).collect();
|
||||
self.enable_parallel_writes_for_tables_with_hints(&hints)
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode only for the specified tables with arena size hints.
|
||||
///
|
||||
/// Similar to [`enable_parallel_writes_for_tables`], but allows specifying an arena_hint
|
||||
/// for each table to guide page pre-allocation. An arena_hint of 0 means use
|
||||
/// equal distribution among all subtransactions.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `tables` - Slice of (table_name, arena_hint) tuples.
|
||||
fn enable_parallel_writes_for_tables_with_hints(
|
||||
&self,
|
||||
_tables: &[(&str, usize)],
|
||||
) -> Result<(), DatabaseError> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Records arena hint estimation stats for a table.
|
||||
///
|
||||
/// Tracks whether arena hint estimation is working or always hitting floor/cap.
|
||||
/// This is a no-op by default; implementations may override to record metrics.
|
||||
fn record_arena_estimation(&self, _table: &'static str, _stats: &ArenaHintEstimationStats) {}
|
||||
}
|
||||
|
||||
@@ -303,6 +303,7 @@ impl<T: Table> DbCursorRW<T> for Cursor<RW, T> {
|
||||
fn append(&mut self, key: T::Key, value: &T::Value) -> Result<(), DatabaseError> {
|
||||
let key = key.encode();
|
||||
let value = compress_to_buf_or_ref!(self, value);
|
||||
|
||||
self.execute_with_operation_metric(
|
||||
Operation::CursorAppend,
|
||||
Some(value.unwrap_or(&self.buf).len()),
|
||||
|
||||
@@ -693,6 +693,183 @@ mod tests {
|
||||
let _tempdir = create_test_db(DatabaseEnvKind::RW);
|
||||
}
|
||||
|
||||
/// Test parallel writes to two separate DBIs using subtransactions.
|
||||
///
|
||||
/// This demonstrates using the parallel subtxn API to write to two different
|
||||
/// tables concurrently from separate threads, then committing serially.
|
||||
/// Uses pure FFI to test parallel writes with WriteMap mode (required for parallel subtxns).
|
||||
#[test]
|
||||
fn db_parallel_writes_two_tables() {
|
||||
use std::{
|
||||
ffi::{c_void, CString},
|
||||
ptr,
|
||||
};
|
||||
|
||||
let tempdir = TempDir::new().expect(ERROR_TEMPDIR);
|
||||
let path = CString::new(tempdir.path().to_str().unwrap()).unwrap();
|
||||
|
||||
unsafe {
|
||||
// Create environment with WriteMap (required for parallel subtxns)
|
||||
let mut env: *mut ffi::MDBX_env = ptr::null_mut();
|
||||
let rc = ffi::mdbx_env_create(&mut env);
|
||||
assert_eq!(rc, 0, "mdbx_env_create failed");
|
||||
|
||||
ffi::mdbx_env_set_option(env, ffi::MDBX_opt_max_db, 4);
|
||||
let rc = ffi::mdbx_env_open(
|
||||
env,
|
||||
path.as_ptr(),
|
||||
ffi::MDBX_NOSTICKYTHREADS | ffi::MDBX_WRITEMAP,
|
||||
0o644,
|
||||
);
|
||||
assert_eq!(rc, 0, "mdbx_env_open failed");
|
||||
|
||||
// Begin parent write transaction
|
||||
let mut parent_ptr: *mut ffi::MDBX_txn = ptr::null_mut();
|
||||
let rc = ffi::mdbx_txn_begin_ex(
|
||||
env,
|
||||
ptr::null_mut(),
|
||||
ffi::MDBX_TXN_READWRITE,
|
||||
&mut parent_ptr,
|
||||
ptr::null_mut(),
|
||||
);
|
||||
assert_eq!(rc, 0, "mdbx_txn_begin failed: {rc}");
|
||||
|
||||
// Open two separate DBIs (simulating Headers and CanonicalHeaders)
|
||||
let db0_name = CString::new("headers").unwrap();
|
||||
let db1_name = CString::new("canonical").unwrap();
|
||||
|
||||
let mut headers_dbi: ffi::MDBX_dbi = 0;
|
||||
let rc = ffi::mdbx_dbi_open(
|
||||
parent_ptr,
|
||||
db0_name.as_ptr(),
|
||||
ffi::MDBX_CREATE,
|
||||
&mut headers_dbi,
|
||||
);
|
||||
assert_eq!(rc, 0, "open headers dbi failed");
|
||||
|
||||
let mut canonical_dbi: ffi::MDBX_dbi = 0;
|
||||
let rc = ffi::mdbx_dbi_open(
|
||||
parent_ptr,
|
||||
db1_name.as_ptr(),
|
||||
ffi::MDBX_CREATE,
|
||||
&mut canonical_dbi,
|
||||
);
|
||||
assert_eq!(rc, 0, "open canonical dbi failed");
|
||||
|
||||
// Create subtxns for both DBIs using the new batch API
|
||||
let specs = [
|
||||
ffi::MDBX_subtxn_spec_t { dbi: headers_dbi, arena_hint: 0 },
|
||||
ffi::MDBX_subtxn_spec_t { dbi: canonical_dbi, arena_hint: 0 },
|
||||
];
|
||||
let mut subtxns: [*mut ffi::MDBX_txn; 2] = [ptr::null_mut(); 2];
|
||||
let rc = ffi::mdbx_txn_create_subtxns(
|
||||
parent_ptr,
|
||||
specs.as_ptr(),
|
||||
specs.len(),
|
||||
subtxns.as_mut_ptr(),
|
||||
);
|
||||
assert_eq!(rc, 0, "mdbx_txn_create_subtxns failed: {rc}");
|
||||
|
||||
let headers_subtx = subtxns[0];
|
||||
let canonical_subtx = subtxns[1];
|
||||
|
||||
// Serial writes to both subtxns
|
||||
let num_entries = 50u64;
|
||||
|
||||
// Headers subtxn writes
|
||||
for i in 0..num_entries {
|
||||
let key_bytes = i.to_be_bytes();
|
||||
let value = format!("header_data_{i}");
|
||||
let value_bytes = value.as_bytes();
|
||||
|
||||
let mut k =
|
||||
ffi::MDBX_val { iov_base: key_bytes.as_ptr() as *mut c_void, iov_len: 8 };
|
||||
let mut v = ffi::MDBX_val {
|
||||
iov_base: value_bytes.as_ptr() as *mut c_void,
|
||||
iov_len: value_bytes.len(),
|
||||
};
|
||||
|
||||
let rc = ffi::mdbx_put(
|
||||
headers_subtx,
|
||||
headers_dbi,
|
||||
&mut k,
|
||||
&mut v,
|
||||
ffi::MDBX_put_flags_t::default(),
|
||||
);
|
||||
assert_eq!(rc, 0, "headers put {i} failed: {rc}");
|
||||
}
|
||||
|
||||
// Canonical subtxn writes
|
||||
for i in 0..num_entries {
|
||||
let key_bytes = i.to_be_bytes();
|
||||
let mut hash_bytes = [0u8; 32];
|
||||
hash_bytes[31] = i as u8;
|
||||
|
||||
let mut k =
|
||||
ffi::MDBX_val { iov_base: key_bytes.as_ptr() as *mut c_void, iov_len: 8 };
|
||||
let mut v =
|
||||
ffi::MDBX_val { iov_base: hash_bytes.as_ptr() as *mut c_void, iov_len: 32 };
|
||||
|
||||
let rc = ffi::mdbx_put(
|
||||
canonical_subtx,
|
||||
canonical_dbi,
|
||||
&mut k,
|
||||
&mut v,
|
||||
ffi::MDBX_put_flags_t::default(),
|
||||
);
|
||||
assert_eq!(rc, 0, "canonical put {i} failed: {rc}");
|
||||
}
|
||||
|
||||
// Commit subtxns serially
|
||||
let rc = ffi::mdbx_subtx_commit(headers_subtx);
|
||||
assert_eq!(rc, 0, "headers subtx commit failed: {rc}");
|
||||
|
||||
let rc = ffi::mdbx_subtx_commit(canonical_subtx);
|
||||
assert_eq!(rc, 0, "canonical subtx commit failed: {rc}");
|
||||
|
||||
// Commit parent transaction
|
||||
let rc = ffi::mdbx_txn_commit_ex(parent_ptr, ptr::null_mut());
|
||||
assert_eq!(rc, 0, "parent commit failed: {rc}");
|
||||
|
||||
// Verify data was written correctly
|
||||
let mut read_txn: *mut ffi::MDBX_txn = ptr::null_mut();
|
||||
let rc = ffi::mdbx_txn_begin_ex(
|
||||
env,
|
||||
ptr::null_mut(),
|
||||
ffi::MDBX_TXN_RDONLY,
|
||||
&mut read_txn,
|
||||
ptr::null_mut(),
|
||||
);
|
||||
assert_eq!(rc, 0, "read txn begin failed");
|
||||
|
||||
// Verify headers
|
||||
for i in 0..num_entries {
|
||||
let key_bytes = i.to_be_bytes();
|
||||
let k = ffi::MDBX_val { iov_base: key_bytes.as_ptr() as *mut c_void, iov_len: 8 };
|
||||
let mut v = ffi::MDBX_val { iov_base: ptr::null_mut(), iov_len: 0 };
|
||||
let rc = ffi::mdbx_get(read_txn, headers_dbi, &k, &mut v);
|
||||
assert_eq!(rc, 0, "header {i} not found");
|
||||
}
|
||||
|
||||
// Verify canonical headers
|
||||
for i in 0..num_entries {
|
||||
let key_bytes = i.to_be_bytes();
|
||||
let k = ffi::MDBX_val { iov_base: key_bytes.as_ptr() as *mut c_void, iov_len: 8 };
|
||||
let mut v = ffi::MDBX_val { iov_base: ptr::null_mut(), iov_len: 0 };
|
||||
let rc = ffi::mdbx_get(read_txn, canonical_dbi, &k, &mut v);
|
||||
assert_eq!(rc, 0, "canonical header {i} not found");
|
||||
|
||||
// Verify the value matches expected hash
|
||||
assert_eq!(v.iov_len, 32);
|
||||
let data = std::slice::from_raw_parts(v.iov_base as *const u8, 32);
|
||||
assert_eq!(data[31], i as u8, "canonical header {i} hash mismatch");
|
||||
}
|
||||
|
||||
ffi::mdbx_txn_abort(read_txn);
|
||||
ffi::mdbx_env_close_ex(env, false);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn db_drop_orphan_table() {
|
||||
let tempdir = tempfile::TempDir::new().expect(ERROR_TEMPDIR);
|
||||
|
||||
@@ -310,6 +310,31 @@ impl<K: TransactionKind> DbTx for Tx<K> {
|
||||
#[instrument(name = "Tx::commit", level = "debug", target = "providers::db", skip_all)]
|
||||
fn commit(self) -> Result<(), DatabaseError> {
|
||||
self.execute_with_close_transaction_metric(TransactionOutcome::Commit, |this| {
|
||||
// If parallel writes is enabled (only for RW), commit subtxns first with metrics
|
||||
if !K::IS_READ_ONLY && this.inner.is_parallel_writes_enabled() {
|
||||
let stats_result = this.inner.commit_subtxns_with_stats();
|
||||
match stats_result {
|
||||
Ok(stats) => {
|
||||
// Record edge arena metrics if metrics are enabled
|
||||
if let Some(handler) = &this.metrics_handler {
|
||||
let dbi_to_table: rustc_hash::FxHashMap<
|
||||
reth_libmdbx::ffi::MDBX_dbi,
|
||||
&'static str,
|
||||
> = this.dbis.iter().map(|(&name, &dbi)| (dbi, name)).collect();
|
||||
|
||||
for (dbi, subtxn_stats) in &stats {
|
||||
if let Some(&table) = dbi_to_table.get(dbi) {
|
||||
handler
|
||||
.env_metrics
|
||||
.record_edge_arena_stats(table, subtxn_stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
Err(e) => return (Err(DatabaseError::Commit(e.into())), None),
|
||||
}
|
||||
}
|
||||
|
||||
match this.inner.commit().map_err(|e| DatabaseError::Commit(e.into())) {
|
||||
Ok(latency) => (Ok(()), Some(latency)),
|
||||
Err(e) => (Err(e), None),
|
||||
@@ -387,17 +412,162 @@ impl Tx<RW> {
|
||||
let key = key.encode();
|
||||
let value = value.compress();
|
||||
let (operation, write_operation, flags) = kind.into_operation_and_flags();
|
||||
self.execute_with_operation_metric::<T, _>(operation, Some(value.as_ref().len()), |tx| {
|
||||
tx.put(self.get_dbi::<T>()?, key.as_ref(), value, flags).map_err(|e| {
|
||||
DatabaseWriteError {
|
||||
info: e.into(),
|
||||
operation: write_operation,
|
||||
table_name: T::NAME,
|
||||
key: key.into_vec(),
|
||||
let dbi = self.get_dbi::<T>()?;
|
||||
|
||||
if self.is_parallel_writes_enabled() {
|
||||
self.execute_with_operation_metric::<T, _>(
|
||||
operation,
|
||||
Some(value.as_ref().len()),
|
||||
|tx| {
|
||||
tx.put_parallel(dbi, key.as_ref(), value, flags).map_err(|e| {
|
||||
DatabaseWriteError {
|
||||
info: e.into(),
|
||||
operation: write_operation,
|
||||
table_name: T::NAME,
|
||||
key: key.into_vec(),
|
||||
}
|
||||
.into()
|
||||
})
|
||||
},
|
||||
)
|
||||
} else {
|
||||
self.execute_with_operation_metric::<T, _>(
|
||||
operation,
|
||||
Some(value.as_ref().len()),
|
||||
|tx| {
|
||||
tx.put(dbi, key.as_ref(), value, flags).map_err(|e| {
|
||||
DatabaseWriteError {
|
||||
info: e.into(),
|
||||
operation: write_operation,
|
||||
table_name: T::NAME,
|
||||
key: key.into_vec(),
|
||||
}
|
||||
.into()
|
||||
})
|
||||
},
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode by creating subtransactions for ALL known DBIs.
|
||||
///
|
||||
/// After calling this, cursor operations on any table will automatically use
|
||||
/// the corresponding subtransaction, enabling safe parallel writes from multiple threads.
|
||||
///
|
||||
/// This requires WRITEMAP mode to be enabled on the environment.
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, or an error if subtransaction creation fails.
|
||||
pub fn enable_parallel_writes(&self) -> Result<(), DatabaseError> {
|
||||
let dbis: Vec<MDBX_dbi> = self.dbis.values().copied().collect();
|
||||
self.inner.enable_parallel_writes(&dbis).map_err(|e| DatabaseError::InitCursor(e.into()))
|
||||
}
|
||||
|
||||
/// Returns whether parallel writes mode is enabled.
|
||||
pub fn is_parallel_writes_enabled(&self) -> bool {
|
||||
self.inner.is_parallel_writes_enabled()
|
||||
}
|
||||
|
||||
/// Commits all subtransactions serially.
|
||||
///
|
||||
/// This must be called before committing the parent transaction when parallel writes
|
||||
/// mode is enabled.
|
||||
pub fn commit_subtxns(&self) -> Result<(), DatabaseError> {
|
||||
self.inner.commit_subtxns().map_err(|e| DatabaseError::Commit(e.into()))
|
||||
}
|
||||
|
||||
/// Commits all subtransactions serially and records arena stats as Prometheus metrics.
|
||||
///
|
||||
/// This is the preferred method when metrics are enabled, as it collects per-table
|
||||
/// arena allocation statistics for observability.
|
||||
pub fn commit_subtxns_with_metrics(&self) -> Result<(), DatabaseError> {
|
||||
let stats =
|
||||
self.inner.commit_subtxns_with_stats().map_err(|e| DatabaseError::Commit(e.into()))?;
|
||||
|
||||
if let Some(handler) = &self.metrics_handler {
|
||||
let dbi_to_table: rustc_hash::FxHashMap<MDBX_dbi, &'static str> =
|
||||
self.dbis.iter().map(|(&name, &dbi)| (dbi, name)).collect();
|
||||
|
||||
for (dbi, subtxn_stats) in &stats {
|
||||
if let Some(&table) = dbi_to_table.get(dbi) {
|
||||
handler.env_metrics.record_edge_arena_stats(table, subtxn_stats);
|
||||
}
|
||||
.into()
|
||||
})
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Creates a cursor for the given table, using the subtransaction if parallel writes is
|
||||
/// enabled.
|
||||
pub fn new_cursor_parallel<T: Table>(&self) -> Result<Cursor<RW, T>, DatabaseError> {
|
||||
let dbi = self.get_dbi::<T>()?;
|
||||
let inner = self
|
||||
.inner
|
||||
.cursor_with_dbi_parallel_owned(dbi)
|
||||
.map_err(|e| DatabaseError::InitCursor(e.into()))?;
|
||||
|
||||
Ok(Cursor::new_with_metrics(
|
||||
inner,
|
||||
self.metrics_handler.as_ref().map(|h| h.env_metrics.clone()),
|
||||
))
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode only for the specified tables.
|
||||
///
|
||||
/// Creates subtransactions only for the listed tables, allowing parallel
|
||||
/// writes to those tables while other tables continue using the main transaction.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `tables` - Slice of table names to create subtransactions for.
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, or an error if subtransaction creation fails.
|
||||
pub fn enable_parallel_writes_for_tables(&self, tables: &[&str]) -> Result<(), DatabaseError> {
|
||||
let hints: Vec<_> = tables.iter().map(|&t| (t, 0usize)).collect();
|
||||
self.enable_parallel_writes_for_tables_with_hints(&hints)
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode with arena size hints for specified tables.
|
||||
///
|
||||
/// Similar to [`enable_parallel_writes_for_tables`], but allows specifying an arena_hint
|
||||
/// for each table to guide page pre-allocation. An arena_hint of 0 means use
|
||||
/// equal distribution among all subtransactions.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `tables` - Slice of (table_name, arena_hint) tuples.
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, or an error if subtransaction creation fails.
|
||||
pub fn enable_parallel_writes_for_tables_with_hints(
|
||||
&self,
|
||||
tables: &[(&str, usize)],
|
||||
) -> Result<(), DatabaseError> {
|
||||
let specs: Vec<(MDBX_dbi, usize)> = tables
|
||||
.iter()
|
||||
.filter_map(|(name, hint)| self.dbis.get(*name).map(|&dbi| (dbi, *hint)))
|
||||
.collect();
|
||||
|
||||
if specs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
self.inner
|
||||
.enable_parallel_writes_with_hints(&specs)
|
||||
.map_err(|e| DatabaseError::InitCursor(e.into()))
|
||||
}
|
||||
|
||||
/// Records arena hint estimation stats for a table.
|
||||
///
|
||||
/// This tracks whether the arena hint estimation is working or always hitting floor/cap.
|
||||
pub fn record_arena_estimation(
|
||||
&self,
|
||||
table: &'static str,
|
||||
stats: &crate::metrics::ArenaHintEstimationStats,
|
||||
) {
|
||||
if let Some(handler) = &self.metrics_handler {
|
||||
handler.env_metrics.record_arena_estimation(table, stats);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -425,10 +595,18 @@ impl DbTxMut for Tx<RW> {
|
||||
data = Some(value.as_ref());
|
||||
};
|
||||
|
||||
self.execute_with_operation_metric::<T, _>(Operation::Delete, None, |tx| {
|
||||
tx.del(self.get_dbi::<T>()?, key.encode(), data)
|
||||
.map_err(|e| DatabaseError::Delete(e.into()))
|
||||
})
|
||||
let dbi = self.get_dbi::<T>()?;
|
||||
let encoded_key = key.encode();
|
||||
|
||||
if self.is_parallel_writes_enabled() {
|
||||
self.execute_with_operation_metric::<T, _>(Operation::Delete, None, |tx| {
|
||||
tx.del_parallel(dbi, encoded_key, data).map_err(|e| DatabaseError::Delete(e.into()))
|
||||
})
|
||||
} else {
|
||||
self.execute_with_operation_metric::<T, _>(Operation::Delete, None, |tx| {
|
||||
tx.del(dbi, encoded_key, data).map_err(|e| DatabaseError::Delete(e.into()))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
fn clear<T: Table>(&self) -> Result<(), DatabaseError> {
|
||||
@@ -438,11 +616,54 @@ impl DbTxMut for Tx<RW> {
|
||||
}
|
||||
|
||||
/// Opens a write cursor on table `T`.
///
/// With parallel-writes mode enabled the cursor comes from `new_cursor_parallel`;
/// otherwise it is a regular cursor on this transaction.
fn cursor_write<T: Table>(&self) -> Result<Self::CursorMut<T>, DatabaseError> {
    if self.is_parallel_writes_enabled() {
        self.new_cursor_parallel()
    } else {
        self.new_cursor()
    }
}
|
||||
|
||||
/// Opens a write cursor on the dup-sorted table `T`.
///
/// With parallel-writes mode enabled the cursor comes from `new_cursor_parallel`;
/// otherwise it is a regular cursor on this transaction.
fn cursor_dup_write<T: DupSort>(&self) -> Result<Self::DupCursorMut<T>, DatabaseError> {
    if self.is_parallel_writes_enabled() {
        self.new_cursor_parallel()
    } else {
        self.new_cursor()
    }
}
|
||||
|
||||
fn enable_parallel_writes(&self) -> Result<(), DatabaseError> {
|
||||
Tx::enable_parallel_writes(self)
|
||||
}
|
||||
|
||||
fn is_parallel_writes_enabled(&self) -> bool {
|
||||
Tx::is_parallel_writes_enabled(self)
|
||||
}
|
||||
|
||||
fn commit_subtxns(&self) -> Result<(), DatabaseError> {
|
||||
Tx::commit_subtxns(self)
|
||||
}
|
||||
|
||||
fn commit_subtxns_with_metrics(&self) -> Result<(), DatabaseError> {
|
||||
Tx::commit_subtxns_with_metrics(self)
|
||||
}
|
||||
|
||||
fn enable_parallel_writes_for_tables(&self, tables: &[&str]) -> Result<(), DatabaseError> {
|
||||
Tx::enable_parallel_writes_for_tables(self, tables)
|
||||
}
|
||||
|
||||
fn enable_parallel_writes_for_tables_with_hints(
|
||||
&self,
|
||||
tables: &[(&str, usize)],
|
||||
) -> Result<(), DatabaseError> {
|
||||
Tx::enable_parallel_writes_for_tables_with_hints(self, tables)
|
||||
}
|
||||
|
||||
fn record_arena_estimation(
|
||||
&self,
|
||||
table: &'static str,
|
||||
stats: &reth_db_api::transaction::ArenaHintEstimationStats,
|
||||
) {
|
||||
Tx::record_arena_estimation(self, table, stats)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -503,4 +724,61 @@ mod tests {
|
||||
// Backtrace is recorded.
|
||||
assert!(tx.metrics_handler.unwrap().backtrace_recorded.load(Ordering::Relaxed));
|
||||
}
|
||||
|
||||
#[test]
fn test_parallel_writes_high_level_api() {
    use reth_db_api::{
        cursor::DbCursorRW,
        transaction::{DbTx, DbTxMut},
    };
    use std::{sync::Barrier, thread};

    // Hash used as value in `CanonicalHeaders` and as key in `HeaderNumbers` for block `n`.
    fn hash_for(n: u64) -> alloy_primitives::B256 {
        alloy_primitives::B256::repeat_byte(n as u8)
    }

    let dir = tempdir().unwrap();
    let args = DatabaseArguments::new(ClientVersion::default());
    let mut db = DatabaseEnv::open(dir.path(), DatabaseEnvKind::RW, args).unwrap();
    db.create_tables().unwrap();

    let tx = db.tx_mut().unwrap();
    tx.enable_parallel_writes().unwrap();
    assert!(tx.is_parallel_writes_enabled());

    // Both writers wait on the same barrier so that they start writing together.
    let start_gate = std::sync::Arc::new(Barrier::new(2));
    let shared_tx = &tx;
    let headers_gate = start_gate.clone();
    let numbers_gate = start_gate.clone();

    thread::scope(|scope| {
        // Writer 1: number -> hash.
        let headers_writer = scope.spawn(move || {
            headers_gate.wait();
            let mut cursor = shared_tx.cursor_write::<tables::CanonicalHeaders>().unwrap();
            for number in 0..10u64 {
                cursor.append(number, &hash_for(number)).unwrap();
            }
        });

        // Writer 2: hash -> number, on a different table.
        let numbers_writer = scope.spawn(move || {
            numbers_gate.wait();
            let mut cursor = shared_tx.cursor_write::<tables::HeaderNumbers>().unwrap();
            for number in 0..10u64 {
                cursor.upsert(hash_for(number), &number).unwrap();
            }
        });

        headers_writer.join().unwrap();
        numbers_writer.join().unwrap();
    });

    tx.commit().unwrap();

    // Read everything back through a fresh read-only transaction.
    let tx = db.tx().unwrap();
    for number in 0..10u64 {
        let stored_hash = tx.get::<tables::CanonicalHeaders>(number).unwrap();
        assert_eq!(stored_hash, Some(hash_for(number)));

        let stored_number = tx.get::<tables::HeaderNumbers>(hash_for(number)).unwrap();
        assert_eq!(stored_number, Some(number));
    }
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::Tables;
|
||||
use metrics::Histogram;
|
||||
use metrics::{Gauge, Histogram};
|
||||
use reth_metrics::{metrics::Counter, Metrics};
|
||||
use rustc_hash::FxHashMap;
|
||||
use std::time::{Duration, Instant};
|
||||
@@ -23,6 +23,9 @@ pub(crate) struct DatabaseEnvMetrics {
|
||||
/// outcome. Can only be updated at tx close, as outcome is only known at that point.
|
||||
transaction_outcomes:
|
||||
FxHashMap<(TransactionMode, TransactionOutcome), TransactionOutcomeMetrics>,
|
||||
/// Caches `EdgeArenaMetrics` handles for each table.
|
||||
/// Used for tracking parallel subtransaction arena allocation stats.
|
||||
edge_arena: FxHashMap<&'static str, EdgeArenaMetrics>,
|
||||
}
|
||||
|
||||
impl DatabaseEnvMetrics {
|
||||
@@ -33,6 +36,7 @@ impl DatabaseEnvMetrics {
|
||||
operations: Self::generate_operation_handles(),
|
||||
transactions: Self::generate_transaction_handles(),
|
||||
transaction_outcomes: Self::generate_transaction_outcome_handles(),
|
||||
edge_arena: Self::generate_edge_arena_handles(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -95,6 +99,20 @@ impl DatabaseEnvMetrics {
|
||||
transaction_outcomes
|
||||
}
|
||||
|
||||
/// Generate a map of all table names to edge arena metric handles.
|
||||
/// Used for tracking parallel subtransaction arena allocation stats.
|
||||
fn generate_edge_arena_handles() -> FxHashMap<&'static str, EdgeArenaMetrics> {
|
||||
Tables::ALL
|
||||
.iter()
|
||||
.map(|table| {
|
||||
(
|
||||
table.name(),
|
||||
EdgeArenaMetrics::new_with_labels(&[(Labels::Table.as_str(), table.name())]),
|
||||
)
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Record a metric for database operation executed in `f`.
|
||||
/// Panics if a metric recorder is not found for the given table and operation.
|
||||
pub(crate) fn record_operation<R>(
|
||||
@@ -139,6 +157,34 @@ impl DatabaseEnvMetrics {
|
||||
.expect("transaction outcome metric handle not found")
|
||||
.record(open_duration, close_duration, commit_latency);
|
||||
}
|
||||
|
||||
/// Records the arena stats of one subtransaction under the metrics handle of `table`.
///
/// The caller supplies the table name. If no handle is cached for `table`, the stats are
/// dropped.
#[cfg(feature = "mdbx")]
pub(crate) fn record_edge_arena_stats(
    &self,
    table: &'static str,
    stats: &reth_libmdbx::SubTransactionStats,
) {
    let Some(metrics) = self.edge_arena.get(table) else { return };
    metrics.record(stats);
}
|
||||
|
||||
/// Records arena-hint estimation stats under the metrics handle of `table`.
///
/// Shows whether hint estimation yields useful values or keeps hitting the floor/cap. If no
/// handle is cached for `table`, the stats are dropped.
#[cfg(feature = "mdbx")]
pub(crate) fn record_arena_estimation(
    &self,
    table: &'static str,
    stats: &ArenaHintEstimationStats,
) {
    let Some(metrics) = self.edge_arena.get(table) else { return };
    metrics.record_estimation(stats);
}
|
||||
}
|
||||
|
||||
/// Transaction mode for the database, either read-only or read-write.
|
||||
@@ -363,3 +409,79 @@ impl OperationMetrics {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Metrics for parallel subtransaction (edge mode) arena allocation.
|
||||
/// Tracks page allocation efficiency from pre-distributed arenas.
|
||||
// NOTE(review): the field doc comments below may be consumed by the `Metrics` derive as
// metric descriptions - confirm before rewording any of them.
#[derive(Metrics, Clone)]
|
||||
#[metrics(scope = "database.edge")]
|
||||
pub(crate) struct EdgeArenaMetrics {
|
||||
/// Pages allocated from pre-distributed arena (fast path)
|
||||
arena_page_allocations: Counter,
|
||||
/// Times fallback to parent was needed (arena refill events)
|
||||
arena_refill_events: Counter,
|
||||
/// Distribution of refill events per subtxn commit (per-batch granularity)
|
||||
arena_refills_per_batch: Histogram,
|
||||
/// Pages initially distributed to subtxn
|
||||
arena_initial_pages: Counter,
|
||||
/// Pages returned to parent on commit (not consumed)
|
||||
pages_unused: Counter,
|
||||
/// Distribution of unused pages per subtxn commit (detects over-allocation)
|
||||
pages_unused_per_batch: Histogram,
|
||||
/// Pages acquired from parent during fallback (arena refill)
|
||||
arena_refill_pages: Counter,
|
||||
/// Configured arena size hint for this table (pages)
|
||||
arena_hint: Gauge,
|
||||
/// Pages reclaimed from GC (garbage collector / freeDB)
|
||||
pages_from_gc: Counter,
|
||||
/// Pages allocated from end-of-file (extending the database)
|
||||
pages_from_eof: Counter,
|
||||
/// Raw calculated estimate before floor was applied
|
||||
arena_hint_estimated: Gauge,
|
||||
/// Final hint value used after floor
|
||||
arena_hint_actual: Gauge,
|
||||
/// Times the estimate was below floor and floored value was used
|
||||
arena_hint_floored_total: Counter,
|
||||
/// Current source of hint: 0=estimated, 1=floored
|
||||
arena_hint_source: Gauge,
|
||||
}
|
||||
|
||||
pub(crate) use reth_db_api::transaction::{ArenaHintEstimationStats, ArenaHintSource};
|
||||
|
||||
impl EdgeArenaMetrics {
|
||||
/// Record stats from a single subtransaction.
|
||||
pub(crate) fn record(&self, stats: &reth_libmdbx::SubTransactionStats) {
|
||||
println!(
|
||||
"[ARENA] page_allocations={} refill_events={} initial_pages={} unused={} refill_pages={} hint={} from_gc={} from_eof={}",
|
||||
stats.arena_page_allocations,
|
||||
stats.arena_refill_events,
|
||||
stats.arena_initial_pages,
|
||||
stats.pages_unused,
|
||||
stats.arena_refill_pages,
|
||||
stats.arena_hint,
|
||||
stats.pages_from_gc,
|
||||
stats.pages_from_eof
|
||||
);
|
||||
self.arena_page_allocations.increment(stats.arena_page_allocations as u64);
|
||||
self.arena_refill_events.increment(stats.arena_refill_events as u64);
|
||||
self.arena_refills_per_batch.record(stats.arena_refill_events as f64);
|
||||
self.arena_initial_pages.increment(stats.arena_initial_pages as u64);
|
||||
self.pages_unused.increment(stats.pages_unused as u64);
|
||||
self.pages_unused_per_batch.record(stats.pages_unused as f64);
|
||||
self.arena_refill_pages.increment(stats.arena_refill_pages as u64);
|
||||
self.arena_hint.set(stats.arena_hint as f64);
|
||||
self.pages_from_gc.increment(stats.pages_from_gc as u64);
|
||||
self.pages_from_eof.increment(stats.pages_from_eof as u64);
|
||||
}
|
||||
|
||||
/// Record estimation stats for arena hint calculation.
|
||||
pub(crate) fn record_estimation(&self, stats: &ArenaHintEstimationStats) {
|
||||
self.arena_hint_estimated.set(stats.estimated as f64);
|
||||
self.arena_hint_actual.set(stats.actual as f64);
|
||||
self.arena_hint_source.set(stats.source as i64 as f64);
|
||||
|
||||
match stats.source {
|
||||
ArenaHintSource::Floored => self.arena_hint_floored_total.increment(1),
|
||||
ArenaHintSource::Estimated => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
1659
crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c
vendored
1659
crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.c
vendored
File diff suppressed because it is too large
Load Diff
129
crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h
vendored
129
crates/storage/libmdbx-rs/mdbx-sys/libmdbx/mdbx.h
vendored
@@ -6644,6 +6644,135 @@ LIBMDBX_API int mdbx_env_chk_encount_problem(MDBX_chk_context_t *ctx);
|
||||
|
||||
/** end of chk @} */
|
||||
|
||||
/** \defgroup c_parallel Parallel Write Transactions
|
||||
* @{
|
||||
*
|
||||
* These APIs enable parallel writes within a single transaction by partitioning
|
||||
* page allocations. Each subtransaction gets its own page range and per-txn
|
||||
* I/O resources, allowing thread-safe parallel cursor operations.
|
||||
*
|
||||
* \note This is a reth-specific extension to libmdbx.
|
||||
*
|
||||
* Usage pattern:
|
||||
* 1. Begin a write transaction and open all DBIs
|
||||
* 2. Create subtransactions with mdbx_txn_create_subtxns() - each bound to a specific DBI
|
||||
* 3. Perform parallel writes using cursors on each subtransaction (DBI enforced)
|
||||
* 4. Commit subtransactions with mdbx_subtx_commit()
|
||||
* 5. Commit the parent transaction
|
||||
*/
|
||||
|
||||
/** \brief Half-open page interval `[begin, end)` for the parallel-write sub-allocator.
 *
 * Describes a contiguous run of pages from which a subtransaction can allocate
 * without coordinating with other subtransactions.
 */
typedef struct MDBX_page_range {
  uint64_t begin; /**< First page number of the range (inclusive) */
  uint64_t end;   /**< One past the last page number of the range (exclusive) */
} MDBX_page_range_t;
|
||||
|
||||
/** \brief Specification for a parallel subtransaction.
|
||||
*
|
||||
* Used with mdbx_txn_create_subtxns() to create multiple subtransactions
|
||||
* atomically, each bound to a specific DBI.
|
||||
*/
|
||||
typedef struct MDBX_subtxn_spec {
|
||||
MDBX_dbi dbi; /**< DBI this subtxn will write to (enforced) */
|
||||
size_t arena_hint; /**< Estimated pages needed (0 = use equal distribution) */
|
||||
} MDBX_subtxn_spec_t;
|
||||
|
||||
/** \brief Create multiple subtransactions for parallel writes.
|
||||
*
|
||||
* Creates all subtransactions atomically, each bound to a specific DBI.
|
||||
* This enforces the invariant that each DBI has at most one subtxn.
|
||||
*
|
||||
* Pages are distributed from parent's reclaimed GC pages (repnl) and
|
||||
* loose_pages. No EOF pre-reservation is done. If a subtxn exhausts its
|
||||
* pre-claimed pages, it returns MDBX_MAP_FULL and the caller must handle
|
||||
* synchronized fallback to parent allocation.
|
||||
*
|
||||
* Each subtransaction can only open cursors on its assigned DBI. Attempting
|
||||
* to open a cursor on a different DBI will fail with MDBX_EINVAL.
|
||||
*
|
||||
* \param [in] parent The parent write transaction (must be WRITEMAP).
|
||||
* \param [in] specs Array of subtxn specifications.
|
||||
* \param [in] count Number of subtxns to create.
|
||||
* \param [out] subtxns Array to receive subtxn handles (must be pre-allocated).
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success.
|
||||
* \retval MDBX_EINVAL Invalid parameters or duplicate DBIs in specs.
|
||||
* \retval MDBX_INCOMPATIBLE Parent is not WRITEMAP mode.
|
||||
*/
|
||||
LIBMDBX_API int mdbx_txn_create_subtxns(MDBX_txn *parent,
|
||||
const MDBX_subtxn_spec_t *specs,
|
||||
size_t count,
|
||||
MDBX_txn **subtxns);
|
||||
|
||||
/** \brief Commit a subtransaction, merging its changes to the parent.
|
||||
*
|
||||
* Commits the subtransaction by merging its dirtylist into the parent
|
||||
* transaction. After this call, the subtransaction handle is invalidated.
|
||||
*
|
||||
* All subtransactions must be committed before the parent can be committed.
|
||||
*
|
||||
* \param [in] subtxn A subtransaction handle created by mdbx_txn_create_subtxns().
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success.
|
||||
* \retval MDBX_EINVAL subtxn is NULL or not a subtransaction.
|
||||
* \retval MDBX_BAD_TXN subtxn has an error or was already committed.
|
||||
*/
|
||||
LIBMDBX_API int mdbx_subtx_commit(MDBX_txn *subtxn);
|
||||
|
||||
/** \brief Abort a subtransaction, discarding its changes.
|
||||
*
|
||||
* Aborts the subtransaction, discarding all writes. The pages allocated
|
||||
* by this subtransaction are NOT returned to the parent's allocator.
|
||||
*
|
||||
* \param [in] subtxn A subtransaction handle created by mdbx_txn_create_subtxns().
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success.
|
||||
*/
|
||||
LIBMDBX_API int mdbx_subtx_abort(MDBX_txn *subtxn);
|
||||
|
||||
/** \brief Check if a transaction is a parallel subtransaction.
|
||||
*
|
||||
* \param [in] txn A transaction handle.
|
||||
*
|
||||
* \returns Non-zero if txn is a parallel subtransaction, 0 otherwise.
|
||||
*/
|
||||
LIBMDBX_API int mdbx_txn_is_subtx(const MDBX_txn *txn);
|
||||
|
||||
/** \brief Statistics for a parallel subtransaction.
|
||||
*
|
||||
* Contains metrics about page allocation and fallback behavior.
|
||||
*/
|
||||
typedef struct MDBX_subtxn_stats {
|
||||
size_t arena_page_allocations; /**< Pages allocated from pre-distributed arena */
|
||||
size_t arena_refill_events; /**< Times fallback to parent was needed */
|
||||
size_t arena_initial_pages; /**< Initial pages distributed to this subtxn */
|
||||
size_t arena_refill_pages; /**< Additional pages acquired from parent during fallback */
|
||||
size_t pages_from_gc; /**< Pages acquired from parent's repnl (GC) */
|
||||
size_t pages_from_eof; /**< Pages acquired via EOF extension */
|
||||
size_t pages_unused; /**< Pages returned to parent on commit (not consumed) */
|
||||
size_t arena_hint; /**< Original arena hint for this subtxn */
|
||||
MDBX_dbi assigned_dbi; /**< DBI this subtxn is bound to */
|
||||
} MDBX_subtxn_stats;
|
||||
|
||||
/** \brief Get statistics for a parallel subtransaction.
|
||||
*
|
||||
* Retrieves allocation and fallback metrics for a subtransaction.
|
||||
*
|
||||
* \param [in] subtxn A subtransaction handle created by mdbx_txn_create_subtxns().
|
||||
* \param [out] stats Pointer to stats structure to fill.
|
||||
*
|
||||
* \returns A non-zero error value on failure and 0 on success.
|
||||
* \retval MDBX_EINVAL subtxn or stats is NULL, or subtxn is not a subtransaction.
|
||||
* \retval MDBX_BAD_TXN subtxn has an invalid signature.
|
||||
*/
|
||||
LIBMDBX_API int mdbx_subtxn_get_stats(const MDBX_txn *subtxn, MDBX_subtxn_stats *stats);
|
||||
|
||||
/** end of c_parallel @} */
|
||||
|
||||
/** end of c_api @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@@ -2,7 +2,7 @@ use crate::{
|
||||
error::{mdbx_result, Error, Result},
|
||||
flags::*,
|
||||
mdbx_try_optional,
|
||||
transaction::{TransactionKind, RW},
|
||||
transaction::{TransactionKind, TransactionPtr, RW},
|
||||
TableObject, Transaction,
|
||||
};
|
||||
use ffi::{
|
||||
@@ -20,6 +20,14 @@ where
|
||||
{
|
||||
txn: Transaction<K>,
|
||||
cursor: *mut ffi::MDBX_cursor,
|
||||
/// Optional transaction pointer for parallel writes. When set, write operations
|
||||
/// use this pointer directly instead of going through `self.txn.txn_execute()`.
|
||||
/// This is needed because `new_with_ptr` opens a cursor on a subtransaction,
|
||||
/// but stores the parent transaction in `txn`.
|
||||
///
|
||||
/// Uses `TransactionPtr` to ensure proper mutex locking for thread-safety,
|
||||
/// as MDBX requires serialized access to transactions.
|
||||
owned_txn_ptr: Option<TransactionPtr>,
|
||||
}
|
||||
|
||||
impl<K> Cursor<K>
|
||||
@@ -33,7 +41,48 @@ where
|
||||
mdbx_result(ffi::mdbx_cursor_open(txn_ptr, dbi, &mut cursor))
|
||||
})??;
|
||||
}
|
||||
Ok(Self { txn, cursor })
|
||||
Ok(Self { txn, cursor, owned_txn_ptr: None })
|
||||
}
|
||||
|
||||
/// Creates a new cursor using a specific transaction pointer.
|
||||
///
|
||||
/// This is used for parallel writes where the cursor should be opened on
|
||||
/// a subtransaction rather than the parent transaction. The cursor stores
|
||||
/// this pointer and uses it directly for write operations.
|
||||
///
|
||||
/// The transaction pointer's cursor count is incremented on creation and
|
||||
/// decremented when the cursor is dropped.
|
||||
pub(crate) fn new_with_ptr(
|
||||
txn: Transaction<K>,
|
||||
dbi: ffi::MDBX_dbi,
|
||||
txn_ptr: TransactionPtr,
|
||||
) -> Result<Self> {
|
||||
let mut cursor: *mut ffi::MDBX_cursor = ptr::null_mut();
|
||||
txn_ptr.txn_execute_fail_on_timeout(|ptr| unsafe {
|
||||
mdbx_result(ffi::mdbx_cursor_open(ptr, dbi, &mut cursor))
|
||||
})??;
|
||||
txn_ptr.increment_cursor_count();
|
||||
Ok(Self { txn, cursor, owned_txn_ptr: Some(txn_ptr) })
|
||||
}
|
||||
|
||||
/// Executes a closure on the transaction pointer.
|
||||
///
|
||||
/// If this cursor was created with `new_with_ptr`, uses the stored txn pointer
|
||||
/// with proper locking. Otherwise, delegates to `self.txn.txn_execute()`.
|
||||
fn execute_on_txn<F, T>(&self, f: F) -> Result<T>
|
||||
where
|
||||
F: FnOnce(*mut ffi::MDBX_txn) -> T,
|
||||
{
|
||||
if let Some(ref txn_ptr) = self.owned_txn_ptr {
|
||||
txn_ptr.txn_execute_fail_on_timeout(f)
|
||||
} else {
|
||||
self.txn.txn_execute(|txn_ptr| f(txn_ptr))
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether this cursor has an owned transaction pointer (for parallel writes).
|
||||
pub fn has_owned_txn_ptr(&self) -> bool {
|
||||
self.owned_txn_ptr.is_some()
|
||||
}
|
||||
|
||||
fn new_at_position(other: &Self) -> Result<Self> {
|
||||
@@ -42,7 +91,8 @@ where
|
||||
|
||||
let res = ffi::mdbx_cursor_copy(other.cursor(), cursor);
|
||||
|
||||
let s = Self { txn: other.txn.clone(), cursor };
|
||||
let s =
|
||||
Self { txn: other.txn.clone(), cursor, owned_txn_ptr: other.owned_txn_ptr.clone() };
|
||||
|
||||
mdbx_result(res)?;
|
||||
|
||||
@@ -90,25 +140,52 @@ where
|
||||
let mut data_val = slice_to_val(data);
|
||||
let key_ptr = key_val.iov_base;
|
||||
let data_ptr = data_val.iov_base;
|
||||
self.txn.txn_execute(|txn| {
|
||||
let v = mdbx_result(ffi::mdbx_cursor_get(
|
||||
self.cursor,
|
||||
&mut key_val,
|
||||
&mut data_val,
|
||||
op,
|
||||
))?;
|
||||
|
||||
// For parallel cursors (with owned_txn_ptr), bypass locking entirely.
|
||||
// For normal cursors, use transaction locking.
|
||||
if let Some(ref txn_ptr) = self.owned_txn_ptr {
|
||||
// Parallel cursor path - no locking needed, subtxns are independent
|
||||
let rc = ffi::mdbx_cursor_get(self.cursor, &mut key_val, &mut data_val, op);
|
||||
let v = mdbx_result(rc)?;
|
||||
|
||||
// Check for NULL data pointer (can happen on partial matches)
|
||||
if data_val.iov_base.is_null() && data_val.iov_len > 0 {
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
// Check for NULL key pointer (can happen on partial matches)
|
||||
if key_val.iov_base.is_null() && key_val.iov_len > 0 {
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
|
||||
assert_ne!(data_ptr, data_val.iov_base);
|
||||
let key_out = {
|
||||
// MDBX wrote in new key
|
||||
if ptr::eq(key_ptr, key_val.iov_base) {
|
||||
None
|
||||
} else {
|
||||
Some(Key::decode_val::<K>(txn, key_val)?)
|
||||
}
|
||||
let key_out = if ptr::eq(key_ptr, key_val.iov_base) {
|
||||
None
|
||||
} else {
|
||||
Some(Key::decode_val::<K>(txn_ptr.as_ptr(), key_val)?)
|
||||
};
|
||||
let data_out = Value::decode_val::<K>(txn, data_val)?;
|
||||
let data_out = Value::decode_val::<K>(txn_ptr.as_ptr(), data_val)?;
|
||||
Ok((key_out, data_out, v))
|
||||
})?
|
||||
} else {
|
||||
// Normal cursor path - use transaction locking
|
||||
self.txn.txn_execute(|txn| {
|
||||
let v = mdbx_result(ffi::mdbx_cursor_get(
|
||||
self.cursor,
|
||||
&mut key_val,
|
||||
&mut data_val,
|
||||
op,
|
||||
))?;
|
||||
assert_ne!(data_ptr, data_val.iov_base);
|
||||
let key_out = {
|
||||
if ptr::eq(key_ptr, key_val.iov_base) {
|
||||
None
|
||||
} else {
|
||||
Some(Key::decode_val::<K>(txn, key_val)?)
|
||||
}
|
||||
};
|
||||
let data_out = Value::decode_val::<K>(txn, data_val)?;
|
||||
Ok((key_out, data_out, v))
|
||||
})?
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -438,11 +515,20 @@ impl Cursor<RW> {
|
||||
ffi::MDBX_val { iov_len: key.len(), iov_base: key.as_ptr() as *mut c_void };
|
||||
let mut data_val: ffi::MDBX_val =
|
||||
ffi::MDBX_val { iov_len: data.len(), iov_base: data.as_ptr() as *mut c_void };
|
||||
mdbx_result(unsafe {
|
||||
self.txn.txn_execute(|_| {
|
||||
ffi::mdbx_cursor_put(self.cursor, &key_val, &mut data_val, flags.bits())
|
||||
})?
|
||||
})?;
|
||||
|
||||
unsafe {
|
||||
if self.owned_txn_ptr.is_some() {
|
||||
// Bypass locking entirely for parallel cursors - they have independent subtxns
|
||||
let ret = ffi::mdbx_cursor_put(self.cursor, &key_val, &mut data_val, flags.bits());
|
||||
mdbx_result(ret)?;
|
||||
} else {
|
||||
// Use normal path with locking for non-parallel cursors
|
||||
let ret = self.txn.txn_execute(|_| {
|
||||
ffi::mdbx_cursor_put(self.cursor, &key_val, &mut data_val, flags.bits())
|
||||
})?;
|
||||
mdbx_result(ret)?;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
@@ -455,9 +541,8 @@ impl Cursor<RW> {
|
||||
/// current key, if the database was opened with [`DatabaseFlags::DUP_SORT`].
|
||||
pub fn del(&mut self, flags: WriteFlags) -> Result<()> {
|
||||
mdbx_result(unsafe {
|
||||
self.txn.txn_execute(|_| ffi::mdbx_cursor_del(self.cursor, flags.bits()))?
|
||||
self.execute_on_txn(|_txn_ptr| ffi::mdbx_cursor_del(self.cursor, flags.bits()))?
|
||||
})?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -485,11 +570,17 @@ where
|
||||
K: TransactionKind,
|
||||
{
|
||||
fn drop(&mut self) {
|
||||
// To be able to close a cursor of a timed out transaction, we need to renew it first.
|
||||
// Hence the usage of `txn_execute_renew_on_timeout` here.
|
||||
let _ = self
|
||||
.txn
|
||||
.txn_execute_renew_on_timeout(|_| unsafe { ffi::mdbx_cursor_close(self.cursor) });
|
||||
if let Some(ref txn_ptr) = self.owned_txn_ptr {
|
||||
// Cursor was opened on a subtransaction - close on that transaction
|
||||
let _ = txn_ptr
|
||||
.txn_execute_fail_on_timeout(|_| unsafe { ffi::mdbx_cursor_close(self.cursor) });
|
||||
txn_ptr.decrement_cursor_count();
|
||||
} else {
|
||||
// Standard cursor - use parent transaction with renew-on-timeout
|
||||
let _ = self
|
||||
.txn
|
||||
.txn_execute_renew_on_timeout(|_| unsafe { ffi::mdbx_cursor_close(self.cursor) });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -505,6 +596,292 @@ const unsafe fn slice_to_val(slice: Option<&[u8]>) -> ffi::MDBX_val {
|
||||
unsafe impl<K> Send for Cursor<K> where K: TransactionKind {}
|
||||
unsafe impl<K> Sync for Cursor<K> where K: TransactionKind {}
|
||||
|
||||
/// A cursor for parallel writes that borrows from the transaction.
|
||||
///
|
||||
/// This cursor type provides compile-time safety: it cannot outlive the parallel writes
|
||||
/// session because it immutably borrows the transaction. When `commit_subtxns(&mut self)`
|
||||
/// or `abort_subtxns(&mut self)` is called, the borrow checker ensures no `ParallelCursor`
|
||||
/// exists.
|
||||
///
|
||||
/// # Example
|
||||
/// ```ignore
|
||||
/// let txn = env.begin_rw_txn()?;
|
||||
/// txn.enable_parallel_writes(&[dbi])?;
|
||||
/// {
|
||||
/// let mut cursor = txn.cursor_with_dbi_parallel(dbi)?;
|
||||
/// cursor.put(b"key", b"value", WriteFlags::empty())?;
|
||||
/// } // cursor dropped, borrow released
|
||||
/// txn.commit_subtxns()?; // OK - no outstanding borrows
|
||||
/// txn.commit()?;
|
||||
/// ```
|
||||
pub struct ParallelCursor<'txn> {
|
||||
/// Borrow of the parent transaction - prevents commit_subtxns while cursor exists
|
||||
_txn: &'txn Transaction<RW>,
|
||||
/// The raw cursor pointer
|
||||
cursor: *mut ffi::MDBX_cursor,
|
||||
/// Transaction pointer for the subtransaction
|
||||
txn_ptr: TransactionPtr,
|
||||
}
|
||||
|
||||
impl<'txn> ParallelCursor<'txn> {
|
||||
/// Creates a new parallel cursor on the given DBI.
|
||||
pub(crate) fn new(
|
||||
txn: &'txn Transaction<RW>,
|
||||
dbi: ffi::MDBX_dbi,
|
||||
txn_ptr: TransactionPtr,
|
||||
) -> Result<Self> {
|
||||
let mut cursor: *mut ffi::MDBX_cursor = ptr::null_mut();
|
||||
txn_ptr.txn_execute_fail_on_timeout(|ptr| unsafe {
|
||||
mdbx_result(ffi::mdbx_cursor_open(ptr, dbi, &mut cursor))
|
||||
})??;
|
||||
Ok(Self { _txn: txn, cursor, txn_ptr })
|
||||
}
|
||||
|
||||
/// Returns a raw pointer to the underlying MDBX cursor.
|
||||
#[inline]
|
||||
pub const fn cursor(&self) -> *mut ffi::MDBX_cursor {
|
||||
self.cursor
|
||||
}
|
||||
|
||||
/// Puts a key/data pair into the database.
|
||||
pub fn put(&mut self, key: &[u8], data: &[u8], flags: WriteFlags) -> Result<()> {
|
||||
let key_val = ffi::MDBX_val { iov_len: key.len(), iov_base: key.as_ptr() as *mut c_void };
|
||||
let mut data_val =
|
||||
ffi::MDBX_val { iov_len: data.len(), iov_base: data.as_ptr() as *mut c_void };
|
||||
|
||||
let ret =
|
||||
unsafe { ffi::mdbx_cursor_put(self.cursor, &key_val, &mut data_val, flags.bits()) };
|
||||
mdbx_result(ret)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Deletes the current key/data pair.
|
||||
pub fn del(&mut self, flags: WriteFlags) -> Result<()> {
|
||||
mdbx_result(unsafe { ffi::mdbx_cursor_del(self.cursor, flags.bits()) })?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Position at first key/data item.
|
||||
pub fn first<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_FIRST)
|
||||
}
|
||||
|
||||
/// Position at last key/data item.
|
||||
pub fn last<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_LAST)
|
||||
}
|
||||
|
||||
/// Position at next data item.
|
||||
#[expect(clippy::should_implement_trait)]
|
||||
pub fn next<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_NEXT)
|
||||
}
|
||||
|
||||
/// Position at previous data item.
|
||||
pub fn prev<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_PREV)
|
||||
}
|
||||
|
||||
/// Position at specified key.
|
||||
pub fn set<Value>(&mut self, key: &[u8]) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_value(Some(key), None, MDBX_SET_KEY)
|
||||
}
|
||||
|
||||
/// Position at specified key, returning key and value.
|
||||
pub fn set_key<Key, Value>(&mut self, key: &[u8]) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(Some(key), None, MDBX_SET_KEY)
|
||||
}
|
||||
|
||||
/// Position at first key >= specified key.
|
||||
pub fn set_range<Key, Value>(&mut self, key: &[u8]) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(Some(key), None, MDBX_SET_RANGE)
|
||||
}
|
||||
|
||||
/// Return key/data at current cursor position.
|
||||
pub fn get_current<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_GET_CURRENT)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at first data item of current key.
|
||||
pub fn first_dup<Value>(&mut self) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_value(None, None, MDBX_FIRST_DUP)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at last data item of current key.
|
||||
pub fn last_dup<Value>(&mut self) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_value(None, None, MDBX_LAST_DUP)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at next data item of current key.
|
||||
pub fn next_dup<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_NEXT_DUP)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at previous data item of current key.
|
||||
pub fn prev_dup<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_PREV_DUP)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at key/data pair.
|
||||
pub fn get_both<Value>(&mut self, k: &[u8], v: &[u8]) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_value(Some(k), Some(v), MDBX_GET_BOTH)
|
||||
}
|
||||
|
||||
/// [`DatabaseFlags::DUP_SORT`]-only: Position at given key and at first data >= specified.
|
||||
pub fn get_both_range<Value>(&mut self, k: &[u8], v: &[u8]) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_value(Some(k), Some(v), MDBX_GET_BOTH_RANGE)
|
||||
}
|
||||
|
||||
/// Position at first key > current key.
|
||||
pub fn next_nodup<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_NEXT_NODUP)
|
||||
}
|
||||
|
||||
/// Position at last key < current key.
|
||||
pub fn prev_nodup<Key, Value>(&mut self) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
self.get_full(None, None, MDBX_PREV_NODUP)
|
||||
}
|
||||
|
||||
fn get_full<Key, Value>(
|
||||
&mut self,
|
||||
key: Option<&[u8]>,
|
||||
data: Option<&[u8]>,
|
||||
op: MDBX_cursor_op,
|
||||
) -> Result<Option<(Key, Value)>>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
let (k, v, _) = mdbx_try_optional!(self.get::<Key, Value>(key, data, op));
|
||||
Ok(Some((k.unwrap(), v)))
|
||||
}
|
||||
|
||||
fn get_value<Value>(
|
||||
&mut self,
|
||||
key: Option<&[u8]>,
|
||||
data: Option<&[u8]>,
|
||||
op: MDBX_cursor_op,
|
||||
) -> Result<Option<Value>>
|
||||
where
|
||||
Value: TableObject,
|
||||
{
|
||||
let (_, v, _) = mdbx_try_optional!(self.get::<(), Value>(key, data, op));
|
||||
Ok(Some(v))
|
||||
}
|
||||
|
||||
fn get<Key, Value>(
|
||||
&self,
|
||||
key: Option<&[u8]>,
|
||||
data: Option<&[u8]>,
|
||||
op: MDBX_cursor_op,
|
||||
) -> Result<(Option<Key>, Value, bool)>
|
||||
where
|
||||
Key: TableObject,
|
||||
Value: TableObject,
|
||||
{
|
||||
unsafe {
|
||||
let mut key_val = slice_to_val(key);
|
||||
let mut data_val = slice_to_val(data);
|
||||
let key_ptr = key_val.iov_base;
|
||||
let data_ptr = data_val.iov_base;
|
||||
|
||||
let rc = ffi::mdbx_cursor_get(self.cursor, &mut key_val, &mut data_val, op);
|
||||
let v = mdbx_result(rc)?;
|
||||
|
||||
if data_val.iov_base.is_null() && data_val.iov_len > 0 {
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
if key_val.iov_base.is_null() && key_val.iov_len > 0 {
|
||||
return Err(Error::NotFound);
|
||||
}
|
||||
|
||||
assert_ne!(data_ptr, data_val.iov_base);
|
||||
let key_out = if ptr::eq(key_ptr, key_val.iov_base) {
|
||||
None
|
||||
} else {
|
||||
Some(Key::decode_val::<RW>(self.txn_ptr.as_ptr(), key_val)?)
|
||||
};
|
||||
let data_out = Value::decode_val::<RW>(self.txn_ptr.as_ptr(), data_val)?;
|
||||
Ok((key_out, data_out, v))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ParallelCursor<'_> {
|
||||
fn drop(&mut self) {
|
||||
unsafe { ffi::mdbx_cursor_close(self.cursor) }
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for ParallelCursor<'_> {
    /// Prints only the type name; the raw pointer fields are deliberately left out.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut builder = f.debug_struct("ParallelCursor");
        builder.finish_non_exhaustive()
    }
}
|
||||
|
||||
// SAFETY: Access to cursor is protected by the transaction borrow.
// These impls are written by hand because the raw `*mut ffi::MDBX_cursor` field keeps the
// auto traits from applying to `ParallelCursor`.
// NOTE(review): `Sync` allows `&ParallelCursor` to be shared between threads; the public
// positioning and write methods take `&mut self`, but `cursor()` hands out the raw pointer
// through `&self` — confirm that no caller uses it concurrently.
|
||||
unsafe impl Send for ParallelCursor<'_> {}
|
||||
unsafe impl Sync for ParallelCursor<'_> {}
|
||||
|
||||
/// An iterator over the key/value pairs in an MDBX database.
|
||||
#[derive(Debug)]
|
||||
pub enum IntoIter<K, Key, Value>
|
||||
|
||||
@@ -13,7 +13,7 @@ pub extern crate reth_mdbx_sys as ffi;
|
||||
|
||||
pub use crate::{
|
||||
codec::*,
|
||||
cursor::{Cursor, Iter, IterDup},
|
||||
cursor::{Cursor, Iter, IterDup, ParallelCursor},
|
||||
database::Database,
|
||||
environment::{
|
||||
Environment, EnvironmentBuilder, EnvironmentKind, Geometry, HandleSlowReadersCallback,
|
||||
@@ -21,7 +21,9 @@ pub use crate::{
|
||||
},
|
||||
error::{Error, Result},
|
||||
flags::*,
|
||||
transaction::{CommitLatency, Transaction, TransactionKind, RO, RW},
|
||||
transaction::{
|
||||
CommitLatency, SubTransaction, SubTransactionStats, Transaction, TransactionKind, RO, RW,
|
||||
},
|
||||
};
|
||||
|
||||
#[cfg(feature = "read-tx-timeouts")]
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
use crate::{
|
||||
cursor::ParallelCursor,
|
||||
database::Database,
|
||||
environment::Environment,
|
||||
error::{mdbx_result, Result},
|
||||
@@ -7,19 +8,73 @@ use crate::{
|
||||
Cursor, Error, Stat, TableObject,
|
||||
};
|
||||
use ffi::{MDBX_txn_flags_t, MDBX_TXN_RDONLY, MDBX_TXN_READWRITE};
|
||||
use parking_lot::{Mutex, MutexGuard};
|
||||
use parking_lot::{Mutex, MutexGuard, RwLock};
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
ffi::{c_uint, c_void},
|
||||
fmt::{self, Debug},
|
||||
mem::size_of,
|
||||
ptr, slice,
|
||||
sync::{atomic::AtomicBool, mpsc::sync_channel, Arc},
|
||||
sync::{
|
||||
atomic::{AtomicBool, AtomicUsize},
|
||||
mpsc::sync_channel,
|
||||
Arc,
|
||||
},
|
||||
time::Duration,
|
||||
};
|
||||
|
||||
#[cfg(feature = "read-tx-timeouts")]
|
||||
use ffi::mdbx_txn_renew;
|
||||
|
||||
//
|
||||
// # Two-Tier Safety Model for Parallel Writes
|
||||
//
|
||||
// The parallel writes feature enables multiple threads to write to the same MDBX transaction
|
||||
// concurrently, with each thread writing to a different table (DBI). This is achieved through
|
||||
// a two-tier safety model:
|
||||
//
|
||||
// ## Tier 1: libmdbx layer (`ParallelCursor<'txn>`)
|
||||
//
|
||||
// Provides **compile-time safety** via the borrow checker:
|
||||
// - [`ParallelCursor`] borrows `&'txn Transaction<RW>`, creating a lifetime dependency
|
||||
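// - The borrow checker prevents calling [`Transaction::abort_subtxns`] (`&mut self`) while cursors
//   exist; [`Transaction::commit_subtxns`] takes `&self` and relies on a runtime cursor check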
|
||||
// - Use [`Transaction::cursor_with_dbi_parallel`] to obtain this cursor type
|
||||
//
|
||||
// ## Tier 2: reth-db layer (`DbTxMut` trait)
|
||||
//
|
||||
// Provides **runtime safety** via interior mutability:
|
||||
// - All `DbTxMut` trait methods use `&self` to enable concurrent shared access from multiple
|
||||
// threads
|
||||
// - The `finished` [`AtomicBool`] on each [`SubTransaction`] prevents double-commit/abort at runtime
|
||||
// - Use [`Transaction::cursor_with_dbi_parallel_owned`] for compatibility with the `DbTxMut` trait,
|
||||
// which returns an owned cursor that doesn't borrow the transaction
|
||||
//
|
||||
// ## Key Invariants
|
||||
//
|
||||
// - **1 DBI = 1 SUBTXN = 1 THREAD**: Each table gets exactly one subtransaction, and each
|
||||
// subtransaction must only be accessed by one thread at a time
|
||||
// - **WRITEMAP mode required**: The environment must be opened with `MDBX_WRITEMAP`
|
||||
// - **Subtxn commit order**: All subtransactions must be committed via [`commit_subtxns`] before
|
||||
// the parent transaction can commit
|
||||
// - **Cursor lifetime**: All cursors must be dropped before calling [`commit_subtxns`]
|
||||
//
|
||||
// ## Usage Example
|
||||
//
|
||||
// ```ignore
|
||||
// // Compile-time safe (libmdbx layer)
|
||||
// let txn = env.begin_rw_txn()?;
|
||||
// txn.enable_parallel_writes(&[dbi])?;
|
||||
// {
|
||||
// let mut cursor = txn.cursor_with_dbi_parallel(dbi)?;
|
||||
// cursor.put(b"key", b"value", WriteFlags::empty())?;
|
||||
// } // cursor dropped here - borrow checker enforces this
|
||||
// txn.commit_subtxns()?;
|
||||
// txn.commit()?;
|
||||
// ```
|
||||
//
|
||||
// [`commit_subtxns`]: Transaction::commit_subtxns
|
||||
//
|
||||
|
||||
mod private {
|
||||
use super::*;
|
||||
|
||||
@@ -63,6 +118,121 @@ where
|
||||
K: TransactionKind,
|
||||
{
|
||||
inner: Arc<TransactionInner<K>>,
|
||||
/// Map of DBI to subtransaction pointer for parallel writes.
|
||||
/// Only used for RW transactions with parallel writes enabled.
|
||||
subtxns: Arc<RwLock<HashMap<ffi::MDBX_dbi, SubTransaction>>>,
|
||||
/// Whether parallel writes mode is enabled.
|
||||
/// Wrapped in Arc to ensure clones share the same flag state.
|
||||
parallel_writes_enabled: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
/// Statistics for a parallel subtransaction.
|
||||
#[derive(Debug, Clone, Copy, Default)]
|
||||
pub struct SubTransactionStats {
|
||||
/// Pages allocated from pre-distributed arena.
|
||||
pub arena_page_allocations: usize,
|
||||
/// Times fallback to parent was needed (arena refill events).
|
||||
pub arena_refill_events: usize,
|
||||
/// Initial pages distributed to this subtxn.
|
||||
pub arena_initial_pages: usize,
|
||||
/// Additional pages acquired from parent during fallback.
|
||||
pub arena_refill_pages: usize,
|
||||
/// Pages returned to parent on commit (not consumed).
|
||||
pub pages_unused: usize,
|
||||
/// Original arena hint for this subtxn.
|
||||
pub arena_hint: usize,
|
||||
/// DBI this subtxn is bound to.
|
||||
pub assigned_dbi: ffi::MDBX_dbi,
|
||||
/// Pages reclaimed from GC (garbage collector / freeDB).
|
||||
pub pages_from_gc: usize,
|
||||
/// Pages allocated from end-of-file (extending the database).
|
||||
pub pages_from_eof: usize,
|
||||
}
|
||||
|
||||
/// A subtransaction for parallel writes.
|
||||
/// Each subtransaction is bound to a single DBI.
|
||||
#[derive(Debug)]
|
||||
pub struct SubTransaction {
|
||||
/// Transaction pointer with mutex locking for thread-safety.
|
||||
txn_ptr: TransactionPtr,
|
||||
/// The DBI this subtransaction is bound to.
|
||||
dbi: ffi::MDBX_dbi,
|
||||
/// Whether this subtransaction has been finished (committed or aborted).
|
||||
/// Used to prevent double-commit/abort operations.
|
||||
finished: AtomicBool,
|
||||
/// Whether this subtransaction was successfully committed.
|
||||
/// Used by parent transaction to verify all subtxns were committed before parent commit.
|
||||
committed: AtomicBool,
|
||||
}
|
||||
|
||||
impl SubTransaction {
|
||||
/// Creates a new subtransaction wrapper.
|
||||
fn new(ptr: *mut ffi::MDBX_txn, dbi: ffi::MDBX_dbi) -> Self {
|
||||
Self {
|
||||
txn_ptr: TransactionPtr::new(ptr),
|
||||
dbi,
|
||||
finished: AtomicBool::new(false),
|
||||
committed: AtomicBool::new(false),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a clone of the transaction pointer.
|
||||
pub(crate) fn txn_ptr(&self) -> TransactionPtr {
|
||||
self.txn_ptr.clone()
|
||||
}
|
||||
|
||||
/// Returns the DBI this subtransaction is bound to.
|
||||
pub fn dbi(&self) -> ffi::MDBX_dbi {
|
||||
self.dbi
|
||||
}
|
||||
|
||||
/// Commits this subtransaction, merging changes to parent.
|
||||
pub fn commit(&self) -> Result<()> {
|
||||
if self.finished.swap(true, std::sync::atomic::Ordering::SeqCst) {
|
||||
return Ok(());
|
||||
}
|
||||
self.txn_ptr.txn_execute_fail_on_timeout(|ptr| {
|
||||
mdbx_result(unsafe { ffi::mdbx_subtx_commit(ptr) })
|
||||
})??;
|
||||
self.txn_ptr.set_invalidated();
|
||||
self.committed.store(true, std::sync::atomic::Ordering::SeqCst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Aborts this subtransaction.
|
||||
pub fn abort(&self) -> Result<()> {
|
||||
if self.finished.swap(true, std::sync::atomic::Ordering::SeqCst) {
|
||||
return Ok(());
|
||||
}
|
||||
self.txn_ptr.txn_execute_fail_on_timeout(|ptr| {
|
||||
mdbx_result(unsafe { ffi::mdbx_subtx_abort(ptr) })
|
||||
})??;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Returns statistics for this subtransaction.
|
||||
pub fn get_stats(&self) -> Result<SubTransactionStats> {
|
||||
self.txn_ptr.txn_execute_fail_on_timeout(|ptr| {
|
||||
let mut stats: ffi::MDBX_subtxn_stats = unsafe { std::mem::zeroed() };
|
||||
mdbx_result(unsafe { ffi::mdbx_subtxn_get_stats(ptr, &mut stats) })?;
|
||||
Ok(SubTransactionStats {
|
||||
arena_page_allocations: stats.arena_page_allocations,
|
||||
arena_refill_events: stats.arena_refill_events,
|
||||
arena_initial_pages: stats.arena_initial_pages,
|
||||
arena_refill_pages: stats.arena_refill_pages,
|
||||
pages_unused: stats.pages_unused,
|
||||
arena_hint: stats.arena_hint,
|
||||
assigned_dbi: stats.assigned_dbi,
|
||||
pages_from_gc: stats.pages_from_gc,
|
||||
pages_from_eof: stats.pages_from_eof,
|
||||
})
|
||||
})?
|
||||
}
|
||||
|
||||
/// Returns the number of active cursors on this subtransaction.
|
||||
pub fn cursor_count(&self) -> usize {
|
||||
self.txn_ptr.cursor_count()
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> Transaction<K>
|
||||
@@ -98,7 +268,11 @@ where
|
||||
_marker: Default::default(),
|
||||
};
|
||||
|
||||
Self { inner: Arc::new(inner) }
|
||||
Self {
|
||||
inner: Arc::new(inner),
|
||||
subtxns: Arc::new(RwLock::new(HashMap::new())),
|
||||
parallel_writes_enabled: Arc::new(AtomicBool::new(false)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Executes the given closure once the lock on the transaction is acquired.
|
||||
@@ -170,7 +344,23 @@ where
|
||||
/// Commits the transaction.
|
||||
///
|
||||
/// Any pending operations will be saved.
|
||||
///
|
||||
/// # Errors
|
||||
/// Returns `Error::Busy` if parallel writes is enabled and subtransactions
|
||||
/// have not been committed via `commit_subtxns()`.
|
||||
pub fn commit(self) -> Result<CommitLatency> {
|
||||
// Check that all subtxns are committed before allowing parent commit
|
||||
let parallel_enabled =
|
||||
self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst);
|
||||
if parallel_enabled {
|
||||
let subtxns = self.subtxns.read();
|
||||
for subtxn in subtxns.values() {
|
||||
if !subtxn.committed.load(std::sync::atomic::Ordering::SeqCst) {
|
||||
return Err(Error::Busy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
match self.txn_execute(|txn| {
|
||||
if K::IS_READ_ONLY {
|
||||
#[cfg(feature = "read-tx-timeouts")]
|
||||
@@ -269,6 +459,136 @@ where
|
||||
self.env().txn_manager().remove_active_read_transaction(self.inner.txn.txn);
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns whether parallel writes mode is enabled.
|
||||
///
|
||||
/// Always returns false for read-only transactions.
|
||||
pub fn is_parallel_writes_enabled(&self) -> bool {
|
||||
if K::IS_READ_ONLY {
|
||||
false
|
||||
} else {
|
||||
self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst)
|
||||
}
|
||||
}
|
||||
|
||||
/// Commits all subtransactions serially.
|
||||
///
|
||||
/// This is a no-op for read-only transactions or if subtxns already committed.
|
||||
/// After calling this, `parallel_writes_enabled` is set to false to prevent
|
||||
/// double-commit attempts.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `Error::Busy` if any subtransaction has active cursors that haven't been dropped.
|
||||
/// This is a runtime safety check to prevent use-after-free when using
|
||||
/// `cursor_with_dbi_parallel_owned`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// When using `ParallelCursor`, all cursors must be dropped before calling this method.
|
||||
/// `ParallelCursor` enforces this at compile time by borrowing the transaction.
|
||||
/// When using `cursor_with_dbi_parallel_owned`, callers must ensure cursors are dropped
|
||||
/// before calling this method.
|
||||
pub fn commit_subtxns(&self) -> Result<()> {
|
||||
if K::IS_READ_ONLY ||
|
||||
!self.parallel_writes_enabled.swap(false, std::sync::atomic::Ordering::SeqCst)
|
||||
{
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let subtxns = self.subtxns.read();
|
||||
|
||||
// Runtime safety check: detect active cursors on subtransactions
|
||||
for subtxn in subtxns.values() {
|
||||
let cursor_count = subtxn.cursor_count();
|
||||
if cursor_count > 0 {
|
||||
tracing::error!(
|
||||
target: "libmdbx",
|
||||
dbi = subtxn.dbi(),
|
||||
cursor_count,
|
||||
"commit_subtxns() called with active cursors on subtransaction - \
|
||||
this would cause use-after-free. All cursors must be dropped before commit."
|
||||
);
|
||||
return Err(Error::Busy);
|
||||
}
|
||||
}
|
||||
|
||||
for subtxn in subtxns.values() {
|
||||
subtxn.commit()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Commits all subtransactions serially and returns their stats.
|
||||
///
|
||||
/// Stats are collected BEFORE commit (commit invalidates the subtxn pointer).
|
||||
/// Returns a vector of (dbi, stats) pairs for each subtransaction.
|
||||
///
|
||||
/// This is a no-op for read-only transactions, returning an empty vector.
|
||||
/// After calling this, `parallel_writes_enabled` is set to false to prevent
|
||||
/// double-commit attempts.
|
||||
///
|
||||
/// # Errors
|
||||
///
|
||||
/// Returns `Error::Busy` if any subtransaction has active cursors that haven't been dropped.
|
||||
/// This is a runtime safety check to prevent use-after-free when using
|
||||
/// `cursor_with_dbi_parallel_owned`.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// When using `ParallelCursor`, all cursors must be dropped before calling this method.
|
||||
/// `ParallelCursor` enforces this at compile time by borrowing the transaction.
|
||||
/// When using `cursor_with_dbi_parallel_owned`, callers must ensure cursors are dropped
|
||||
/// before calling this method.
|
||||
pub fn commit_subtxns_with_stats(&self) -> Result<Vec<(ffi::MDBX_dbi, SubTransactionStats)>> {
|
||||
if K::IS_READ_ONLY ||
|
||||
!self.parallel_writes_enabled.swap(false, std::sync::atomic::Ordering::SeqCst)
|
||||
{
|
||||
return Ok(Vec::new());
|
||||
}
|
||||
|
||||
let subtxns = self.subtxns.read();
|
||||
|
||||
// Runtime safety check: detect active cursors on subtransactions
|
||||
for subtxn in subtxns.values() {
|
||||
let cursor_count = subtxn.cursor_count();
|
||||
if cursor_count > 0 {
|
||||
tracing::error!(
|
||||
target: "libmdbx",
|
||||
dbi = subtxn.dbi(),
|
||||
cursor_count,
|
||||
"commit_subtxns_with_stats() called with active cursors on subtransaction - \
|
||||
this would cause use-after-free. All cursors must be dropped before commit."
|
||||
);
|
||||
return Err(Error::Busy);
|
||||
}
|
||||
}
|
||||
|
||||
let mut stats_vec = Vec::with_capacity(subtxns.len());
|
||||
|
||||
let mut total_page_allocations = 0usize;
|
||||
let mut total_refill_events = 0usize;
|
||||
let mut total_initial_pages = 0usize;
|
||||
let mut total_refill_pages = 0usize;
|
||||
let mut total_unused = 0usize;
|
||||
let mut total_from_gc = 0usize;
|
||||
let mut total_from_eof = 0usize;
|
||||
|
||||
for subtxn in subtxns.values() {
|
||||
let stats = subtxn.get_stats()?;
|
||||
total_page_allocations += stats.arena_page_allocations;
|
||||
total_refill_events += stats.arena_refill_events;
|
||||
total_initial_pages += stats.arena_initial_pages;
|
||||
total_refill_pages += stats.arena_refill_pages;
|
||||
total_unused += stats.pages_unused;
|
||||
total_from_gc += stats.pages_from_gc;
|
||||
total_from_eof += stats.pages_from_eof;
|
||||
subtxn.commit()?;
|
||||
stats_vec.push((subtxn.dbi(), stats));
|
||||
}
|
||||
|
||||
Ok(stats_vec)
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> Clone for Transaction<K>
|
||||
@@ -276,7 +596,11 @@ where
|
||||
K: TransactionKind,
|
||||
{
|
||||
fn clone(&self) -> Self {
|
||||
Self { inner: Arc::clone(&self.inner) }
|
||||
Self {
|
||||
inner: Arc::clone(&self.inner),
|
||||
subtxns: Arc::clone(&self.subtxns),
|
||||
parallel_writes_enabled: Arc::clone(&self.parallel_writes_enabled),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -289,6 +613,25 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
impl<K> Drop for Transaction<K>
|
||||
where
|
||||
K: TransactionKind,
|
||||
{
|
||||
fn drop(&mut self) {
|
||||
// Only abort subtxns if this is the last reference to the shared Arc.
|
||||
// Clone shares the subtxns Arc, so we must not abort if other clones exist.
|
||||
if Arc::strong_count(&self.subtxns) == 1 &&
|
||||
self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst)
|
||||
{
|
||||
let subtxns = self.subtxns.read();
|
||||
for subtxn in subtxns.values() {
|
||||
let _ = subtxn.abort();
|
||||
}
|
||||
}
|
||||
// TransactionInner::drop will handle aborting the parent transaction
|
||||
}
|
||||
}
|
||||
|
||||
/// Internals of a transaction.
|
||||
struct TransactionInner<K>
|
||||
where
|
||||
@@ -534,6 +877,267 @@ impl Transaction<RW> {
|
||||
rx.recv().unwrap().map(|ptr| Self::new_from_ptr(self.env().clone(), ptr.0))
|
||||
})?
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode by creating subtransactions for the given DBIs.
|
||||
///
|
||||
/// Each DBI gets its own subtransaction that can be written to from a different thread.
|
||||
/// Cursor operations on these DBIs will automatically use the corresponding subtransaction.
|
||||
///
|
||||
/// This requires WRITEMAP mode to be enabled on the environment.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `dbis` - Slice of DBI handles to create subtransactions for.
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, or an error if subtransaction creation fails.
|
||||
pub fn enable_parallel_writes(&self, dbis: &[ffi::MDBX_dbi]) -> Result<()> {
|
||||
let specs_with_hints: Vec<_> = dbis.iter().map(|&dbi| (dbi, 0usize)).collect();
|
||||
self.enable_parallel_writes_with_hints(&specs_with_hints)
|
||||
}
|
||||
|
||||
/// Enables parallel writes mode with arena size hints for specified DBIs.
|
||||
///
|
||||
/// Similar to [`enable_parallel_writes`], but allows specifying an arena_hint
|
||||
/// for each DBI to guide page pre-allocation. An arena_hint of 0 means use
|
||||
/// equal distribution among all subtransactions.
|
||||
///
|
||||
/// # Arguments
|
||||
/// * `specs_input` - Slice of (DBI, arena_hint) tuples.
|
||||
///
|
||||
/// # Returns
|
||||
/// Ok(()) on success, or an error if subtransaction creation fails.
|
||||
pub fn enable_parallel_writes_with_hints(
|
||||
&self,
|
||||
specs_input: &[(ffi::MDBX_dbi, usize)],
|
||||
) -> Result<()> {
|
||||
if specs_input.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Check if already enabled
|
||||
if self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst) {
|
||||
return Err(Error::Incompatible);
|
||||
}
|
||||
|
||||
// Debug: verify parent can read BEFORE subtxn creation
|
||||
for &(dbi, _) in specs_input {
|
||||
self.txn_execute(|txn| unsafe {
|
||||
let mut cursor: *mut ffi::MDBX_cursor = ptr::null_mut();
|
||||
let rc = ffi::mdbx_cursor_open(txn, dbi, &mut cursor);
|
||||
if rc == 0 {
|
||||
ffi::mdbx_cursor_close(cursor);
|
||||
}
|
||||
})?;
|
||||
}
|
||||
|
||||
// Pre-touch each DBI to ensure MAIN_DBI is dirty in parent.
|
||||
// This prevents races in subtxns when they try to modify the B-tree.
|
||||
// We do this by performing a put+delete operation which triggers cursor_touch/touch_dbi.
|
||||
for &(dbi, _) in specs_input {
|
||||
// Check if this is a DupSort table - they need special handling
|
||||
let db_flags = self.db_flags(dbi)?;
|
||||
let is_dupsort = db_flags.contains(DatabaseFlags::DUP_SORT);
|
||||
|
||||
self.txn_execute(|txn| unsafe {
|
||||
let mut cursor: *mut ffi::MDBX_cursor = ptr::null_mut();
|
||||
let rc = ffi::mdbx_cursor_open(txn, dbi, &mut cursor);
|
||||
if rc != 0 {
|
||||
return;
|
||||
}
|
||||
|
||||
// Use a max key to touch the DBI - this won't conflict with real data
|
||||
// since it's well beyond any reasonable tx_num
|
||||
let temp_key: [u8; 8] = [0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF];
|
||||
let temp_data: [u8; 1] = [0];
|
||||
let mut key = ffi::MDBX_val {
|
||||
iov_len: temp_key.len(),
|
||||
iov_base: temp_key.as_ptr() as *mut c_void,
|
||||
};
|
||||
let mut data = ffi::MDBX_val {
|
||||
iov_len: temp_data.len(),
|
||||
iov_base: temp_data.as_ptr() as *mut c_void,
|
||||
};
|
||||
|
||||
// Put triggers cursor_touch which marks MAIN_DBI as dirty.
|
||||
// For DupSort tables, use NODUPDATA to avoid adding duplicate entries
|
||||
// and properly handle the case where the key+value already exists.
|
||||
let put_flags = if is_dupsort { ffi::MDBX_NODUPDATA } else { 0 };
|
||||
let put_rc = ffi::mdbx_cursor_put(cursor, &mut key, &mut data, put_flags);
|
||||
|
||||
// Delete the temp entry we just inserted (put_rc == 0 means success).
|
||||
// MDBX_KEYEXIST (-30799) means the key (or key+value for DupSort) already
|
||||
// exists, which still triggers the touch - no cleanup needed.
|
||||
if put_rc == 0 {
|
||||
ffi::mdbx_cursor_del(cursor, 0);
|
||||
}
|
||||
|
||||
ffi::mdbx_cursor_close(cursor);
|
||||
})?;
|
||||
}
|
||||
|
||||
// Create specs array for the C API
|
||||
let specs: Vec<ffi::MDBX_subtxn_spec_t> = specs_input
|
||||
.iter()
|
||||
.map(|&(dbi, arena_hint)| ffi::MDBX_subtxn_spec_t { dbi, arena_hint })
|
||||
.collect();
|
||||
|
||||
// Allocate space for subtransaction pointers
|
||||
let mut subtxn_ptrs: Vec<*mut ffi::MDBX_txn> = vec![ptr::null_mut(); specs_input.len()];
|
||||
|
||||
// Create all subtransactions atomically
|
||||
let create_result = self.txn_execute(|parent_txn| unsafe {
|
||||
let rc = ffi::mdbx_txn_create_subtxns(
|
||||
parent_txn,
|
||||
specs.as_ptr(),
|
||||
specs.len(),
|
||||
subtxn_ptrs.as_mut_ptr(),
|
||||
);
|
||||
mdbx_result(rc)
|
||||
});
|
||||
create_result??;
|
||||
|
||||
// Store subtransactions in the map
|
||||
{
|
||||
let mut subtxns = self.subtxns.write();
|
||||
for (i, &(dbi, _)) in specs_input.iter().enumerate() {
|
||||
subtxns.insert(dbi, SubTransaction::new(subtxn_ptrs[i], dbi));
|
||||
}
|
||||
}
|
||||
|
||||
self.parallel_writes_enabled.store(true, std::sync::atomic::Ordering::SeqCst);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Gets the subtransaction pointer for the given DBI, if parallel writes is enabled.
|
||||
///
|
||||
/// Returns the subtransaction pointer if one exists for this DBI.
|
||||
/// Returns an error if parallel writes is enabled but no subtxn exists for this DBI
|
||||
/// (prevents accidental cross-DBI access which would bypass subtxn isolation).
|
||||
/// Falls back to parent txn only if parallel writes is not enabled.
|
||||
pub(crate) fn get_txn_ptr_for_dbi(&self, dbi: ffi::MDBX_dbi) -> Result<TransactionPtr> {
|
||||
let parallel_enabled =
|
||||
self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst);
|
||||
if parallel_enabled {
|
||||
let subtxns = self.subtxns.read();
|
||||
if let Some(subtxn) = subtxns.get(&dbi) {
|
||||
return Ok(subtxn.txn_ptr());
|
||||
}
|
||||
// Parallel writes enabled but no subtxn for this DBI - reject to enforce isolation
|
||||
return Err(Error::Access);
|
||||
}
|
||||
Ok(self.inner.txn.clone())
|
||||
}
|
||||
|
||||
/// Aborts all subtransactions.
|
||||
///
|
||||
/// This discards all changes made through subtransactions.
|
||||
///
|
||||
/// Takes `&mut self` to ensure no `ParallelCursor` borrows exist (compile-time safety).
|
||||
pub fn abort_subtxns(&mut self) -> Result<()> {
|
||||
if !self.parallel_writes_enabled.load(std::sync::atomic::Ordering::SeqCst) {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let subtxns = self.subtxns.read();
|
||||
for subtxn in subtxns.values() {
|
||||
subtxn.abort()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Stores an item into a database, using the subtransaction if parallel writes is enabled.
|
||||
///
|
||||
/// This is the parallel-writes-aware version of `put`.
|
||||
pub fn put_parallel(
|
||||
&self,
|
||||
dbi: ffi::MDBX_dbi,
|
||||
key: impl AsRef<[u8]>,
|
||||
data: impl AsRef<[u8]>,
|
||||
flags: WriteFlags,
|
||||
) -> Result<()> {
|
||||
let key = key.as_ref();
|
||||
let data = data.as_ref();
|
||||
let key_val: ffi::MDBX_val =
|
||||
ffi::MDBX_val { iov_len: key.len(), iov_base: key.as_ptr() as *mut c_void };
|
||||
let mut data_val: ffi::MDBX_val =
|
||||
ffi::MDBX_val { iov_len: data.len(), iov_base: data.as_ptr() as *mut c_void };
|
||||
|
||||
let txn_ptr = self.get_txn_ptr_for_dbi(dbi)?;
|
||||
mdbx_result(txn_ptr.txn_execute_fail_on_timeout(|txn| unsafe {
|
||||
ffi::mdbx_put(txn, dbi, &key_val, &mut data_val, flags.bits())
|
||||
})?)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Deletes an item from a database, using the subtransaction if parallel writes is enabled.
|
||||
///
|
||||
/// This is the parallel-writes-aware version of `del`.
|
||||
pub fn del_parallel(
|
||||
&self,
|
||||
dbi: ffi::MDBX_dbi,
|
||||
key: impl AsRef<[u8]>,
|
||||
data: Option<&[u8]>,
|
||||
) -> Result<bool> {
|
||||
let key = key.as_ref();
|
||||
let key_val: ffi::MDBX_val =
|
||||
ffi::MDBX_val { iov_len: key.len(), iov_base: key.as_ptr() as *mut c_void };
|
||||
let data_val: Option<ffi::MDBX_val> = data.map(|data| ffi::MDBX_val {
|
||||
iov_len: data.len(),
|
||||
iov_base: data.as_ptr() as *mut c_void,
|
||||
});
|
||||
|
||||
let txn_ptr = self.get_txn_ptr_for_dbi(dbi)?;
|
||||
mdbx_result(txn_ptr.txn_execute_fail_on_timeout(|txn| {
|
||||
if let Some(d) = data_val {
|
||||
unsafe { ffi::mdbx_del(txn, dbi, &key_val, &d) }
|
||||
} else {
|
||||
unsafe { ffi::mdbx_del(txn, dbi, &key_val, ptr::null()) }
|
||||
}
|
||||
})?)
|
||||
.map(|_| true)
|
||||
.or_else(|e| match e {
|
||||
Error::NotFound => Ok(false),
|
||||
other => Err(other),
|
||||
})
|
||||
}
|
||||
|
||||
/// Opens a cursor on the given DBI, using the subtransaction if parallel writes is enabled.
|
||||
///
|
||||
/// Returns a [`ParallelCursor`] that borrows from this transaction. The borrow checker
|
||||
/// ensures the cursor is dropped before `commit_subtxns()` or `abort_subtxns()` can be
|
||||
/// called, providing compile-time safety against use-after-free.
|
||||
///
|
||||
/// # Example
|
||||
/// ```ignore
|
||||
/// txn.enable_parallel_writes(&[dbi])?;
|
||||
/// {
|
||||
/// let mut cursor = txn.cursor_with_dbi_parallel(dbi)?;
|
||||
/// cursor.put(b"key", b"value", WriteFlags::empty())?;
|
||||
/// } // cursor dropped, borrow released
|
||||
/// txn.commit_subtxns()?; // OK - requires &mut self, no borrows exist
|
||||
/// txn.commit()?;
|
||||
/// ```
|
||||
pub fn cursor_with_dbi_parallel(&self, dbi: ffi::MDBX_dbi) -> Result<ParallelCursor<'_>> {
|
||||
let txn_ptr = self.get_txn_ptr_for_dbi(dbi)?;
|
||||
ParallelCursor::new(self, dbi, txn_ptr)
|
||||
}
|
||||
|
||||
/// Opens a cursor for parallel writes that returns `Cursor<RW>` instead of `ParallelCursor`.
|
||||
///
|
||||
/// Unlike `cursor_with_dbi_parallel`, this returns an owned `Cursor<RW>` that can be used
|
||||
/// with APIs expecting the standard cursor type. The returned cursor stores the subtransaction
|
||||
/// pointer internally and uses it for all write operations.
|
||||
///
|
||||
/// # Safety
|
||||
///
|
||||
/// The caller must ensure that `commit_subtxns()` is called before `commit()` and that
|
||||
/// all cursors are dropped before calling `commit_subtxns()`. Unlike `ParallelCursor`,
|
||||
/// this method does not provide compile-time enforcement of this constraint.
|
||||
pub fn cursor_with_dbi_parallel_owned(&self, dbi: ffi::MDBX_dbi) -> Result<Cursor<RW>> {
|
||||
let txn_ptr = self.get_txn_ptr_for_dbi(dbi)?;
|
||||
Cursor::new_with_ptr(self.clone(), dbi, txn_ptr)
|
||||
}
|
||||
}
|
||||
|
||||
/// A shareable pointer to an MDBX transaction.
|
||||
@@ -543,6 +1147,11 @@ pub(crate) struct TransactionPtr {
|
||||
#[cfg(feature = "read-tx-timeouts")]
|
||||
timed_out: Arc<AtomicBool>,
|
||||
lock: Arc<Mutex<()>>,
|
||||
invalidated: Arc<AtomicBool>,
|
||||
/// Tracks the number of active cursors opened on this transaction pointer.
|
||||
/// Used for runtime safety checks in `commit_subtxns()` to detect cursors
|
||||
/// that weren't dropped before commit.
|
||||
cursor_count: Arc<AtomicUsize>,
|
||||
}
|
||||
|
||||
impl TransactionPtr {
|
||||
@@ -552,9 +1161,39 @@ impl TransactionPtr {
|
||||
#[cfg(feature = "read-tx-timeouts")]
|
||||
timed_out: Arc::new(AtomicBool::new(false)),
|
||||
lock: Arc::new(Mutex::new(())),
|
||||
invalidated: Arc::new(AtomicBool::new(false)),
|
||||
cursor_count: Arc::new(AtomicUsize::new(0)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Increments the cursor count for this transaction pointer.
|
||||
pub(crate) fn increment_cursor_count(&self) {
|
||||
self.cursor_count.fetch_add(1, std::sync::atomic::Ordering::SeqCst);
|
||||
}
|
||||
|
||||
/// Decrements the cursor count for this transaction pointer.
|
||||
pub(crate) fn decrement_cursor_count(&self) {
|
||||
self.cursor_count.fetch_sub(1, std::sync::atomic::Ordering::SeqCst);
|
||||
}
|
||||
|
||||
/// Returns the current cursor count.
|
||||
pub(crate) fn cursor_count(&self) -> usize {
|
||||
self.cursor_count.load(std::sync::atomic::Ordering::SeqCst)
|
||||
}
|
||||
|
||||
pub(crate) fn set_invalidated(&self) {
|
||||
self.invalidated.store(true, std::sync::atomic::Ordering::SeqCst);
|
||||
}
|
||||
|
||||
fn is_invalidated(&self) -> bool {
|
||||
self.invalidated.load(std::sync::atomic::Ordering::SeqCst)
|
||||
}
|
||||
|
||||
/// Returns the raw transaction pointer.
|
||||
pub(crate) fn as_ptr(&self) -> *mut ffi::MDBX_txn {
|
||||
self.txn
|
||||
}
|
||||
|
||||
/// Returns `true` if the transaction is timed out.
|
||||
///
|
||||
/// When transaction is timed out via `TxnManager`, it's actually reset using
|
||||
@@ -598,6 +1237,10 @@ impl TransactionPtr {
|
||||
where
|
||||
F: FnOnce(*mut ffi::MDBX_txn) -> T,
|
||||
{
|
||||
if self.is_invalidated() {
|
||||
return Err(Error::BadTxn);
|
||||
}
|
||||
|
||||
let _lck = self.lock();
|
||||
|
||||
// No race condition with the `TxnManager` timing out the transaction is possible here,
|
||||
@@ -719,6 +1362,7 @@ unsafe impl Sync for TransactionPtr {}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::flags::DatabaseFlags;
|
||||
|
||||
const fn assert_send_sync<T: Send + Sync>() {}
|
||||
|
||||
@@ -727,4 +1371,26 @@ mod tests {
|
||||
assert_send_sync::<Transaction<RO>>();
|
||||
assert_send_sync::<Transaction<RW>>();
|
||||
}
|
||||
|
||||
/// While an owned parallel cursor is alive, `commit_subtxns` must report `Error::Busy`;
/// after the cursor is dropped, both the subtransaction commit and the parent commit succeed.
#[test]
fn test_commit_subtxns_fails_with_active_cursor() {
    let tmp = tempfile::tempdir().unwrap();
    let env = Environment::builder().set_max_dbs(10).write_map().open(tmp.path()).unwrap();

    let txn = env.begin_rw_txn().unwrap();
    let db = txn.create_db(Some("test_db"), DatabaseFlags::default()).unwrap();
    let dbi = db.dbi();
    txn.enable_parallel_writes(&[dbi]).unwrap();

    // Keep an owned cursor alive across the first commit attempt.
    let live_cursor = txn.cursor_with_dbi_parallel_owned(dbi).unwrap();
    let first_attempt = txn.commit_subtxns();
    assert!(
        matches!(first_attempt, Err(Error::Busy)),
        "expected Error::Busy, got {:?}",
        first_attempt
    );

    // Releasing the cursor unblocks the commit sequence.
    drop(live_cursor);
    txn.commit_subtxns().expect("commit_subtxns should succeed after cursor is dropped");
    txn.commit().expect("parent transaction commit should succeed");
}
|
||||
}
|
||||
|
||||
492
crates/storage/libmdbx-rs/tests/dupsort_parallel.rs
Normal file
492
crates/storage/libmdbx-rs/tests/dupsort_parallel.rs
Normal file
@@ -0,0 +1,492 @@
|
||||
#![allow(missing_docs)]
|
||||
|
||||
//! Test for parallel DupSort operations that would fail without per-txn page_auxbuf.
|
||||
//!
|
||||
//! This test simulates the HashedStorages pattern where multiple DupSort upserts
|
||||
//! happen concurrently, which requires thread-safe page_auxbuf handling.
|
||||
|
||||
use reth_libmdbx::*;
|
||||
use std::{
|
||||
collections::HashMap,
|
||||
sync::{Arc, Barrier},
|
||||
thread,
|
||||
};
|
||||
use tempfile::tempdir;
|
||||
|
||||
/// Stress test for DupSort upserts issued through sequential nested transactions.
///
/// Each iteration opens a write transaction, begins a nested transaction on it, upserts
/// `NUM_KEYS * VALUES_PER_KEY` duplicates and commits both. After every iteration all data
/// written so far is read back and compared with what was written.
///
/// The pattern mimics HashedStorages: B256 key -> multiple StorageEntry values.
#[test]
fn test_dupsort_upsert_stress() {
    const NUM_KEYS: usize = 100;
    const VALUES_PER_KEY: usize = 50;
    const NUM_ITERATIONS: usize = 10;
    const TABLE: &str = "hashed_storages";

    let dir = tempdir().unwrap();
    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(Geometry {
                size: Some(10 * 1024 * 1024..1024 * 1024 * 1024),
                ..Default::default()
            })
            .open(dir.path())
            .expect("Failed to open environment"),
    );

    // Create the DupSort table (like HashedStorages) up front.
    {
        let setup_txn = env.begin_rw_txn().unwrap();
        setup_txn.create_db(Some(TABLE), DatabaseFlags::DUP_SORT).expect("Failed to create table");
        setup_txn.commit().unwrap();
    }

    // Everything written so far, keyed by table key.
    let mut all_expected: HashMap<Vec<u8>, Vec<Vec<u8>>> = HashMap::new();

    for iteration in 0..NUM_ITERATIONS {
        let mut main_txn = env.begin_rw_txn().expect("Failed to begin txn");
        // Writes go through a nested transaction (like save_blocks does).
        let nested_txn = main_txn.begin_nested_txn().expect("Failed to begin nested txn");
        let db = nested_txn.open_db(Some(TABLE)).expect("Failed to open db");

        for key_id in 0..NUM_KEYS {
            // 32 ASCII bytes, standing in for a B256 key.
            let key = format!("{:032x}", key_id + iteration * 1000);
            let written = all_expected.entry(key.as_bytes().to_vec()).or_default();

            for value_id in 0..VALUES_PER_KEY {
                // StorageEntry-like value: subkey (32 bytes) followed by value (32 bytes).
                let value = format!("{:032x}{:032x}", value_id, iteration * 10000 + value_id);

                if let Err(e) =
                    nested_txn.put(db.dbi(), key.as_bytes(), value.as_bytes(), WriteFlags::UPSERT)
                {
                    panic!(
                        "Failed to put: iteration={}, key={}, value_id={}, err={:?}",
                        iteration, key_id, value_id, e
                    );
                }

                written.push(value.into_bytes());
            }
        }

        nested_txn.commit().expect("Failed to commit nested txn");
        main_txn.commit().expect("Failed to commit main txn");

        // Read everything back after the commit and compare with what was written.
        let read_txn = env.begin_ro_txn().expect("Failed to begin read txn");
        let db = read_txn.open_db(Some(TABLE)).expect("Failed to open db");
        let mut cursor = read_txn.cursor(db.dbi()).expect("Failed to create cursor");

        for (key, expected_values) in &all_expected {
            let mut actual_values: Vec<Vec<u8>> = Vec::new();
            for item in cursor.iter_dup_of::<Vec<u8>, Vec<u8>>(key) {
                let (_, value) = item.expect("Failed to read value");
                actual_values.push(value);
            }

            assert_eq!(
                actual_values.len(),
                expected_values.len(),
                "Iteration {}: key {:?} value count mismatch: got {}, expected {}",
                iteration,
                String::from_utf8_lossy(key),
                actual_values.len(),
                expected_values.len()
            );

            // Compare as sorted lists: the stored order may differ from insertion order.
            let mut expected_sorted = expected_values.clone();
            expected_sorted.sort();
            actual_values.sort();

            assert_eq!(
                actual_values,
                expected_sorted,
                "Iteration {}: key {:?} values mismatch",
                iteration,
                String::from_utf8_lossy(key)
            );
        }

        let stat = read_txn.db_stat(db.dbi()).unwrap();
        if iteration % 5 == 4 {
            println!("Iteration {}: {} entries verified", iteration + 1, stat.entries());
        }
    }

    // Final verification: the table holds exactly as many entries as were written.
    let txn = env.begin_ro_txn().unwrap();
    let db = txn.open_db(Some(TABLE)).unwrap();
    let stat = txn.db_stat(db.dbi()).unwrap();

    let total_expected: usize = all_expected.values().map(Vec::len).sum();
    assert_eq!(
        stat.entries(),
        total_expected,
        "Final entry count mismatch: got {}, expected {}",
        stat.entries(),
        total_expected
    );
    println!(
        "Final: {} entries verified (all {} keys checked)",
        stat.entries(),
        all_expected.len()
    );
}
|
||||
|
||||
/// Fills a handful of DupSort keys with many large duplicates inside one nested transaction.
///
/// The duplicate count and value size are chosen to be large enough to trigger
/// subpage -> subtree conversion. The test then checks the total entry count.
#[test]
fn test_dupsort_subpage_to_subtree_stress() {
    const NUM_KEYS: usize = 20;
    const MAX_VALUES_PER_KEY: usize = 200; // Enough to trigger subtree conversion
    const TABLE: &str = "test_db";

    let dir = tempdir().unwrap();
    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(Geometry {
                size: Some(10 * 1024 * 1024..2 * 1024 * 1024 * 1024),
                ..Default::default()
            })
            .open(dir.path())
            .expect("Failed to open environment"),
    );

    {
        let setup_txn = env.begin_rw_txn().unwrap();
        setup_txn.create_db(Some(TABLE), DatabaseFlags::DUP_SORT).expect("Failed to create table");
        setup_txn.commit().unwrap();
    }

    let mut main_txn = env.begin_rw_txn().expect("Failed to begin txn");
    let nested_txn = main_txn.begin_nested_txn().expect("Failed to begin nested txn");
    let db = nested_txn.open_db(Some(TABLE)).expect("Failed to open db");

    for key_id in 0..NUM_KEYS {
        let key = format!("key_{:08}", key_id);

        for value_id in 0..MAX_VALUES_PER_KEY {
            // Larger values fill subpages faster and trigger subtree conversion.
            let value = format!("value_{:08}_{:064}", value_id, value_id);

            if let Err(e) =
                nested_txn.put(db.dbi(), key.as_bytes(), value.as_bytes(), WriteFlags::UPSERT)
            {
                panic!("Failed to put: key={}, value_id={}, err={:?}", key_id, value_id, e);
            }
        }
    }

    nested_txn.commit().expect("Failed to commit nested txn");
    main_txn.commit().expect("Failed to commit main txn");

    // Every (key, value) pair must have been stored.
    let txn = env.begin_ro_txn().unwrap();
    let db = txn.open_db(Some(TABLE)).unwrap();
    let stat = txn.db_stat(db.dbi()).unwrap();
    println!("Subtree stress: {} entries", stat.entries());
    assert_eq!(stat.entries(), NUM_KEYS * MAX_VALUES_PER_KEY);
}
|
||||
|
||||
/// Test rapid seek + delete + upsert pattern on DupSort (like HashedStorages write pattern).
///
/// For every (address, slot) pair the test:
/// 1. seeks to the duplicate of `address` whose bytes are >= the slot subkey
///    (`get_both_range`, the key+subkey seek),
/// 2. deletes that duplicate when its first 32 bytes are exactly the slot subkey,
/// 3. upserts the new `subkey ++ value` entry.
///
/// Because each slot's previous entry is replaced rather than accumulated, the table must
/// hold exactly `NUM_ADDRESSES * SLOTS_PER_ADDRESS` entries once all iterations are done.
/// Seek and delete failures abort the test instead of being silently ignored.
#[test]
fn test_dupsort_seek_delete_upsert_pattern() {
    const NUM_ITERATIONS: usize = 20;
    const NUM_ADDRESSES: usize = 50;
    const SLOTS_PER_ADDRESS: usize = 30;

    let dir = tempdir().unwrap();

    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(Geometry {
                size: Some(10 * 1024 * 1024..1024 * 1024 * 1024),
                ..Default::default()
            })
            .open(dir.path())
            .expect("Failed to open environment"),
    );

    {
        let txn = env.begin_rw_txn().unwrap();
        txn.create_db(Some("hashed_storages"), DatabaseFlags::DUP_SORT)
            .expect("Failed to create table");
        txn.commit().unwrap();
    }

    for iteration in 0..NUM_ITERATIONS {
        let mut main_txn = env.begin_rw_txn().expect("Failed to begin txn");
        let nested_txn = main_txn.begin_nested_txn().expect("Failed to begin nested txn");
        let db = nested_txn.open_db(Some("hashed_storages")).expect("Failed to open db");
        let dbi = db.dbi();

        for addr_id in 0..NUM_ADDRESSES {
            // Simulated hashed address (32 bytes)
            let hashed_address = format!("{:032x}", addr_id);

            for slot_id in 0..SLOTS_PER_ADDRESS {
                // Simulated StorageEntry: hashed_slot (32 bytes) + value (32 bytes)
                let hashed_slot = format!("{:032x}", slot_id);
                let value = format!("{:032x}", iteration * 1000 + slot_id);
                let entry = format!("{}{}", hashed_slot, value);

                let mut cursor = nested_txn.cursor(dbi).expect("Failed to create cursor");

                // Seek by key + subkey. A plain key seek would land on the first duplicate
                // of the address and only ever match the lowest slot.
                let found = cursor
                    .get_both_range::<Vec<u8>>(hashed_address.as_bytes(), hashed_slot.as_bytes())
                    .unwrap_or_else(|e| {
                        panic!(
                            "Failed to seek: iteration={}, addr={}, slot={}, err={:?}",
                            iteration, addr_id, slot_id, e
                        )
                    });

                // The seek returns the nearest duplicate >= subkey, which may belong to a
                // later slot; only delete when the subkey prefix matches exactly.
                if let Some(existing) = found {
                    if existing.starts_with(hashed_slot.as_bytes()) {
                        cursor.del(WriteFlags::empty()).unwrap_or_else(|e| {
                            panic!(
                                "Failed to delete: iteration={}, addr={}, slot={}, err={:?}",
                                iteration, addr_id, slot_id, e
                            )
                        });
                    }
                }

                // Now upsert the new value
                nested_txn
                    .put(dbi, hashed_address.as_bytes(), entry.as_bytes(), WriteFlags::UPSERT)
                    .unwrap_or_else(|e| {
                        panic!(
                            "Failed to put: iteration={}, addr={}, slot={}, err={:?}",
                            iteration, addr_id, slot_id, e
                        )
                    });
            }
        }

        nested_txn.commit().expect("Failed to commit nested txn");
        main_txn.commit().expect("Failed to commit main txn");

        if iteration % 5 == 4 {
            let txn = env.begin_ro_txn().unwrap();
            let db = txn.open_db(Some("hashed_storages")).unwrap();
            let stat = txn.db_stat(db.dbi()).unwrap();
            println!("Iteration {}: {} entries", iteration + 1, stat.entries());
        }
    }

    // Each (address, slot) pair must be present exactly once: old entries were deleted
    // before their replacement was written.
    let txn = env.begin_ro_txn().unwrap();
    let db = txn.open_db(Some("hashed_storages")).unwrap();
    let stat = txn.db_stat(db.dbi()).unwrap();
    assert_eq!(
        stat.entries(),
        NUM_ADDRESSES * SLOTS_PER_ADDRESS,
        "stale duplicates were left behind by the seek-delete-upsert pattern"
    );

    println!("Seek-delete-upsert stress test completed!");
}
|
||||
|
||||
/// Runs DupSort writes from several threads; MDBX serializes the write transactions.
///
/// Every thread repeatedly opens a write transaction, writes a batch through a nested
/// transaction and commits. Keys are unique per (thread, iteration, entry), so the final
/// entry count must equal the total number of puts.
#[test]
fn test_dupsort_multithreaded_serialized() {
    const NUM_THREADS: usize = 4;
    const ITERATIONS_PER_THREAD: usize = 20;
    const ENTRIES_PER_ITERATION: usize = 100;
    const TABLE: &str = "test_db";

    let dir = tempdir().unwrap();
    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(Geometry {
                size: Some(10 * 1024 * 1024..1024 * 1024 * 1024),
                ..Default::default()
            })
            .open(dir.path())
            .expect("Failed to open environment"),
    );

    {
        let setup_txn = env.begin_rw_txn().unwrap();
        setup_txn.create_db(Some(TABLE), DatabaseFlags::DUP_SORT).expect("Failed to create table");
        setup_txn.commit().unwrap();
    }

    // All workers start writing at the same moment.
    let barrier = Arc::new(Barrier::new(NUM_THREADS));

    let mut handles = Vec::with_capacity(NUM_THREADS);
    for thread_id in 0..NUM_THREADS {
        let env = Arc::clone(&env);
        let barrier = Arc::clone(&barrier);

        handles.push(thread::spawn(move || {
            barrier.wait();

            for iter in 0..ITERATIONS_PER_THREAD {
                let mut main_txn = env.begin_rw_txn().expect("Failed to begin txn");
                let nested_txn = main_txn.begin_nested_txn().expect("Failed to begin nested txn");
                let db = nested_txn.open_db(Some(TABLE)).expect("Failed to open db");

                for entry in 0..ENTRIES_PER_ITERATION {
                    let key = format!("t{}_i{}_k{:04}", thread_id, iter, entry);
                    let value = format!("value_{:08}_{:08}", thread_id * 1000 + iter, entry);

                    if let Err(e) = nested_txn.put(
                        db.dbi(),
                        key.as_bytes(),
                        value.as_bytes(),
                        WriteFlags::UPSERT,
                    ) {
                        panic!(
                            "Thread {} failed at iter={}, entry={}: {:?}",
                            thread_id, iter, entry, e
                        );
                    }
                }

                nested_txn.commit().expect("Failed to commit nested txn");
                main_txn.commit().expect("Failed to commit main txn");
            }

            thread_id
        }));
    }

    for handle in handles {
        let finished = handle.join().expect("Thread panicked");
        println!("Thread {} completed", finished);
    }

    // Every put used a distinct key, so all of them must be present.
    let txn = env.begin_ro_txn().unwrap();
    let db = txn.open_db(Some(TABLE)).unwrap();
    let stat = txn.db_stat(db.dbi()).unwrap();
    println!("Multithreaded test: {} entries", stat.entries());

    let expected = NUM_THREADS * ITERATIONS_PER_THREAD * ENTRIES_PER_ITERATION;
    assert_eq!(stat.entries(), expected);
}
|
||||
|
||||
/// Writes a DupSort data set through a nested transaction and reads it back value by value.
///
/// Each key's duplicates are compared with the written values position by position, so
/// any corrupted, missing or reordered value is reported with its key and index.
#[test]
fn test_nested_txn_write_read_integrity() {
    use std::collections::BTreeMap;

    const NUM_KEYS: usize = 50;
    const VALUES_PER_KEY: usize = 20;
    const TABLE: &str = "test_db";

    let dir = tempdir().unwrap();
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(Geometry {
            size: Some(10 * 1024 * 1024..1024 * 1024 * 1024),
            ..Default::default()
        })
        .open(dir.path())
        .unwrap();

    // Create the DupSort table.
    {
        let setup_txn = env.begin_rw_txn().unwrap();
        setup_txn.create_db(Some(TABLE), DatabaseFlags::DUP_SORT).unwrap();
        setup_txn.commit().unwrap();
    }

    // Record of what gets written, per key, in write order.
    let mut expected: BTreeMap<String, Vec<String>> = BTreeMap::new();

    // Write phase: everything goes through one nested transaction.
    {
        let mut main_txn = env.begin_rw_txn().unwrap();
        let nested_txn = main_txn.begin_nested_txn().unwrap();
        let db = nested_txn.open_db(Some(TABLE)).unwrap();

        for key_id in 0..NUM_KEYS {
            let key = format!("key_{:08}", key_id);

            for value_id in 0..VALUES_PER_KEY {
                let value = format!("value_{:08}_{:08}", key_id, value_id);
                nested_txn
                    .put(db.dbi(), key.as_bytes(), value.as_bytes(), WriteFlags::empty())
                    .unwrap();
                expected.entry(key.clone()).or_default().push(value);
            }
        }

        nested_txn.commit().unwrap();
        main_txn.commit().unwrap();
    }

    // Read phase: every recorded value must come back, in the recorded order.
    {
        let txn = env.begin_ro_txn().unwrap();
        let db = txn.open_db(Some(TABLE)).unwrap();
        let mut cursor = txn.cursor(db.dbi()).unwrap();

        let mut actual_count = 0;

        for (key, expected_values) in &expected {
            let actual_values = cursor
                .iter_dup_of::<Vec<u8>, Vec<u8>>(key.as_bytes())
                .map(|item| item.map(|(_, value)| value))
                .collect::<Result<Vec<Vec<u8>>>>()
                .unwrap();

            assert_eq!(
                actual_values.len(),
                expected_values.len(),
                "Key {:?}: expected {} values, got {}",
                key,
                expected_values.len(),
                actual_values.len()
            );

            for (i, (expected_val, raw)) in
                expected_values.iter().zip(&actual_values).enumerate()
            {
                let actual_val = String::from_utf8_lossy(raw);
                assert_eq!(
                    actual_val.as_ref(),
                    expected_val.as_str(),
                    "Key {:?} value {}: expected {:?}, got {:?}",
                    key,
                    i,
                    expected_val,
                    actual_val
                );
            }

            actual_count += actual_values.len();
        }

        assert_eq!(actual_count, NUM_KEYS * VALUES_PER_KEY);
        println!("Verified {} key-value pairs", actual_count);
    }
}
|
||||
706
crates/storage/libmdbx-rs/tests/invariant_tests.rs
Normal file
706
crates/storage/libmdbx-rs/tests/invariant_tests.rs
Normal file
@@ -0,0 +1,706 @@
|
||||
//! Invariant tests for MDBX parallel subtransactions.
|
||||
//!
|
||||
//! These tests verify safety invariants that must hold to prevent corruption.
|
||||
//! They are designed to catch regressions during refactors and integrations.
|
||||
|
||||
#![allow(missing_docs)]
|
||||
use reth_libmdbx::*;
|
||||
use std::{borrow::Cow, sync::Arc, thread};
|
||||
use tempfile::tempdir;
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 1: 1 DBI = 1 SUBTXN - Cross-DBI access must fail
|
||||
// =============================================================================
|
||||
|
||||
/// When parallel writes is enabled, accessing a DBI without a subtxn must fail.
///
/// Only `dbi1` is registered for parallel writes. Opening a parallel cursor on it must
/// succeed, while opening one on `dbi2` must be rejected with `Error::Access`. Pinning the
/// exact error keeps the test from passing on an unrelated failure.
#[test]
fn test_invariant_cross_dbi_access_rejected() {
    let dir = tempdir().unwrap();
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() })
        .write_map()
        .open(dir.path())
        .unwrap();

    let txn = env.begin_rw_txn().unwrap();
    let db1 = txn.create_db(Some("table1"), DatabaseFlags::empty()).unwrap();
    let db2 = txn.create_db(Some("table2"), DatabaseFlags::empty()).unwrap();
    let dbi1 = db1.dbi();
    let dbi2 = db2.dbi();
    txn.commit().unwrap();

    let mut txn = env.begin_rw_txn().unwrap();
    // Only enable subtxn for dbi1
    txn.enable_parallel_writes(&[dbi1]).unwrap();

    // Get cursor for dbi1 - should work
    {
        let cursor1 = txn.cursor_with_dbi_parallel(dbi1);
        assert!(cursor1.is_ok(), "Cursor for assigned DBI should succeed");
    } // cursor1 dropped here before abort

    // Try to get cursor for dbi2 - should FAIL (no subtxn for it), specifically with the
    // isolation error rather than any arbitrary failure.
    {
        let cursor2 = txn.cursor_with_dbi_parallel(dbi2);
        assert!(
            matches!(cursor2, Err(Error::Access)),
            "Cursor for non-assigned DBI should fail"
        );
    }

    txn.abort_subtxns().unwrap();
    drop(txn);
}
|
||||
|
||||
/// Registers two DBIs for parallel writes, writes one key through each DBI's parallel
/// cursor, commits, and checks that each table holds the value written through its cursor.
#[test]
fn test_invariant_dbi_subtxn_isolation() {
    let dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(dir.path())
        .unwrap();

    let setup_txn = env.begin_rw_txn().unwrap();
    let db1 = setup_txn.create_db(Some("table1"), DatabaseFlags::empty()).unwrap();
    let db2 = setup_txn.create_db(Some("table2"), DatabaseFlags::empty()).unwrap();
    let (dbi1, dbi2) = (db1.dbi(), db2.dbi());
    setup_txn.commit().unwrap();

    let mut txn = env.begin_rw_txn().unwrap();
    // Both DBIs are registered - the proper usage pattern.
    txn.enable_parallel_writes(&[dbi1, dbi2]).unwrap();

    {
        // Each DBI gets its own parallel cursor.
        let cursor1 = txn.cursor_with_dbi_parallel(dbi1);
        assert!(cursor1.is_ok(), "Cursor for assigned DBI 1 should succeed");

        let cursor2 = txn.cursor_with_dbi_parallel(dbi2);
        assert!(cursor2.is_ok(), "Cursor for assigned DBI 2 should succeed");

        // One write per table.
        cursor1.unwrap().put(b"key1", b"val1", WriteFlags::empty()).unwrap();
        cursor2.unwrap().put(b"key2", b"val2", WriteFlags::empty()).unwrap();
    }

    txn.commit_subtxns().unwrap();
    txn.commit().unwrap();

    // Each table must contain the value written through its own cursor.
    let read_txn = env.begin_ro_txn().unwrap();
    let first: Option<Cow<'_, [u8]>> = read_txn.get(dbi1, b"key1").unwrap();
    let second: Option<Cow<'_, [u8]>> = read_txn.get(dbi2, b"key2").unwrap();
    assert_eq!(first.as_deref(), Some(b"val1".as_slice()));
    assert_eq!(second.as_deref(), Some(b"val2".as_slice()));
}
|
||||
|
||||
/// Listing the same DBI twice in `enable_parallel_writes` must be rejected.
#[test]
fn test_invariant_duplicate_dbi_in_subtxn_list_rejected() {
    let dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(dir.path())
        .unwrap();

    let setup_txn = env.begin_rw_txn().unwrap();
    let db = setup_txn.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = db.dbi();
    setup_txn.commit().unwrap();

    let txn = env.begin_rw_txn().unwrap();
    // The same DBI appears twice in the request.
    let outcome = txn.enable_parallel_writes(&[dbi, dbi]);
    assert!(outcome.is_err(), "Duplicate DBI should be rejected");
    drop(txn); // Abort via drop
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 2: WRITEMAP required for parallel subtxns
|
||||
// =============================================================================
|
||||
|
||||
/// Enabling parallel writes on an environment opened without `write_map()` must fail.
#[test]
fn test_invariant_non_writemap_rejected() {
    let dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    // Deliberately no `.write_map()` on this builder.
    let env =
        Environment::builder().set_max_dbs(10).set_geometry(geometry).open(dir.path()).unwrap();

    let setup_txn = env.begin_rw_txn().unwrap();
    let db = setup_txn.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = db.dbi();
    setup_txn.commit().unwrap();

    let txn = env.begin_rw_txn().unwrap();
    let outcome = txn.enable_parallel_writes(&[dbi]);
    assert!(outcome.is_err(), "Non-WRITEMAP mode should reject parallel subtxns");
    drop(txn); // Abort via drop
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 3: All subtxns must commit before parent
|
||||
// =============================================================================
|
||||
|
||||
/// Committing the parent while its subtransactions are still uncommitted must fail.
#[test]
fn test_invariant_parent_commit_blocked_while_subtxns_uncommitted() {
    let dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(dir.path())
        .unwrap();

    let setup_txn = env.begin_rw_txn().unwrap();
    let db = setup_txn.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = db.dbi();
    setup_txn.commit().unwrap();

    let txn = env.begin_rw_txn().unwrap();
    txn.enable_parallel_writes(&[dbi]).unwrap();

    // Put one entry through the subtransaction's cursor, then release the cursor.
    {
        let mut cursor = txn.cursor_with_dbi_parallel(dbi).unwrap();
        cursor.put(b"key", b"value", WriteFlags::empty()).unwrap();
    }

    // `commit_subtxns` was never called, so the parent commit has to be refused.
    let outcome = txn.commit();
    assert!(outcome.is_err(), "Parent commit should fail while subtxns uncommitted");
}
|
||||
|
||||
/// Committing the subtransactions first and the parent second must succeed, and the
/// written entry must be readable afterwards.
#[test]
fn test_invariant_commit_subtxns_then_parent_succeeds() {
    let dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(dir.path())
        .unwrap();

    let setup_txn = env.begin_rw_txn().unwrap();
    let db = setup_txn.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = db.dbi();
    setup_txn.commit().unwrap();

    let mut txn = env.begin_rw_txn().unwrap();
    txn.enable_parallel_writes(&[dbi]).unwrap();

    {
        let mut cursor = txn.cursor_with_dbi_parallel(dbi).unwrap();
        cursor.put(b"key", b"value", WriteFlags::empty()).unwrap();
    }

    // Required order: subtransactions first, parent second.
    txn.commit_subtxns().unwrap();
    txn.commit().unwrap();

    // The entry must be visible to a fresh read transaction.
    let read_txn = env.begin_ro_txn().unwrap();
    let stored: Option<Cow<'_, [u8]>> = read_txn.get(dbi, b"key").unwrap();
    assert_eq!(stored.as_deref(), Some(b"value".as_slice()));
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 4: Subtxn abort returns pages to parent
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_subtxn_abort_no_page_leak() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 100)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    // Baseline taken before any aborted work.
    let freelist_before = env.freelist().unwrap();

    // Ten rounds of "write through a subtxn, then abort everything".
    let payload = [0xAA; 200];
    for _ in 0..10 {
        let mut parent = env.begin_rw_txn().unwrap();
        parent.enable_parallel_writes_with_hints(&[(dbi, 100)]).unwrap();

        // 500 inserts of 200-byte values so the subtxn has to allocate pages.
        let mut writer = parent.cursor_with_dbi_parallel(dbi).unwrap();
        for key in (0..500u32).map(u32::to_be_bytes) {
            writer.put(&key, &payload, WriteFlags::empty()).unwrap();
        }
        drop(writer);

        // Abort the subtxns explicitly; dropping the parent aborts it as well.
        parent.abort_subtxns().unwrap();
        drop(parent);
    }

    let freelist_after = env.freelist().unwrap();

    // A small amount of slack is tolerated; anything beyond that is treated as a leak.
    let allowed = freelist_before + 50;
    assert!(
        freelist_after <= allowed,
        "Freelist grew too much after aborts: {} -> {}. Page leak on abort!",
        freelist_before,
        freelist_after
    );
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 5: Each subtxn has own page_auxbuf (DupSort safety)
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_dupsort_concurrent_safety() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 100)), ..Default::default() };
    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(geometry)
            .write_map()
            .open(tmp_dir.path())
            .unwrap(),
    );

    // Two independent DUP_SORT tables, one per writer thread.
    let setup = env.begin_rw_txn().unwrap();
    let table_a = setup.create_db(Some("dupsort1"), DatabaseFlags::DUP_SORT).unwrap();
    let table_b = setup.create_db(Some("dupsort2"), DatabaseFlags::DUP_SORT).unwrap();
    let (dbi1, dbi2) = (table_a.dbi(), table_b.dbi());
    setup.commit().unwrap();

    let parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi1, dbi2]).unwrap();

    // The parent transaction is shared with the writer threads through an `Arc`.
    let shared = Arc::new(parent);

    // Each writer pushes 1000 distinct values spread over 10 keys into its own table.
    // Per the invariant header, a page_auxbuf shared between subtxns would corrupt these writes.
    let first_writer = {
        let txn = Arc::clone(&shared);
        thread::spawn(move || {
            let mut cursor = txn.cursor_with_dbi_parallel(dbi1).unwrap();
            for i in 0..1000u32 {
                let key = (i % 10).to_be_bytes();
                let val = format!("value1_{i:05}");
                cursor.put(&key, val.as_bytes(), WriteFlags::empty()).unwrap();
            }
        })
    };

    let second_writer = {
        let txn = Arc::clone(&shared);
        thread::spawn(move || {
            let mut cursor = txn.cursor_with_dbi_parallel(dbi2).unwrap();
            for i in 0..1000u32 {
                let key = (i % 10).to_be_bytes();
                let val = format!("value2_{i:05}");
                cursor.put(&key, val.as_bytes(), WriteFlags::empty()).unwrap();
            }
        })
    };

    first_writer.join().unwrap();
    second_writer.join().unwrap();

    // Both threads are done, so this is the only remaining `Arc` handle.
    let mut parent = Arc::try_unwrap(shared).expect("All thread handles joined");
    parent.commit_subtxns().unwrap();
    parent.commit().unwrap();

    // Every one of the 1000 writes per table must have survived.
    let reader = env.begin_ro_txn().unwrap();
    let cursor_a = reader.cursor(dbi1).unwrap();
    let cursor_b = reader.cursor(dbi2).unwrap();

    let count1: usize = cursor_a.iter_slices().count();
    let count2: usize = cursor_b.iter_slices().count();

    assert_eq!(count1, 1000, "DupSort table 1 should have 1000 entries");
    assert_eq!(count2, 1000, "DupSort table 2 should have 1000 entries");
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 6: Commit serialization via mutex
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_concurrent_commits_serialized() {
    // Number of parent transactions run back to back, and keys written per table in each.
    const ITERATIONS: usize = 20;
    const KEYS_PER_ITERATION: usize = 100;

    let dir = tempdir().unwrap();
    let env = Arc::new(
        Environment::builder()
            .set_max_dbs(10)
            .set_geometry(Geometry { size: Some(0..(1024 * 1024 * 100)), ..Default::default() })
            .write_map()
            .open(dir.path())
            .unwrap(),
    );

    let txn = env.begin_rw_txn().unwrap();
    let db1 = txn.create_db(Some("t1"), DatabaseFlags::empty()).unwrap();
    let db2 = txn.create_db(Some("t2"), DatabaseFlags::empty()).unwrap();
    let db3 = txn.create_db(Some("t3"), DatabaseFlags::empty()).unwrap();
    let dbi1 = db1.dbi();
    let dbi2 = db2.dbi();
    let dbi3 = db3.dbi();
    txn.commit().unwrap();

    // Run many iterations to stress test concurrent commits
    for iteration in 0..ITERATIONS {
        let txn = env.begin_rw_txn().unwrap();
        txn.enable_parallel_writes(&[dbi1, dbi2, dbi3]).unwrap();

        let txn = Arc::new(txn);

        // One writer thread per table, all sharing the same parent transaction.
        let handles: Vec<_> = [dbi1, dbi2, dbi3]
            .iter()
            .map(|&dbi| {
                let txn = Arc::clone(&txn);
                thread::spawn(move || {
                    let mut cursor = txn.cursor_with_dbi_parallel(dbi).unwrap();
                    for i in 0..KEYS_PER_ITERATION {
                        // The key embeds the iteration, so no key is ever written twice.
                        let key = format!("iter{iteration}_key{i}");
                        let val = format!("value_{dbi}_{i}");
                        cursor.put(key.as_bytes(), val.as_bytes(), WriteFlags::UPSERT).unwrap();
                    }
                })
            })
            .collect();

        for h in handles {
            h.join().unwrap();
        }

        // All writers have joined, so the Arc can be unwrapped to regain ownership.
        let mut txn = Arc::try_unwrap(txn).expect("All thread handles joined");
        txn.commit_subtxns().unwrap();
        txn.commit().unwrap();
    }

    // Verify final state: keys are unique per (iteration, index), so UPSERT never overwrites
    // and every table must hold exactly ITERATIONS * KEYS_PER_ITERATION entries.
    let expected = ITERATIONS * KEYS_PER_ITERATION;
    let txn = env.begin_ro_txn().unwrap();
    for dbi in [dbi1, dbi2, dbi3] {
        let cursor = txn.cursor(dbi).unwrap();
        let count: usize = cursor.iter_slices().count();
        assert_eq!(
            count, expected,
            "Table {dbi} should have exactly {expected} entries, got {count}"
        );
    }
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 7: Parent direct write blocked while subtxns active
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_parent_put_blocked() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    let mut parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi]).unwrap();

    // With parallel writes enabled, a put issued directly on the parent must be rejected.
    let direct_put = parent.put(dbi, b"key", b"value", WriteFlags::empty());
    assert!(direct_put.is_err(), "Parent put should fail while subtxns active");

    // Clean up: abort the subtxns, then let the drop abort the parent.
    parent.abort_subtxns().unwrap();
    drop(parent);
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 8: Data visibility after commit
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_data_visible_after_commit() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    // Two pairs go in through the parallel cursor of a single subtxn.
    let mut parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi]).unwrap();

    let mut writer = parent.cursor_with_dbi_parallel(dbi).unwrap();
    writer.put(b"key1", b"value1", WriteFlags::empty()).unwrap();
    writer.put(b"key2", b"value2", WriteFlags::empty()).unwrap();
    drop(writer);

    parent.commit_subtxns().unwrap();
    parent.commit().unwrap();

    // Both pairs must be visible to a read transaction opened after the commit.
    let reader = env.begin_ro_txn().unwrap();
    let first: Option<Cow<'_, [u8]>> = reader.get(dbi, b"key1").unwrap();
    let second: Option<Cow<'_, [u8]>> = reader.get(dbi, b"key2").unwrap();

    assert_eq!(first.as_deref(), Some(b"value1".as_slice()));
    assert_eq!(second.as_deref(), Some(b"value2".as_slice()));
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 9: DUPSORT upsert behavior (documented in parallel-mdbx.md)
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_dupsort_upsert_appends_not_replaces() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("dupsort"), DatabaseFlags::DUP_SORT).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    // First write: plain insert of ("key", "value1").
    let first = env.begin_rw_txn().unwrap();
    first.put(dbi, b"key", b"value1", WriteFlags::empty()).unwrap();
    first.commit().unwrap();

    // Second write: UPSERT of the same key with a different value.
    let second = env.begin_rw_txn().unwrap();
    second.put(dbi, b"key", b"value2", WriteFlags::UPSERT).unwrap();
    second.commit().unwrap();

    // Read the whole table back; in a DUP_SORT table the UPSERT is expected to add a
    // duplicate rather than overwrite the existing value.
    let reader = env.begin_ro_txn().unwrap();
    let cursor = reader.cursor(dbi).unwrap();
    let entries: Vec<_> = cursor.iter_slices().collect::<Result<Vec<_>>>().unwrap();

    // Keep only the values stored under "key".
    let mut key_values = Vec::new();
    for (k, v) in &entries {
        if k.as_ref() == b"key" {
            key_values.push(v.to_vec());
        }
    }

    // Both values must be present, in duplicate sort order.
    assert_eq!(key_values.len(), 2, "DUPSORT UPSERT should append, not replace");
    assert_eq!(key_values[0], b"value1");
    assert_eq!(key_values[1], b"value2");
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 10: Arena pages exhaustion triggers fallback
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_arena_exhaustion_fallback() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 100)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    // Hint just 2 pages for the table so the writes below are bound to outgrow the hint.
    let mut parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes_with_hints(&[(dbi, 2)]).unwrap();

    // 1000 entries of 500 bytes each, far more than two pages can hold.
    let payload = [0xBB; 500];
    let mut writer = parent.cursor_with_dbi_parallel(dbi).unwrap();
    for key in (0..1000u32).map(u32::to_be_bytes) {
        writer.put(&key, &payload, WriteFlags::empty()).unwrap();
    }
    drop(writer);

    // The writes and both commits are expected to succeed despite the tiny hint.
    parent.commit_subtxns().unwrap();
    parent.commit().unwrap();

    // All 1000 entries must be present afterwards.
    let reader = env.begin_ro_txn().unwrap();
    let cursor = reader.cursor(dbi).unwrap();
    let stored: usize = cursor.iter_slices().count();
    assert_eq!(stored, 1000);
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 11: Clone shares parallel_writes_enabled flag
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_clone_shares_parallel_writes_flag() {
    // A cloned transaction handle must observe the same parallel_writes_enabled state as the
    // original, including the change made by `commit_subtxns()` on the original.
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    let mut parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi]).unwrap();

    // Take a second handle to the same transaction.
    let twin = parent.clone();

    // Right after enabling, both handles report the flag as set.
    assert!(parent.is_parallel_writes_enabled());
    assert!(twin.is_parallel_writes_enabled());

    // Write through the original handle's parallel cursor.
    let mut writer = parent.cursor_with_dbi_parallel(dbi).unwrap();
    writer.put(b"key", b"value", WriteFlags::empty()).unwrap();
    drop(writer);

    // Committing the subtxns via the original must clear the flag for the clone too.
    parent.commit_subtxns().unwrap();

    assert!(!parent.is_parallel_writes_enabled(), "Original should see flag as false");
    assert!(!twin.is_parallel_writes_enabled(), "Clone must share flag state");

    // Commit the parent through the original handle while the clone is still alive.
    parent.commit().unwrap();

    // The write must have been persisted.
    let reader = env.begin_ro_txn().unwrap();
    let stored: Option<Cow<'_, [u8]>> = reader.get(dbi, b"key").unwrap();
    assert_eq!(stored.as_deref(), Some(b"value".as_slice()));
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 12: Active cursor blocks commit_subtxns
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_active_cursor_blocks_commit_subtxns() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    let parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi]).unwrap();

    // Hold on to an owned parallel cursor so it is still alive during the first commit attempt.
    let held_cursor = parent.cursor_with_dbi_parallel_owned(dbi).unwrap();

    // While the cursor is alive, committing the subtxns must be refused with `Error::Busy`.
    let blocked = parent.commit_subtxns();
    assert!(
        matches!(blocked, Err(Error::Busy)),
        "commit_subtxns should return Error::Busy with active cursor, got {:?}",
        blocked
    );

    // Releasing the cursor unblocks the commit.
    drop(held_cursor);

    parent.commit_subtxns().expect("commit_subtxns should succeed after cursor dropped");
    parent.commit().expect("parent commit should succeed");
}
|
||||
|
||||
// =============================================================================
|
||||
// INVARIANT 13: DBI opened in previous transaction handled correctly
|
||||
// =============================================================================
|
||||
|
||||
#[test]
fn test_invariant_dbi_stale_handled() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    // First transaction: open the table and store one pair with a regular put.
    let opener = env.begin_rw_txn().unwrap();
    let table = opener.create_db(Some("table"), DatabaseFlags::empty()).unwrap();
    let dbi = table.dbi();
    opener.put(dbi, b"initial_key", b"initial_value", WriteFlags::empty()).unwrap();
    opener.commit().unwrap();

    // Second transaction: reuse the handle obtained above for parallel writes.
    let writer_txn = env.begin_rw_txn().unwrap();
    writer_txn
        .enable_parallel_writes(&[dbi])
        .expect("enable_parallel_writes should handle DBI from previous txn");

    // Add a second pair through the parallel cursor.
    let mut cursor = writer_txn.cursor_with_dbi_parallel(dbi).unwrap();
    cursor.put(b"new_key", b"new_value", WriteFlags::empty()).unwrap();
    drop(cursor);

    writer_txn.commit_subtxns().expect("commit_subtxns should succeed");
    writer_txn.commit().expect("parent commit should succeed");

    // Both the pre-existing pair and the new one must be readable.
    let reader = env.begin_ro_txn().unwrap();
    let initial: Option<Cow<'_, [u8]>> = reader.get(dbi, b"initial_key").unwrap();
    let added: Option<Cow<'_, [u8]>> = reader.get(dbi, b"new_key").unwrap();
    assert_eq!(initial.as_deref(), Some(b"initial_value".as_slice()));
    assert_eq!(added.as_deref(), Some(b"new_value".as_slice()));
}
|
||||
255
crates/storage/libmdbx-rs/tests/parallel_writes.rs
Normal file
255
crates/storage/libmdbx-rs/tests/parallel_writes.rs
Normal file
@@ -0,0 +1,255 @@
|
||||
#![allow(missing_docs)]
|
||||
use reth_libmdbx::*;
|
||||
use std::borrow::Cow;
|
||||
use tempfile::tempdir;
|
||||
|
||||
#[test]
fn test_parallel_subtx_dupsort_storage_pattern() {
    // Environment opened with `write_map()`.
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry { size: Some(0..(1024 * 1024 * 10)), ..Default::default() };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    // DUP_SORT table standing in for a storage table: key = address, dup value = subkey + value.
    let setup = env.begin_rw_txn().unwrap();
    let table = setup.create_db(Some("storage"), DatabaseFlags::DUP_SORT).unwrap();
    let dbi = table.dbi();
    setup.commit().unwrap();

    // Seed: addr1 owns key1 and key2, addr2 owns key1.
    let seed = env.begin_rw_txn().unwrap();
    {
        let mut cursor = seed.cursor(dbi).unwrap();
        cursor.put(b"addr1", b"key1\x00val1", WriteFlags::empty()).unwrap();
        cursor.put(b"addr1", b"key2\x00val2", WriteFlags::empty()).unwrap();
        cursor.put(b"addr2", b"key1\x00val3", WriteFlags::empty()).unwrap();
    }
    seed.commit().unwrap();

    // Apply a seek / delete-current / re-insert sequence through a parallel subtxn cursor.
    let mut parent = env.begin_rw_txn().unwrap();
    parent.enable_parallel_writes(&[dbi]).unwrap();

    {
        let mut cursor = parent.cursor_with_dbi_parallel(dbi).unwrap();

        let updates: [(&[u8], &[u8]); 3] = [
            (b"addr1", b"key1\x00new_val1"), // replaces an existing entry
            (b"addr1", b"key3\x00val_new"),  // brand-new entry
            (b"addr2", b"key1\x00"),         // empty value: entry is only removed
        ];

        for (addr, entry) in updates {
            // The first four bytes of the entry act as the subkey.
            let (subkey, _) = entry.split_at(4);

            // Seek to the first duplicate at or after the subkey under this address.
            let seek_result: Result<Option<Cow<'_, [u8]>>> = cursor.get_both_range(addr, subkey);

            // Only a hit that actually starts with the subkey is the old version of the entry.
            let stale_present =
                matches!(&seek_result, Ok(Some(found)) if found.starts_with(subkey));
            if stale_present {
                cursor.del(WriteFlags::CURRENT).unwrap();
            }

            // Entries longer than subkey + separator carry a value and are written back.
            let carries_value = entry.len() > 5;
            if carries_value {
                cursor.put(addr, entry, WriteFlags::empty()).unwrap();
            }
        }
    }

    parent.commit_subtxns().unwrap();
    parent.commit().unwrap();

    // Dump and check the final table contents.
    let reader = env.begin_ro_txn().unwrap();
    let cursor = reader.cursor(dbi).unwrap();
    let entries: Vec<(Cow<'_, [u8]>, Cow<'_, [u8]>)> =
        cursor.iter_slices().collect::<Result<Vec<_>>>().unwrap();
    println!("Final entries: {} items", entries.len());
    for (k, v) in &entries {
        println!("  {:?} -> {:?}", String::from_utf8_lossy(k), String::from_utf8_lossy(v));
    }
    // addr1/key1 replaced, addr1/key2 untouched, addr1/key3 added, addr2/key1 removed.
    assert_eq!(entries.len(), 3);
}
|
||||
|
||||
#[test]
fn test_parallel_subtx_dupsort_realistic_data() {
    let dir = tempdir().unwrap();
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(Geometry {
            size: Some(0..(1024 * 1024 * 100)), // 100MB
            ..Default::default()
        })
        .write_map()
        .open(dir.path())
        .unwrap();

    let txn = env.begin_rw_txn().unwrap();
    let db = txn.create_db(Some("realistic"), DatabaseFlags::DUP_SORT).unwrap();
    txn.commit().unwrap();

    // Create realistic data - 20 byte address as key, 64 byte storage entries as values
    let addr1: [u8; 20] = [0x11; 20];
    let addr2: [u8; 20] = [0x22; 20];

    // Storage entry: 32 byte key + 32 byte value
    let make_entry = |k: u8, v: u8| -> [u8; 64] {
        let mut entry = [0u8; 64];
        entry[..32].fill(k);
        entry[32..].fill(v);
        entry
    };

    // Insert initial data
    let txn = env.begin_rw_txn().unwrap();
    let dbi = db.dbi();
    {
        let mut cursor = txn.cursor(dbi).unwrap();
        cursor.put(&addr1, &make_entry(0x01, 0xAA), WriteFlags::empty()).unwrap();
        cursor.put(&addr1, &make_entry(0x02, 0xBB), WriteFlags::empty()).unwrap();
        cursor.put(&addr2, &make_entry(0x01, 0xCC), WriteFlags::empty()).unwrap();
    }
    txn.commit().unwrap();

    // Test parallel subtxn with realistic operations
    let mut txn = env.begin_rw_txn().unwrap();
    txn.enable_parallel_writes(&[dbi]).unwrap();

    {
        let mut cursor = txn.cursor_with_dbi_parallel(dbi).unwrap();

        // Update addr1's first entry - use get_both_range for DUPSORT seek
        let target = &make_entry(0x01, 0x00)[..32];
        let seek_result: Result<Option<Cow<'_, [u8]>>> = cursor.get_both_range(&addr1, target);

        // Delete only when the seek landed on an entry whose 32-byte key matches the target.
        // A bare `is_ok()` would also accept `Ok(None)`, i.e. a seek that found nothing.
        let stale_present = matches!(&seek_result, Ok(Some(found)) if found.starts_with(target));
        if stale_present {
            cursor.del(WriteFlags::CURRENT).unwrap();
        }
        cursor.put(&addr1, &make_entry(0x01, 0xFF), WriteFlags::empty()).unwrap();
    }

    txn.commit_subtxns().unwrap();
    txn.commit().unwrap();

    // Verify the committed state instead of only reporting success.
    let txn = env.begin_ro_txn().unwrap();
    let cursor = txn.cursor(dbi).unwrap();
    let entries: Vec<(Cow<'_, [u8]>, Cow<'_, [u8]>)> =
        cursor.iter_slices().collect::<Result<Vec<_>>>().unwrap();
    let has = |addr: &[u8], entry: &[u8]| {
        entries.iter().any(|(k, v)| k[..] == *addr && v[..] == *entry)
    };

    // The old (0x01, 0xAA) entry is replaced by (0x01, 0xFF); everything else is untouched.
    assert_eq!(entries.len(), 3, "update must replace the entry, not add a duplicate");
    assert!(has(&addr1[..], &make_entry(0x01, 0xFF)[..]), "updated entry missing");
    assert!(!has(&addr1[..], &make_entry(0x01, 0xAA)[..]), "stale entry still present");
    assert!(has(&addr1[..], &make_entry(0x02, 0xBB)[..]), "untouched addr1 entry missing");
    assert!(has(&addr2[..], &make_entry(0x01, 0xCC)[..]), "untouched addr2 entry missing");

    println!("Realistic data test passed!");
}
|
||||
|
||||
/// Checks that parallel subtxns reuse pages from the freelist instead of only growing the file.
#[test]
fn test_parallel_subtxn_freelist_reuse() {
    let tmp_dir = tempdir().unwrap();
    let geometry = Geometry {
        size: Some(0..(1024 * 1024 * 100)), // 100MB
        ..Default::default()
    };
    let env = Environment::builder()
        .set_max_dbs(10)
        .set_geometry(geometry)
        .write_map()
        .open(tmp_dir.path())
        .unwrap();

    // Three tables, named after the kind of tables reth keeps.
    let setup = env.begin_rw_txn().unwrap();
    let accounts = setup.create_db(Some("accounts"), DatabaseFlags::empty()).unwrap();
    let storage = setup.create_db(Some("storage"), DatabaseFlags::DUP_SORT).unwrap();
    let trie = setup.create_db(Some("trie"), DatabaseFlags::empty()).unwrap();
    let (dbi1, dbi2, dbi3) = (accounts.dbi(), storage.dbi(), trie.dbi());
    setup.commit().unwrap();

    // Seed accounts and trie with 1000 entries each so later updates touch existing pages.
    let seed = env.begin_rw_txn().unwrap();
    let seed_value = [0xAA; 100]; // ~100 bytes per entry
    for key in (0..1000u32).map(u32::to_be_bytes) {
        seed.put(dbi1, &key, &seed_value, WriteFlags::empty()).unwrap();
        seed.put(dbi3, &key, &seed_value, WriteFlags::empty()).unwrap();
    }
    seed.commit().unwrap();

    // Baseline numbers before the update rounds.
    let stat_before = env.stat().unwrap();
    let freelist_before = env.freelist().unwrap();
    println!("Before updates: pages={}, freelist={}", stat_before.leaf_pages(), freelist_before);

    // Five update rounds, each with one parallel cursor per table.
    for round in 0..5 {
        let mut txn = env.begin_rw_txn().unwrap();

        // Per-table page hints: accounts ~50, storage ~20, trie ~30.
        let hints = [(dbi1, 50), (dbi2, 20), (dbi3, 30)];
        txn.enable_parallel_writes_with_hints(&hints).unwrap();

        // (table, number of keys, value length); the cursors run one after another on this
        // thread, and each is dropped before the next one is opened.
        let fill = round as u8;
        let workloads: [(_, u32, usize); 3] = [(dbi1, 200, 100), (dbi2, 100, 64), (dbi3, 150, 80)];
        for (dbi, key_count, value_len) in workloads {
            let value = vec![fill; value_len];
            let mut cursor = txn.cursor_with_dbi_parallel(dbi).unwrap();
            for key in (0..key_count).map(u32::to_be_bytes) {
                cursor.put(&key, value.as_slice(), WriteFlags::UPSERT).unwrap();
            }
        }

        txn.commit_subtxns().unwrap();
        txn.commit().unwrap();

        let freelist_after_round = env.freelist().unwrap();
        println!("After round {}: freelist={}", round, freelist_after_round);
    }

    let stat_after = env.stat().unwrap();
    let freelist_after = env.freelist().unwrap();
    println!("After all updates: pages={}, freelist={}", stat_after.leaf_pages(), freelist_after);

    // The freelist may grow somewhat, but is capped at twice the baseline (never below 100).
    // Exceeding the cap is taken to mean pages are not being reused.
    let max_allowed_growth = freelist_before.saturating_mul(2).max(100);
    assert!(
        freelist_after <= max_allowed_growth,
        "Freelist grew too much: {} -> {} (max allowed: {}). Pages not being reused from GC!",
        freelist_before,
        freelist_after,
        max_allowed_growth
    );

    println!("Freelist reuse test passed! {} -> {}", freelist_before, freelist_after);
}
|
||||
@@ -40,7 +40,7 @@ alloy-primitives.workspace = true
|
||||
alloy-rpc-types-engine.workspace = true
|
||||
alloy-consensus.workspace = true
|
||||
revm-database.workspace = true
|
||||
revm-state = { workspace = true, optional = true }
|
||||
revm-state.workspace = true
|
||||
|
||||
# tracing
|
||||
tracing.workspace = true
|
||||
@@ -87,8 +87,8 @@ rand.workspace = true
|
||||
tokio = { workspace = true, features = ["sync", "macros", "rt-multi-thread"] }
|
||||
|
||||
[features]
|
||||
edge = ["reth-storage-api/edge", "rocksdb"]
|
||||
rocksdb = ["reth-storage-api/rocksdb", "dep:rocksdb"]
|
||||
edge = ["rocksdb"]
|
||||
test-utils = [
|
||||
"reth-db/test-utils",
|
||||
"reth-nippy-jar/test-utils",
|
||||
@@ -103,6 +103,5 @@ test-utils = [
|
||||
"reth-trie-db/test-utils",
|
||||
"reth-prune-types/test-utils",
|
||||
"reth-stages-types/test-utils",
|
||||
"revm-state",
|
||||
"tokio",
|
||||
]
|
||||
|
||||
@@ -31,6 +31,7 @@ impl<'a> DurationsRecorder<'a> {
|
||||
}
|
||||
|
||||
#[derive(Debug, Copy, Clone)]
|
||||
#[allow(dead_code)] // Edge variants used at runtime based on StorageSettings::is_v2()
|
||||
pub(crate) enum Action {
|
||||
InsertBlock,
|
||||
InsertState,
|
||||
@@ -42,6 +43,14 @@ pub(crate) enum Action {
|
||||
InsertTransactionBlocks,
|
||||
InsertTransactionSenders,
|
||||
InsertTransactionHashNumbers,
|
||||
// Parallel write actions (used when StorageSettings::is_v2() returns true)
|
||||
EdgeWritePlainAccounts,
|
||||
EdgeWriteBytecodes,
|
||||
EdgeWritePlainStorage,
|
||||
EdgeWriteHashedAccounts,
|
||||
EdgeWriteHashedStorages,
|
||||
EdgeWriteAccountTrie,
|
||||
EdgeWriteStorageTrie,
|
||||
}
|
||||
|
||||
/// Database provider metrics
|
||||
@@ -124,6 +133,181 @@ pub(crate) struct DatabaseProviderMetrics {
|
||||
save_blocks_commit_sf_last: Gauge,
|
||||
/// Last duration of `RocksDB` commit in `save_blocks`
|
||||
save_blocks_commit_rocksdb_last: Gauge,
|
||||
// Edge mode parallel write metrics
|
||||
/// Duration of PlainAccountState writes
|
||||
edge_write_plain_accounts: Histogram,
|
||||
/// Last duration of PlainAccountState writes
|
||||
edge_write_plain_accounts_last: Gauge,
|
||||
/// Duration of Bytecodes writes
|
||||
edge_write_bytecodes: Histogram,
|
||||
/// Last duration of Bytecodes writes
|
||||
edge_write_bytecodes_last: Gauge,
|
||||
/// Duration of PlainStorageState writes
|
||||
edge_write_plain_storage: Histogram,
|
||||
/// Last duration of PlainStorageState writes
|
||||
edge_write_plain_storage_last: Gauge,
|
||||
/// Duration of `HashedAccounts` writes
|
||||
edge_write_hashed_accounts: Histogram,
|
||||
/// Last duration of `HashedAccounts` writes
|
||||
edge_write_hashed_accounts_last: Gauge,
|
||||
/// Duration of `HashedStorages` writes
|
||||
edge_write_hashed_storages: Histogram,
|
||||
/// Last duration of `HashedStorages` writes
|
||||
edge_write_hashed_storages_last: Gauge,
|
||||
/// Duration of `AccountsTrie` writes
|
||||
edge_write_account_trie: Histogram,
|
||||
/// Last duration of `AccountsTrie` writes
|
||||
edge_write_account_trie_last: Gauge,
|
||||
/// Duration of `StoragesTrie` writes
|
||||
edge_write_storage_trie: Histogram,
|
||||
/// Last duration of `StoragesTrie` writes
|
||||
edge_write_storage_trie_last: Gauge,
|
||||
/// Duration of preprocessing (merging, sorting, converting)
|
||||
edge_preprocessing: Histogram,
|
||||
/// Last duration of preprocessing
|
||||
edge_preprocessing_last: Gauge,
|
||||
/// Wall-clock time for parallel writes only (excludes preprocessing)
|
||||
edge_parallel_wall: Histogram,
|
||||
/// Last wall-clock time for parallel writes
|
||||
edge_parallel_wall_last: Gauge,
|
||||
/// Total edge mode time including preprocessing
|
||||
edge_parallel_writes_total: Histogram,
|
||||
/// Last total edge mode time
|
||||
edge_parallel_writes_total_last: Gauge,
|
||||
/// Number of parallel subtxns used
|
||||
edge_parallel_subtxn_count: Histogram,
|
||||
/// Last number of parallel subtxns used
|
||||
edge_parallel_subtxn_count_last: Gauge,
|
||||
/// Storage trie seek operation count
|
||||
edge_storage_trie_seek_count: Gauge,
|
||||
/// Storage trie delete operation count
|
||||
edge_storage_trie_delete_count: Gauge,
|
||||
/// Storage trie upsert operation count
|
||||
edge_storage_trie_upsert_count: Gauge,
|
||||
}
|
||||
|
||||
/// Per-table arena hint metrics for tracking estimation quality.
|
||||
#[derive(Debug)]
|
||||
|
||||
pub(crate) struct ArenaHintMetrics {
|
||||
handles: std::collections::HashMap<&'static str, ArenaHintTableMetrics>,
|
||||
}
|
||||
|
||||
impl Default for ArenaHintMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ArenaHintMetrics {
|
||||
pub(crate) fn new() -> Self {
|
||||
use reth_db::tables;
|
||||
use reth_db_api::table::Table;
|
||||
let tables = [
|
||||
tables::PlainAccountState::NAME,
|
||||
tables::PlainStorageState::NAME,
|
||||
tables::Bytecodes::NAME,
|
||||
tables::HashedAccounts::NAME,
|
||||
tables::HashedStorages::NAME,
|
||||
tables::AccountsTrie::NAME,
|
||||
tables::StoragesTrie::NAME,
|
||||
];
|
||||
|
||||
let handles =
|
||||
tables.into_iter().map(|name| (name, ArenaHintTableMetrics::new(name))).collect();
|
||||
|
||||
Self { handles }
|
||||
}
|
||||
|
||||
pub(crate) fn record(&self, table: &'static str, detail: &super::ArenaHintDetail) {
|
||||
if let Some(metrics) = self.handles.get(&table) {
|
||||
ArenaHintTableMetrics::record(metrics, detail);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug)]
|
||||
struct ArenaHintTableMetrics {
|
||||
estimated: Gauge,
|
||||
used: Gauge,
|
||||
source: Gauge,
|
||||
}
|
||||
|
||||
impl ArenaHintTableMetrics {
|
||||
fn new(table: &'static str) -> Self {
|
||||
Self {
|
||||
estimated: metrics::gauge!("database_edge_arena_hint_estimated", "table" => table),
|
||||
used: metrics::gauge!("database_edge_arena_hint_used", "table" => table),
|
||||
source: metrics::gauge!("database_edge_arena_hint_source", "table" => table),
|
||||
}
|
||||
}
|
||||
|
||||
fn record(&self, detail: &super::ArenaHintDetail) {
|
||||
self.estimated.set(detail.estimated as f64);
|
||||
self.used.set(detail.used as f64);
|
||||
self.source.set(detail.source.as_f64());
|
||||
}
|
||||
}
|
||||
|
||||
/// Raw input counts used for arena hint estimation.
///
/// Exporting these alongside the hints makes it possible to correlate the inputs with the
/// actual page demand.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct ArenaHintInputs {
    /// Account changes contained in the batch.
    pub num_accounts: usize,
    /// Storage slot changes, summed over all addresses.
    pub num_storage: usize,
    /// Newly created contracts.
    pub num_contracts: usize,
    /// Account trie node updates.
    pub num_account_trie_nodes: usize,
    /// Storage trie node updates, summed over all addresses.
    pub num_storage_trie_nodes: usize,
    /// Distinct addresses that have storage trie updates.
    pub num_storage_trie_addresses: usize,
}
|
||||
|
||||
/// Metrics for recording arena hint estimation inputs.
|
||||
#[derive(Debug)]
|
||||
|
||||
pub(crate) struct ArenaHintInputMetrics {
|
||||
num_accounts: Gauge,
|
||||
num_storage: Gauge,
|
||||
num_contracts: Gauge,
|
||||
num_account_trie_nodes: Gauge,
|
||||
num_storage_trie_nodes: Gauge,
|
||||
num_storage_trie_addresses: Gauge,
|
||||
}
|
||||
|
||||
impl Default for ArenaHintInputMetrics {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl ArenaHintInputMetrics {
|
||||
pub(crate) fn new() -> Self {
|
||||
Self {
|
||||
num_accounts: metrics::gauge!("database_edge_input_num_accounts"),
|
||||
num_storage: metrics::gauge!("database_edge_input_num_storage"),
|
||||
num_contracts: metrics::gauge!("database_edge_input_num_contracts"),
|
||||
num_account_trie_nodes: metrics::gauge!("database_edge_input_num_account_trie_nodes"),
|
||||
num_storage_trie_nodes: metrics::gauge!("database_edge_input_num_storage_trie_nodes"),
|
||||
num_storage_trie_addresses: metrics::gauge!(
|
||||
"database_edge_input_num_storage_trie_addresses"
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn record(&self, inputs: &ArenaHintInputs) {
|
||||
self.num_accounts.set(inputs.num_accounts as f64);
|
||||
self.num_storage.set(inputs.num_storage as f64);
|
||||
self.num_contracts.set(inputs.num_contracts as f64);
|
||||
self.num_account_trie_nodes.set(inputs.num_account_trie_nodes as f64);
|
||||
self.num_storage_trie_nodes.set(inputs.num_storage_trie_nodes as f64);
|
||||
self.num_storage_trie_addresses.set(inputs.num_storage_trie_addresses as f64);
|
||||
}
|
||||
}
|
||||
|
||||
/// Timings collected during a `save_blocks` call.
|
||||
@@ -150,6 +334,51 @@ pub(crate) struct CommitTimings {
|
||||
pub rocksdb: Duration,
|
||||
}
|
||||
|
||||
/// Timings collected during edge mode parallel writes.
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
pub(crate) struct EdgeWriteTimings {
|
||||
/// Duration of preprocessing (merging states, sorting, converting)
|
||||
pub preprocessing: Duration,
|
||||
pub plain_accounts: Duration,
|
||||
pub bytecodes: Duration,
|
||||
pub plain_storage: Duration,
|
||||
pub hashed_accounts: Duration,
|
||||
pub hashed_storages: Duration,
|
||||
pub account_trie: Duration,
|
||||
pub storage_trie: Duration,
|
||||
/// Wall-clock time for parallel writes only (excludes preprocessing)
|
||||
pub parallel_wall: Duration,
|
||||
/// Total time including preprocessing
|
||||
pub total: Duration,
|
||||
pub subtxn_count: u64,
|
||||
/// Storage trie operation counts for debugging
|
||||
pub storage_trie_op_counts: StorageTrieOpCounts,
|
||||
}
|
||||
|
||||
/// Operation counts for storage trie cursor operations.
///
/// Used to identify which operation type dominates `storage_trie` write time.
#[derive(Debug, Default, Clone, Copy)]
pub(crate) struct StorageTrieOpCounts {
    /// How many times `seek_by_key_subkey` was called.
    pub seek_count: u64,
    /// How many times `delete_current` was called.
    pub delete_count: u64,
    /// How many times `upsert` was called.
    pub upsert_count: u64,
}
|
||||
|
||||
impl From<reth_trie_db::StorageTrieOpCounts> for StorageTrieOpCounts {
    /// Copies the three counters over from the `reth_trie_db` representation unchanged.
    fn from(counts: reth_trie_db::StorageTrieOpCounts) -> Self {
        let reth_trie_db::StorageTrieOpCounts { seek_count, delete_count, upsert_count } = counts;
        Self { seek_count, delete_count, upsert_count }
    }
}
|
||||
|
||||
impl DatabaseProviderMetrics {
|
||||
/// Records the duration for the given action.
|
||||
pub(crate) fn record_duration(&self, action: Action, duration: Duration) {
|
||||
@@ -166,6 +395,20 @@ impl DatabaseProviderMetrics {
|
||||
Action::InsertTransactionHashNumbers => {
|
||||
self.insert_transaction_hash_numbers.record(duration)
|
||||
}
|
||||
|
||||
Action::EdgeWritePlainAccounts => self.edge_write_plain_accounts.record(duration),
|
||||
|
||||
Action::EdgeWriteBytecodes => self.edge_write_bytecodes.record(duration),
|
||||
|
||||
Action::EdgeWritePlainStorage => self.edge_write_plain_storage.record(duration),
|
||||
|
||||
Action::EdgeWriteHashedAccounts => self.edge_write_hashed_accounts.record(duration),
|
||||
|
||||
Action::EdgeWriteHashedStorages => self.edge_write_hashed_storages.record(duration),
|
||||
|
||||
Action::EdgeWriteAccountTrie => self.edge_write_account_trie.record(duration),
|
||||
|
||||
Action::EdgeWriteStorageTrie => self.edge_write_storage_trie.record(duration),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -208,4 +451,45 @@ impl DatabaseProviderMetrics {
|
||||
self.save_blocks_commit_sf_last.set(timings.sf.as_secs_f64());
|
||||
self.save_blocks_commit_rocksdb_last.set(timings.rocksdb.as_secs_f64());
|
||||
}
|
||||
|
||||
/// Records all edge mode parallel write timings.
|
||||
|
||||
pub(crate) fn record_edge_writes(&self, timings: &EdgeWriteTimings) {
|
||||
self.edge_write_plain_accounts.record(timings.plain_accounts);
|
||||
self.edge_write_plain_accounts_last.set(timings.plain_accounts.as_secs_f64());
|
||||
|
||||
self.edge_write_bytecodes.record(timings.bytecodes);
|
||||
self.edge_write_bytecodes_last.set(timings.bytecodes.as_secs_f64());
|
||||
|
||||
self.edge_write_plain_storage.record(timings.plain_storage);
|
||||
self.edge_write_plain_storage_last.set(timings.plain_storage.as_secs_f64());
|
||||
|
||||
self.edge_write_hashed_accounts.record(timings.hashed_accounts);
|
||||
self.edge_write_hashed_accounts_last.set(timings.hashed_accounts.as_secs_f64());
|
||||
|
||||
self.edge_write_hashed_storages.record(timings.hashed_storages);
|
||||
self.edge_write_hashed_storages_last.set(timings.hashed_storages.as_secs_f64());
|
||||
|
||||
self.edge_write_account_trie.record(timings.account_trie);
|
||||
self.edge_write_account_trie_last.set(timings.account_trie.as_secs_f64());
|
||||
|
||||
self.edge_write_storage_trie.record(timings.storage_trie);
|
||||
self.edge_write_storage_trie_last.set(timings.storage_trie.as_secs_f64());
|
||||
|
||||
self.edge_preprocessing.record(timings.preprocessing);
|
||||
self.edge_preprocessing_last.set(timings.preprocessing.as_secs_f64());
|
||||
|
||||
self.edge_parallel_wall.record(timings.parallel_wall);
|
||||
self.edge_parallel_wall_last.set(timings.parallel_wall.as_secs_f64());
|
||||
|
||||
self.edge_parallel_writes_total.record(timings.total);
|
||||
self.edge_parallel_writes_total_last.set(timings.total.as_secs_f64());
|
||||
|
||||
self.edge_parallel_subtxn_count.record(timings.subtxn_count as f64);
|
||||
self.edge_parallel_subtxn_count_last.set(timings.subtxn_count as f64);
|
||||
|
||||
self.edge_storage_trie_seek_count.set(timings.storage_trie_op_counts.seek_count as f64);
|
||||
self.edge_storage_trie_delete_count.set(timings.storage_trie_op_counts.delete_count as f64);
|
||||
self.edge_storage_trie_upsert_count.set(timings.storage_trie_op_counts.upsert_count as f64);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -55,6 +55,12 @@ pub use builder::{ProviderFactoryBuilder, ReadOnlyConfig};
|
||||
|
||||
mod metrics;
|
||||
|
||||
mod parallel_writes;
|
||||
pub use parallel_writes::{
|
||||
ArenaHintDetail, ArenaHintDetails, ArenaHintSource, ArenaHints, ArenaUsageSnapshot,
|
||||
ArenaUsageTracker, ParallelWriteTimings, PreparedStateWrites, PreparedStorageWrite,
|
||||
};
|
||||
|
||||
mod chain;
|
||||
pub use chain::*;
|
||||
|
||||
|
||||
1135
crates/storage/provider/src/providers/database/parallel_writes.rs
Normal file
1135
crates/storage/provider/src/providers/database/parallel_writes.rs
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -3,7 +3,7 @@ use crate::{
|
||||
ProviderError, RocksDBProviderFactory, StateProvider, StateRootProvider,
|
||||
};
|
||||
use alloy_eips::merge::EPOCH_SLOTS;
|
||||
use alloy_primitives::{Address, BlockNumber, Bytes, StorageKey, StorageValue, B256};
|
||||
use alloy_primitives::{keccak256, Address, BlockNumber, Bytes, StorageKey, StorageValue, B256};
|
||||
use reth_db_api::{
|
||||
cursor::{DbCursorRO, DbDupCursorRO},
|
||||
table::Table,
|
||||
@@ -147,6 +147,15 @@ impl<'b, Provider: DBProvider + ChangeSetReader + StorageChangeSetReader + Block
|
||||
}
|
||||
|
||||
/// Lookup a storage key in the `StoragesHistory` table using `EitherReader`.
|
||||
///
|
||||
/// # Key format
|
||||
///
|
||||
/// When `use_hashed_state` is enabled, the caller is expected to pass an already-hashed
|
||||
/// storage key (the keccak256 hash of the plain slot). The `StoragesHistory` table stores
|
||||
/// entries with hashed slots in this mode since the changesets also use hashed slots.
|
||||
///
|
||||
/// When `use_hashed_state` is disabled (default), the caller passes the plain storage key
|
||||
/// and the history table uses plain keys.
|
||||
pub fn storage_history_lookup(
|
||||
&self,
|
||||
address: Address,
|
||||
@@ -418,25 +427,45 @@ impl<
|
||||
address: Address,
|
||||
storage_key: StorageKey,
|
||||
) -> ProviderResult<Option<StorageValue>> {
|
||||
match self.storage_history_lookup(address, storage_key)? {
|
||||
let use_hashed_state = self.provider.cached_storage_settings().use_hashed_state;
|
||||
|
||||
// When use_hashed_state is enabled, the history table uses hashed slots.
|
||||
// Hash the storage key for the history lookup.
|
||||
let lookup_key = if use_hashed_state { keccak256(storage_key) } else { storage_key };
|
||||
|
||||
match self.storage_history_lookup(address, lookup_key)? {
|
||||
HistoryInfo::NotYetWritten => Ok(None),
|
||||
HistoryInfo::InChangeset(changeset_block_number) => self
|
||||
.provider
|
||||
.get_storage_before_block(changeset_block_number, address, storage_key)?
|
||||
.get_storage_before_block(changeset_block_number, address, lookup_key)?
|
||||
.ok_or_else(|| ProviderError::StorageChangesetNotFound {
|
||||
block_number: changeset_block_number,
|
||||
address,
|
||||
storage_key: Box::new(storage_key),
|
||||
storage_key: Box::new(lookup_key),
|
||||
})
|
||||
.map(|entry| entry.value)
|
||||
.map(Some),
|
||||
HistoryInfo::InPlainState | HistoryInfo::MaybeInPlainState => Ok(self
|
||||
.tx()
|
||||
.cursor_dup_read::<tables::PlainStorageState>()?
|
||||
.seek_by_key_subkey(address, storage_key)?
|
||||
.filter(|entry| entry.key == storage_key)
|
||||
.map(|entry| entry.value)
|
||||
.or(Some(StorageValue::ZERO))),
|
||||
HistoryInfo::InPlainState | HistoryInfo::MaybeInPlainState => {
|
||||
if use_hashed_state {
|
||||
// When use_hashed_state is enabled, read from HashedStorages table.
|
||||
let hashed_address = keccak256(address);
|
||||
Ok(self
|
||||
.tx()
|
||||
.cursor_dup_read::<tables::HashedStorages>()?
|
||||
.seek_by_key_subkey(hashed_address, lookup_key)?
|
||||
.filter(|entry| entry.key == lookup_key)
|
||||
.map(|entry| entry.value)
|
||||
.or(Some(StorageValue::ZERO)))
|
||||
} else {
|
||||
Ok(self
|
||||
.tx()
|
||||
.cursor_dup_read::<tables::PlainStorageState>()?
|
||||
.seek_by_key_subkey(address, storage_key)?
|
||||
.filter(|entry| entry.key == storage_key)
|
||||
.map(|entry| entry.value)
|
||||
.or(Some(StorageValue::ZERO)))
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,6 +101,8 @@ pub struct StaticFileWriteCtx {
|
||||
pub receipts_prune_mode: Option<reth_prune_types::PruneMode>,
|
||||
/// Whether receipts are prunable (based on storage settings and prune distance).
|
||||
pub receipts_prunable: bool,
|
||||
/// Whether to use hashed state tables instead of plain state tables.
|
||||
pub use_hashed_state: bool,
|
||||
}
|
||||
|
||||
/// [`StaticFileProvider`] manages all existing [`StaticFileJarProvider`].
|
||||
@@ -626,10 +628,14 @@ impl<N: NodePrimitives> StaticFileProvider<N> {
|
||||
}
|
||||
|
||||
/// Writes storage changesets for all blocks to the static file segment.
|
||||
///
|
||||
/// When `use_hashed_state` is true, storage keys are stored as keccak256 hashes of the plain
|
||||
/// slot rather than plain slots, enabling compatibility with hashed state storage.
|
||||
#[instrument(level = "debug", target = "providers::db", skip_all)]
|
||||
fn write_storage_changesets(
|
||||
w: &mut StaticFileProviderRWRefMut<'_, N>,
|
||||
blocks: &[ExecutedBlock<N>],
|
||||
use_hashed_state: bool,
|
||||
) -> ProviderResult<()> {
|
||||
for block in blocks {
|
||||
let block_number = block.recovered_block().number();
|
||||
@@ -641,9 +647,12 @@ impl<N: NodePrimitives> StaticFileProvider<N> {
|
||||
.flatten()
|
||||
.flat_map(|revert| {
|
||||
revert.storage_revert.into_iter().map(move |(key, revert_to_slot)| {
|
||||
let plain_key = B256::new(key.to_be_bytes());
|
||||
let storage_key =
|
||||
if use_hashed_state { keccak256(plain_key) } else { plain_key };
|
||||
StorageBeforeTx {
|
||||
address: revert.address,
|
||||
key: B256::new(key.to_be_bytes()),
|
||||
key: storage_key,
|
||||
value: revert_to_slot.to_previous_value(),
|
||||
}
|
||||
})
|
||||
@@ -747,7 +756,7 @@ impl<N: NodePrimitives> StaticFileProvider<N> {
|
||||
r_storage_changesets = Some(self.write_segment(
|
||||
StaticFileSegment::StorageChangeSets,
|
||||
first_block_number,
|
||||
|w| Self::write_storage_changesets(w, blocks),
|
||||
|w| Self::write_storage_changesets(w, blocks, ctx.use_hashed_state),
|
||||
));
|
||||
});
|
||||
}
|
||||
|
||||
@@ -16,3 +16,36 @@ impl KeyHasher for KeccakKeyHasher {
|
||||
keccak256(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
/// A no-op key hasher that passes through already-hashed keys.
|
||||
///
|
||||
/// Use this when storage keys in changesets are already hashed (e.g., when `use_hashed_state` is
|
||||
/// enabled). The input must be exactly 32 bytes representing a pre-hashed `B256` value.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
pub struct IdentityKeyHasher;
|
||||
|
||||
impl KeyHasher for IdentityKeyHasher {
|
||||
#[inline]
|
||||
fn hash_key<T: AsRef<[u8]>>(bytes: T) -> B256 {
|
||||
B256::from_slice(bytes.as_ref())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// `KeccakKeyHasher` must return the keccak256 digest of its input.
    #[test]
    fn keccak_key_hasher() {
        let input_bytes = [0x42u8; 20];
        let expected = keccak256(&input_bytes);
        assert_eq!(KeccakKeyHasher::hash_key(&input_bytes), expected);
    }

    /// `IdentityKeyHasher` must hand a 32-byte value back unchanged.
    #[test]
    fn identity_key_hasher_passthrough() {
        let prehashed = B256::repeat_byte(0xab);
        assert_eq!(IdentityKeyHasher::hash_key(prehashed), prehashed);
    }
}
|
||||
|
||||
@@ -34,7 +34,7 @@ mod account;
|
||||
pub use account::TrieAccount;
|
||||
|
||||
mod key;
|
||||
pub use key::{KeccakKeyHasher, KeyHasher};
|
||||
pub use key::{IdentityKeyHasher, KeccakKeyHasher, KeyHasher};
|
||||
|
||||
mod nibbles;
|
||||
pub use nibbles::{Nibbles, StoredNibbles, StoredNibblesSubKey};
|
||||
|
||||
@@ -21,5 +21,6 @@ pub use state::{DatabaseHashedPostState, DatabaseStateRoot};
|
||||
pub use storage::{hashed_storage_from_reverts_with_provider, DatabaseStorageRoot};
|
||||
pub use trie_cursor::{
|
||||
DatabaseAccountTrieCursor, DatabaseStorageTrieCursor, DatabaseTrieCursorFactory,
|
||||
StorageTrieOpCounts,
|
||||
};
|
||||
pub use witness::DatabaseTrieWitness;
|
||||
|
||||
@@ -11,6 +11,18 @@ use reth_trie::{
|
||||
BranchNodeCompact, Nibbles, StorageTrieEntry, StoredNibbles, StoredNibblesSubKey,
|
||||
};
|
||||
|
||||
/// Operation counts for storage trie cursor operations.
///
/// Used to identify which operation type dominates `storage_trie` write time.
#[derive(Debug, Default, Clone, Copy)]
pub struct StorageTrieOpCounts {
    /// How many times `seek_by_key_subkey` was called.
    pub seek_count: u64,
    /// How many times `delete_current` was called.
    pub delete_count: u64,
    /// How many times `upsert` was called.
    pub upsert_count: u64,
}
|
||||
|
||||
/// Wrapper struct for database transaction implementing trie cursor factory trait.
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct DatabaseTrieCursorFactory<T>(T);
|
||||
@@ -120,11 +132,14 @@ where
|
||||
+ DbDupCursorRO<tables::StoragesTrie>
|
||||
+ DbDupCursorRW<tables::StoragesTrie>,
|
||||
{
|
||||
/// Writes storage updates that are already sorted
|
||||
/// Writes storage updates that are already sorted.
|
||||
/// Returns a tuple of (entries modified, operation counts).
|
||||
pub fn write_storage_trie_updates_sorted(
|
||||
&mut self,
|
||||
updates: &StorageTrieUpdatesSorted,
|
||||
) -> Result<usize, DatabaseError> {
|
||||
) -> Result<(usize, StorageTrieOpCounts), DatabaseError> {
|
||||
let mut op_counts = StorageTrieOpCounts::default();
|
||||
|
||||
// The storage trie for this account has to be deleted.
|
||||
if updates.is_deleted() && self.cursor.seek_exact(self.hashed_address)?.is_some() {
|
||||
self.cursor.delete_current_duplicates()?;
|
||||
@@ -136,17 +151,20 @@ where
|
||||
num_entries += 1;
|
||||
let nibbles = StoredNibblesSubKey(*nibbles);
|
||||
// Delete the old entry if it exists.
|
||||
op_counts.seek_count += 1;
|
||||
if self
|
||||
.cursor
|
||||
.seek_by_key_subkey(self.hashed_address, nibbles.clone())?
|
||||
.filter(|e| e.nibbles == nibbles)
|
||||
.is_some()
|
||||
{
|
||||
op_counts.delete_count += 1;
|
||||
self.cursor.delete_current()?;
|
||||
}
|
||||
|
||||
// There is an updated version of this node, insert new entry.
|
||||
if let Some(node) = maybe_updated {
|
||||
op_counts.upsert_count += 1;
|
||||
self.cursor.upsert(
|
||||
self.hashed_address,
|
||||
&StorageTrieEntry { nibbles, node: node.clone() },
|
||||
@@ -154,7 +172,7 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
Ok(num_entries)
|
||||
Ok((num_entries, op_counts))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -274,4 +292,62 @@ mod tests {
|
||||
let mut cursor = DatabaseStorageTrieCursor::new(cursor, hashed_address);
|
||||
assert_eq!(cursor.seek(key.into()).unwrap().unwrap().1, value);
|
||||
}
|
||||
|
||||
/// Documents MDBX DUPSORT `upsert` semantics: an upsert whose subkey matches an existing
/// entry but whose full value differs is APPENDED instead of replacing that entry.
///
/// This is the reason updates to DUPSORT tables such as `StoragesTrie` MUST seek and delete
/// the old entry before upserting; skipping the delete would leave duplicate entries that
/// share the same nibbles but carry different node values.
#[test]
fn test_dupsort_upsert_appends_not_replaces() {
    use reth_db_api::cursor::DbDupCursorRO;

    let factory = create_test_provider_factory();
    let provider = factory.provider_rw().unwrap();
    let mut cursor = provider.tx_ref().cursor_dup_write::<tables::StoragesTrie>().unwrap();

    let hashed_address = B256::random();
    let nibbles = StoredNibblesSubKey::from(vec![0x1, 0x2]);

    // Two different nodes stored under the same (address, nibbles) pair: the initial value
    // first, then the "update".
    let value1 = BranchNodeCompact::new(1, 1, 0, vec![], None);
    let value2 = BranchNodeCompact::new(2, 2, 0, vec![], None);
    for node in [&value1, &value2] {
        let entry = StorageTrieEntry { nibbles: nibbles.clone(), node: node.clone() };
        cursor.upsert(hashed_address, &entry).unwrap();
    }

    // Collect every duplicate stored under this key.
    let walker = cursor.walk_dup(Some(hashed_address), None).unwrap();
    let entries = walker.collect::<Result<Vec<_>, _>>().unwrap();

    // CRITICAL: with DUPSORT the second upsert appended, so both entries are present.
    // `write_storage_trie_updates_sorted` therefore has to seek+delete before it upserts.
    assert_eq!(
        entries.len(),
        2,
        "MDBX DUPSORT upsert should APPEND, not replace. Got {} entries",
        entries.len()
    );

    // Entries are `(key, StorageTrieEntry)` tuples; both node values must have survived.
    let has_node = |wanted: &BranchNodeCompact| entries.iter().any(|(_, e)| &e.node == wanted);
    assert!(has_node(&value1), "Original value should still exist");
    assert!(has_node(&value2), "New value should be appended");
}
|
||||
}
|
||||
|
||||
1348
etc/grafana/dashboards/reth-edge-mode.json
Normal file
1348
etc/grafana/dashboards/reth-edge-mode.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user