Compare commits

...

1 Commits

Author SHA1 Message Date
Georgios Konstantopoulos
d701d70dc1 perf(db): disable prefault_write to avoid mincore() syscall overhead
In WRITEMAP mode, MDBX by default calls mincore() before each page write to check if
the page is resident in memory. Profiling shows this accounts for ~20-22% of
persistence time, yet pages are usually already warm from Engine Task reads.

Disabling prefault_write skips the mincore() syscall and lets the kernel handle
page faults directly. Since pages are likely resident, this trades potential
page faults (which rarely occur due to cache warmth) for syscall overhead
elimination.

Amp-Thread-ID: https://ampcode.com/threads/T-019c233a-3fa9-733b-8985-991c393eb610
Co-authored-by: Amp <amp@ampcode.com>
2026-02-03 11:23:57 +00:00
2 changed files with 29 additions and 0 deletions

View File

@@ -488,6 +488,10 @@ impl DatabaseEnv {
inner_env.set_max_read_transaction_duration(max_read_transaction_duration);
}
// Disable prefault writes: in WRITEMAP mode this avoids the mincore() syscall overhead, since
// pages are likely already warm from Engine Task reads.
inner_env.set_prefault_write(false);
let env = Self {
inner: inner_env.open(path).map_err(|e| DatabaseError::Open(e.into()))?,
dbis: Arc::default(),

View File

@@ -55,6 +55,7 @@ impl Environment {
handle_slow_readers: None,
#[cfg(feature = "read-tx-timeouts")]
max_read_transaction_duration: None,
prefault_write: None,
}
}
@@ -602,6 +603,9 @@ pub struct EnvironmentBuilder {
/// The maximum duration of a read transaction. If [None], but the `read-tx-timeouts` feature
/// is enabled, the default value of [`DEFAULT_MAX_READ_TRANSACTION_DURATION`] is used.
max_read_transaction_duration: Option<read_transactions::MaxReadTransactionDuration>,
/// Controls whether prefault write is enabled. If [None], MDBX uses its default.
/// When disabled, avoids `mincore()` syscall overhead in WRITEMAP mode.
prefault_write: Option<bool>,
}
impl EnvironmentBuilder {
@@ -723,6 +727,14 @@ impl EnvironmentBuilder {
}
}
if let Some(prefault_write) = self.prefault_write {
mdbx_result(ffi::mdbx_env_set_option(
env,
ffi::MDBX_opt_prefault_write_enable,
prefault_write as u64,
))?;
}
Ok(())
})() {
ffi::mdbx_env_close_ex(env, false);
@@ -873,6 +885,19 @@ impl EnvironmentBuilder {
self.handle_slow_readers = Some(hsr);
self
}
/// Choose whether MDBX prefault writes are enabled for the environment being built.
///
/// By default, in WRITEMAP mode, MDBX calls `mincore()` to check whether pages are resident
/// before touching them. That sidesteps page faults at the price of extra syscalls. Passing
/// `false` drops the `mincore()` check and leaves any page fault to the kernel.
///
/// Disabling is worthwhile when the pages are probably in memory already (for instance
/// because other transactions read them recently), since the syscall is then pure overhead.
///
/// If this setter is never called, the option is left untouched and MDBX uses its default.
pub const fn set_prefault_write(&mut self, prefault_write: bool) -> &mut Self {
    // Record an explicit choice; `None` is reserved for "keep the MDBX default".
    let explicit_choice = Some(prefault_write);
    self.prefault_write = explicit_choice;
    self
}
}
#[cfg(feature = "read-tx-timeouts")]