From 28e46bfd488478e8afae5cbdcc0dbbb7f0cbfb1c Mon Sep 17 00:00:00 2001
From: nk_ysg
Date: Fri, 30 Aug 2024 15:02:14 +0800
Subject: [PATCH] chore: remove phf from static files (#10259)

Co-authored-by: joshieDo <93316087+joshieDo@users.noreply.github.com>
Co-authored-by: Matthias Seitz
---
 Cargo.lock                                  |  44 ----
 crates/storage/db/src/static_file/cursor.rs |   4 +-
 crates/storage/nippy-jar/Cargo.toml         |   1 -
 crates/storage/nippy-jar/src/cursor.rs      |  67 +----
 crates/storage/nippy-jar/src/error.rs       |   4 -
 crates/storage/nippy-jar/src/lib.rs         | 255 +++-----------------
 crates/storage/nippy-jar/src/phf/fmph.rs    |  99 --------
 crates/storage/nippy-jar/src/phf/go_fmph.rs | 100 --------
 crates/storage/nippy-jar/src/phf/mod.rs     |  46 ----
 9 files changed, 40 insertions(+), 580 deletions(-)
 delete mode 100644 crates/storage/nippy-jar/src/phf/fmph.rs
 delete mode 100644 crates/storage/nippy-jar/src/phf/go_fmph.rs
 delete mode 100644 crates/storage/nippy-jar/src/phf/mod.rs

diff --git a/Cargo.lock b/Cargo.lock
index 171b6d22e5..329a5e3c08 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1233,12 +1233,6 @@ dependencies = [
  "syn 2.0.76",
 ]

-[[package]]
-name = "binout"
-version = "0.2.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b60b1af88a588fca5fe424ae7d735bc52814f80ff57614f57043cc4e2024f2ea"
-
 [[package]]
 name = "bit-set"
 version = "0.5.3"
@@ -1270,15 +1264,6 @@ dependencies = [
  "serde",
 ]

-[[package]]
-name = "bitm"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b06e8e5bec3490b9f6f3adbb78aa4f53e8396fd9994e8a62a346b44ea7c15f35"
-dependencies = [
- "dyn_size_of",
-]
-
 [[package]]
 name = "bitvec"
 version = "1.0.1"
@@ -2542,12 +2527,6 @@ version = "1.0.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d6ef0072f8a535281e4876be788938b528e9a1d43900b82c2569af7da799125"

-[[package]]
-name = "dyn_size_of"
-version = "0.4.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "33d4f78a40b1ec35bf8cafdaaf607ba2f773c366b0b3bda48937cacd7a8d5134"
-
 [[package]]
 name = "ecdsa"
 version = "0.16.9"
@@ -5293,19 +5272,6 @@ dependencies = [
  "ucd-trie",
 ]

-[[package]]
-name = "ph"
-version = "0.8.3"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "86b7b74d575d7c11fb653fae69688be5206cafc1ead33c01ce61ac7f36eae45b"
-dependencies = [
- "binout",
- "bitm",
- "dyn_size_of",
- "rayon",
- "wyhash",
-]
-
 [[package]]
 name = "pharos"
 version = "0.5.3"
@@ -7502,7 +7468,6 @@ dependencies = [
 "derive_more 1.0.0",
 "lz4_flex",
 "memmap2",
-"ph",
 "rand 0.8.5",
 "reth-fs-util",
 "serde",
@@ -11185,15 +11150,6 @@ dependencies = [
 "web-sys",
 ]

-[[package]]
-name = "wyhash"
-version = "0.5.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf6e163c25e3fac820b4b453185ea2dea3b6a3e0a721d4d23d75bd33734c295"
-dependencies = [
- "rand_core 0.6.4",
-]
-
 [[package]]
 name = "wyz"
 version = "0.5.1"
diff --git a/crates/storage/db/src/static_file/cursor.rs b/crates/storage/db/src/static_file/cursor.rs
index 4a052c6abf..f22006c462 100644
--- a/crates/storage/db/src/static_file/cursor.rs
+++ b/crates/storage/db/src/static_file/cursor.rs
@@ -1,7 +1,7 @@
 use super::mask::{ColumnSelectorOne, ColumnSelectorThree, ColumnSelectorTwo};
 use derive_more::{Deref, DerefMut};
 use reth_db_api::table::Decompress;
-use reth_nippy_jar::{DataReader, NippyJar, NippyJarCursor};
+use reth_nippy_jar::{DataReader, NippyJar, NippyJarCursor, NippyJarError};
 use reth_primitives::{static_file::SegmentHeader, B256};
 use reth_storage_errors::provider::{ProviderError, ProviderResult};
 use std::sync::Arc;
@@ -39,7 +39,7 @@ impl<'a> StaticFileCursor<'a> {
         }

         let row = match key_or_num {
-            KeyOrNumber::Key(k) => self.row_by_key_with_cols(k, mask),
+            KeyOrNumber::Key(_) => Err(NippyJarError::UnsupportedFilterQuery),
             KeyOrNumber::Number(n) => match self.jar().user_header().start() {
                 Some(offset) => {
                     if offset > n {
diff --git a/crates/storage/nippy-jar/Cargo.toml b/crates/storage/nippy-jar/Cargo.toml
index 0bc3e40dc2..ba5846bdc4 100644
--- a/crates/storage/nippy-jar/Cargo.toml
+++ b/crates/storage/nippy-jar/Cargo.toml
@@ -19,7 +19,6 @@ name = "reth_nippy_jar"
 reth-fs-util.workspace = true

 # filter
-ph = "0.8.0"
 cuckoofilter = { version = "0.5.0", features = [
     "serde_support",
     "serde_bytes",
diff --git a/crates/storage/nippy-jar/src/cursor.rs b/crates/storage/nippy-jar/src/cursor.rs
index d42b0d364b..7af55fd436 100644
--- a/crates/storage/nippy-jar/src/cursor.rs
+++ b/crates/storage/nippy-jar/src/cursor.rs
@@ -1,10 +1,8 @@
 use crate::{
     compression::{Compression, Compressors, Zstd},
-    DataReader, InclusionFilter, NippyJar, NippyJarError, NippyJarHeader, PerfectHashingFunction,
-    RefRow,
+    DataReader, NippyJar, NippyJarError, NippyJarHeader, RefRow,
 };
 use std::{ops::Range, sync::Arc};
-use sucds::int_vectors::Access;
 use zstd::bulk::Decompressor;

 /// Simple cursor implementation to retrieve data from [`NippyJar`].
@@ -67,35 +65,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
         self.row = 0;
     }

-    /// Returns a row, searching it by a key.
-    ///
-    /// **May return false positives.**
-    ///
-    /// Example usage would be querying a transactions file with a transaction hash which is **NOT**
-    /// stored in file.
-    pub fn row_by_key(&mut self, key: &[u8]) -> Result<Option<RefRow<'_>>, NippyJarError> {
-        if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
-            // TODO: is it worth to parallelize both?
-
-            // May have false positives
-            if filter.contains(key)? {
-                // May have false positives
-                if let Some(row_index) = phf.get_index(key)? {
-                    self.row = self
-                        .jar
-                        .offsets_index
-                        .access(row_index as usize)
-                        .expect("built from same set") as u64;
-                    return self.next_row()
-                }
-            }
-        } else {
-            return Err(NippyJarError::UnsupportedFilterQuery)
-        }
-
-        Ok(None)
-    }
-
     /// Returns a row by its number.
     pub fn row_by_number(&mut self, row: usize) -> Result<Option<RefRow<'_>>, NippyJarError> {
         self.row = row as u64;
@@ -130,40 +99,6 @@ impl<'a, H: NippyJarHeader> NippyJarCursor<'a, H> {
         ))
     }

-    /// Returns a row, searching it by a key using a
-    /// `mask` to only read certain columns from the row.
-    ///
-    /// **May return false positives.**
-    ///
-    /// Example usage would be querying a transactions file with a transaction hash which is **NOT**
-    /// stored in file.
-    pub fn row_by_key_with_cols(
-        &mut self,
-        key: &[u8],
-        mask: usize,
-    ) -> Result<Option<RefRow<'_>>, NippyJarError> {
-        if let (Some(filter), Some(phf)) = (&self.jar.filter, &self.jar.phf) {
-            // TODO: is it worth to parallelize both?
-
-            // May have false positives
-            if filter.contains(key)? {
-                // May have false positives
-                if let Some(row_index) = phf.get_index(key)? {
-                    self.row = self
-                        .jar
-                        .offsets_index
-                        .access(row_index as usize)
-                        .expect("built from same set") as u64;
-                    return self.next_row_with_cols(mask)
-                }
-            }
-        } else {
-            return Err(NippyJarError::UnsupportedFilterQuery)
-        }
-
-        Ok(None)
-    }
-
     /// Returns a row by its number by using a `mask` to only read certain columns from the row.
     pub fn row_by_number_with_cols(
         &mut self,
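Key-based cursor queries are gone entirely after this hunk: `NippyJarCursor` is now strictly positional, and callers resolve a key to a row number before touching the cursor (as the `static_file` change above does via `SegmentHeader::start`). A minimal sketch of the surviving access pattern, assuming a two-column jar already frozen at `path` and a `NippyJarCursor::new` constructor taking the loaded jar (the test setup elided by this diff does the same):

    use reth_nippy_jar::{NippyJar, NippyJarCursor, NippyJarError};

    fn read_row(path: &std::path::Path, row: usize) -> Result<(), NippyJarError> {
        // Load the jar configuration from disk and open a cursor over the data.
        let jar: NippyJar<()> = NippyJar::load_without_header(path)?;
        let mut cursor = NippyJarCursor::new(&jar)?;

        // Access is by row number only; `None` means the row is out of range.
        if let Some(columns) = cursor.row_by_number(row)? {
            // One `&[u8]` slice per column of the requested row.
            assert_eq!(columns.len(), 2);
        }
        Ok(())
    }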
diff --git a/crates/storage/nippy-jar/src/error.rs b/crates/storage/nippy-jar/src/error.rs
index 225d4fba30..6a5714e1e4 100644
--- a/crates/storage/nippy-jar/src/error.rs
+++ b/crates/storage/nippy-jar/src/error.rs
@@ -31,10 +31,6 @@ pub enum NippyJarError {
     FilterMaxCapacity,
     #[error("cuckoo was not properly initialized after loaded")]
     FilterCuckooNotLoaded,
-    #[error("perfect hashing function doesn't have any keys added")]
-    PHFMissingKeys,
-    #[error("nippy jar initialized without perfect hashing function")]
-    PHFMissing,
     #[error("nippy jar was built without an index")]
     UnsupportedFilterQuery,
     #[error("the size of an offset must be at most 8 bytes, got {offset_size}")]
diff --git a/crates/storage/nippy-jar/src/lib.rs b/crates/storage/nippy-jar/src/lib.rs
index 056f456eb2..60ed573461 100644
--- a/crates/storage/nippy-jar/src/lib.rs
+++ b/crates/storage/nippy-jar/src/lib.rs
@@ -32,9 +32,10 @@ pub mod compression;
 use compression::Compression;
 use compression::Compressors;

-pub mod phf;
-pub use phf::PHFKey;
-use phf::{Fmph, Functions, GoFmph, PerfectHashingFunction};
+/// empty enum for backwards compatibility
+#[derive(Debug, Serialize, Deserialize)]
+#[cfg_attr(test, derive(PartialEq, Eq))]
+pub enum Functions {}

 mod error;
 pub use error::NippyJarError;
@@ -74,24 +75,6 @@ impl<T> NippyJarHeader for T where
 ///
 /// Data is organized into a columnar format, enabling column-based compression. Data retrieval
 /// entails consulting an offset list and fetching the data from file via `mmap`.
-///
-/// PHF & Filters:
-/// For data membership verification, the `filter` field can be configured with algorithms like
-/// Bloom or Cuckoo filters. While these filters enable rapid membership checks, it's important to
-/// note that **they may yield false positives but not false negatives**. Therefore, they serve as
-/// preliminary checks (eg. in `by_hash` queries) and should be followed by data verification on
-/// retrieval.
-///
-/// The `phf` (Perfect Hashing Function) and `offsets_index` fields facilitate the data retrieval
-/// process in for example `by_hash` queries. Specifically, the PHF converts a query, such as a
-/// block hash, into a unique integer. This integer is then used as an index in `offsets_index`,
-/// which maps to the actual data location in the `offsets` list. Similar to the `filter`, the PHF
-/// may also produce false positives but not false negatives, necessitating subsequent data
-/// verification.
-///
-/// Note: that the key (eg. `BlockHash`) passed to a filter and phf does not need to actually be
-/// stored.
-///
 /// Ultimately, the `freeze` function yields two files: a data file containing both the data and its
 /// configuration, and an index file that houses the offsets and `offsets_index`.
 #[derive(Serialize, Deserialize)]
@@ -112,7 +95,7 @@ pub struct NippyJar<H = ()> {
     /// Optional filter function for data membership checks.
     filter: Option<InclusionFilters>,
     #[serde(skip)]
-    /// Optional Perfect Hashing Function (PHF) for unique offset mapping.
+    /// Optional field for backwards compatibility
     phf: Option<Functions>,
     /// Index mapping PHF output to value offsets in `offsets`.
     #[serde(skip)]
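The replacement `Functions` is an uninhabited enum: it keeps the `phf` field's type alive, so existing code and derives keep compiling, while guaranteeing at the type level that the field can only ever hold `None`. A self-contained model of the trick, with hypothetical names and bincode 1.x assumed (not reth code):

    use serde::{Deserialize, Serialize};

    // No variants, so no value of this type can ever be constructed.
    #[derive(Debug, Serialize, Deserialize)]
    enum Legacy {}

    #[derive(Debug, Serialize, Deserialize)]
    struct Config {
        columns: u64,
        #[serde(skip)]
        legacy: Option<Legacy>, // mirrors `phf: Option<Functions>`: always `None`
    }

    fn main() {
        let cfg = Config { columns: 23, legacy: None };
        let bytes = bincode::serialize(&cfg).unwrap();
        let back: Config = bincode::deserialize(&bytes).unwrap();
        assert!(back.legacy.is_none()); // the only state the field can hold
    }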
@@ -196,18 +179,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
         self
     }

-    /// Adds [`phf::Fmph`] perfect hashing function.
-    pub fn with_fmph(mut self) -> Self {
-        self.phf = Some(Functions::Fmph(Fmph::new()));
-        self
-    }
-
-    /// Adds [`phf::GoFmph`] perfect hashing function.
-    pub fn with_gofmph(mut self) -> Self {
-        self.phf = Some(Functions::GoFmph(GoFmph::new()));
-        self
-    }
-
     /// Gets a reference to the user header.
     pub const fn user_header(&self) -> &H {
         &self.user_header
@@ -346,16 +317,6 @@ impl<H: NippyJarHeader> InclusionFilter for NippyJar<H> {
     }
 }

-impl<H: NippyJarHeader> PerfectHashingFunction for NippyJar<H> {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.phf.as_mut().ok_or(NippyJarError::PHFMissing)?.set_keys(keys)
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        self.phf.as_ref().ok_or(NippyJarError::PHFMissing)?.get_index(key)
-    }
-}
-
 #[cfg(test)]
 impl<H: NippyJarHeader> NippyJar<H> {
     /// If required, prepares any compression algorithm to an early pass of the data.
@@ -371,55 +332,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
         Ok(())
     }

-    /// Prepares beforehand the offsets index for querying rows based on `values` (eg. transaction
-    /// hash). Expects `values` to be sorted in the same way as the data that is going to be
-    /// later on inserted.
-    ///
-    /// Currently collecting all items before acting on them.
-    pub fn prepare_index<T: PHFKey>(
-        &mut self,
-        values: impl IntoIterator<Item = ColumnResult<T>>,
-        row_count: usize,
-    ) -> Result<(), NippyJarError> {
-        debug!(target: "nippy-jar", ?row_count, "Preparing index.");
-
-        let values = values.into_iter().collect::<Result<Vec<_>, _>>()?;
-
-        debug_assert!(
-            row_count == values.len(),
-            "Row count ({row_count}) differs from value list count ({}).",
-            values.len()
-        );
-
-        let mut offsets_index = vec![0; row_count];
-
-        // Builds perfect hashing function from the values
-        if let Some(phf) = self.phf.as_mut() {
-            debug!(target: "nippy-jar", ?row_count, values_count = ?values.len(), "Setting keys for perfect hashing function.");
-            phf.set_keys(&values)?;
-        }
-
-        if self.filter.is_some() || self.phf.is_some() {
-            debug!(target: "nippy-jar", ?row_count, "Creating filter and offsets_index.");
-
-            for (row_num, v) in values.into_iter().enumerate() {
-                if let Some(filter) = self.filter.as_mut() {
-                    filter.add(v.as_ref())?;
-                }
-
-                if let Some(phf) = self.phf.as_mut() {
-                    // Points to the first column value offset of the row.
-                    let index = phf.get_index(v.as_ref())?.expect("initialized") as usize;
-                    let _ = std::mem::replace(&mut offsets_index[index], row_num as u64);
-                }
-            }
-        }
-
-        debug!(target: "nippy-jar", ?row_count, "Encoding offsets index list.");
-        self.offsets_index = PrefixSummedEliasFano::from_slice(&offsets_index)?;
-        Ok(())
-    }
-
     /// Writes all data and configuration to a file and the offset index to another.
     pub fn freeze(
         self,
@@ -447,7 +359,7 @@ impl<H: NippyJarHeader> NippyJar<H> {
         Ok(writer.into_jar())
     }

-    /// Freezes [`PerfectHashingFunction`], [`InclusionFilter`] and the offset index to file.
+    /// Freezes [`InclusionFilter`] and the offset index to file.
     fn freeze_filters(&self) -> Result<(), NippyJarError> {
         debug!(target: "nippy-jar", path=?self.index_path(), "Writing offsets and offsets index to file.");

@@ -474,11 +386,6 @@ impl<H: NippyJarHeader> NippyJar<H> {
             }
         }

-        // Check `prepare_index` was called.
-        if let Some(phf) = &self.phf {
-            let _ = phf.get_index(&[])?;
-        }
-
         Ok(())
     }
 }
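For context on what the removed `prepare_index` built: the PHF assigned each key a unique slot, and `offsets_index` inverted that slot back to the row number, so a later `row_by_key` was two array hops plus a verification read. A condensed, runnable model of that scheme against the `ph` crate this patch drops (toy four-byte keys; `ph = "0.8"` assumed as a dependency):

    use ph::fmph::{BuildConf, Function};

    fn main() {
        // Keys stand in for e.g. transaction hashes; row order is insertion order.
        let keys: Vec<[u8; 4]> = vec![*b"tx_a", *b"tx_b", *b"tx_c"];
        let phf = Function::from_slice_with_conf(&keys, BuildConf::default());

        // prepare_index in miniature: offsets_index[phf(key)] = row number.
        let mut offsets_index = vec![0u64; keys.len()];
        for (row, key) in keys.iter().enumerate() {
            offsets_index[phf.get(key).expect("key was inserted") as usize] = row as u64;
        }

        // row_by_key reduced to its core: key -> PHF slot -> row number.
        let row = offsets_index[phf.get(b"tx_b").expect("key was inserted") as usize];
        assert_eq!(row, 1);

        // Keys that were never inserted may still map to some slot (false
        // positives), which is why the cuckoo filter and a post-read key
        // comparison backed this lookup in the deleted code.
    }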
@@ -588,7 +495,7 @@ mod tests {
     use super::*;
     use compression::Compression;
     use rand::{rngs::SmallRng, seq::SliceRandom, RngCore, SeedableRng};
-    use std::{collections::HashSet, fs::OpenOptions};
+    use std::{fs::OpenOptions, io::Read};

     type ColumnResults<T> = Vec<ColumnResult<T>>;
     type ColumnValues = Vec<Vec<u8>>;
@@ -617,57 +524,30 @@ mod tests {
     }

     #[test]
-    fn test_phf() {
-        let (col1, col2) = test_data(None);
-        let num_columns = 2;
-        let num_rows = col1.len() as u64;
-        let file_path = tempfile::NamedTempFile::new().unwrap();
+    fn test_config_serialization() {
+        let file = tempfile::NamedTempFile::new().unwrap();
+        let jar = NippyJar::new_without_header(23, file.path()).with_lz4();
+        jar.freeze_config().unwrap();

-        let create_nippy = || -> NippyJar<()> {
-            let mut nippy = NippyJar::new_without_header(num_columns, file_path.path());
-            assert!(matches!(
-                NippyJar::set_keys(&mut nippy, &col1),
-                Err(NippyJarError::PHFMissing)
-            ));
-            nippy
-        };
+        let mut config_file = OpenOptions::new().read(true).open(jar.config_path()).unwrap();
+        let config_file_len = config_file.metadata().unwrap().len();
+        assert_eq!(config_file_len, 37);

-        let check_phf = |mut nippy: NippyJar<_>| {
-            assert!(matches!(
-                NippyJar::get_index(&nippy, &col1[0]),
-                Err(NippyJarError::PHFMissingKeys)
-            ));
-            assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
+        let mut buf = Vec::with_capacity(config_file_len as usize);
+        config_file.read_to_end(&mut buf).unwrap();

-            let collect_indexes = |nippy: &NippyJar<_>| -> Vec<u64> {
-                col1.iter()
-                    .map(|value| NippyJar::get_index(nippy, value.as_slice()).unwrap().unwrap())
-                    .collect()
-            };
+        assert_eq!(
+            vec![
+                1, 0, 0, 0, 0, 0, 0, 0, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
+                0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            ],
+            buf
+        );

-            // Ensure all indexes are unique
-            let indexes = collect_indexes(&nippy);
-            assert_eq!(indexes.iter().collect::<HashSet<_>>().len(), indexes.len());
-
-            // Ensure reproducibility
-            assert!(NippyJar::set_keys(&mut nippy, &col1).is_ok());
-            assert_eq!(indexes, collect_indexes(&nippy));
-
-            // Ensure that loaded phf provides the same function outputs
-            nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
-            nippy
-                .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
-                .unwrap();
-            let mut loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
-            loaded_nippy.load_filters().unwrap();
-            assert_eq!(indexes, collect_indexes(&loaded_nippy));
-        };
-
-        // fmph bytes size for 100 values of 32 bytes: 54
-        check_phf(create_nippy().with_fmph());
-
-        // fmph bytes size for 100 values of 32 bytes: 46
-        check_phf(create_nippy().with_gofmph());
+        let mut read_jar = bincode::deserialize_from::<_, NippyJar>(&buf[..]).unwrap();
+        // Path is not ser/de
+        read_jar.path = file.path().to_path_buf();
+        assert_eq!(jar, read_jar);
     }

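The 37-byte fixture in `test_config_serialization` is consistent with bincode's default fixed-width, little-endian encoding of the non-skipped fields. A hypothetical breakdown (the trailing-field attribution is an assumption for illustration, not taken from this diff):

    fn main() {
        let version = 1u64.to_le_bytes();    // version: usize = 1
        // user_header: () serializes to zero bytes
        let columns = 23u64.to_le_bytes();   // columns: usize = 23
        let rows = 0u64.to_le_bytes();       // rows: usize = 0
        let compressor = [1u8, 1, 0, 0, 0];  // Option tag Some + variant index 1 (lz4)
        let trailing = 0u64.to_le_bytes();   // one more usize field, e.g. a max row size

        let total =
            version.len() + columns.len() + rows.len() + compressor.len() + trailing.len();
        assert_eq!(total, 37); // matches the asserted config file length
    }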
     #[test]
@@ -891,11 +771,9 @@
         let mut nippy = NippyJar::new(num_columns, file_path.path(), BlockJarHeader { block_start })
             .with_zstd(true, 5000)
-            .with_cuckoo_filter(col1.len())
-            .with_fmph();
+            .with_cuckoo_filter(col1.len());

         nippy.prepare_compression(data.clone()).unwrap();
-        nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
         nippy
             .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
             .unwrap();

         let mut loaded_nippy = NippyJar::load_without_header(file_path.path()).unwrap();
         loaded_nippy.load_filters().unwrap();

         assert!(loaded_nippy.compressor().is_some());
         assert!(loaded_nippy.filter.is_some());
-        assert!(loaded_nippy.phf.is_some());
         assert_eq!(loaded_nippy.user_header().block_start, block_start);

         if let Some(Compressors::Zstd(_zstd)) = loaded_nippy.compressor() {
@@ -929,22 +806,9 @@
             data.shuffle(&mut rand::thread_rng());

             for (row_num, (v0, v1)) in data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                {
-                    let row_by_value = cursor
-                        .row_by_key(v0)
-                        .unwrap()
-                        .unwrap()
-                        .iter()
-                        .map(|a| a.to_vec())
-                        .collect::<Vec<_>>();
-                    assert_eq!((&row_by_value[0], &row_by_value[1]), (v0, v1));
-
-                    // Simulates `by_number` queries
-                    let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
-                    assert_eq!(row_by_value, row_by_num);
-                }
+                // Simulates `by_number` queries
+                let row_by_num = cursor.row_by_number(row_num).unwrap().unwrap();
+                assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (v0, v1));
             }
         }
     }
@@ -962,11 +826,9 @@
         {
             let mut nippy = NippyJar::new_without_header(num_columns, file_path.path())
                 .with_zstd(true, 5000)
-                .with_cuckoo_filter(col1.len())
-                .with_fmph();
+                .with_cuckoo_filter(col1.len());
             nippy.prepare_compression(data).unwrap();
-            nippy.prepare_index(clone_with_result(&col1), col1.len()).unwrap();
             nippy
                 .freeze(vec![clone_with_result(&col1), clone_with_result(&col2)], num_rows)
                 .unwrap();
         }
@@ -989,84 +851,41 @@
             // Read both columns
             for (row_num, (v0, v1)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_FULL_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!((&row_by_value[0], &row_by_value[1]), (*v0, *v1));
-
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_FULL_MASK)
                     .unwrap()
                     .unwrap();
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!((&row_by_num[0].to_vec(), &row_by_num[1].to_vec()), (*v0, *v1));
             }

             // Read first column only: `Block`
             const BLOCKS_BLOCK_MASK: usize = 0b01;
             for (row_num, (v0, _)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_BLOCK_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!(row_by_value.len(), 1);
-                assert_eq!(&row_by_value[0], *v0);
-
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_BLOCK_MASK)
                     .unwrap()
                     .unwrap();
                 assert_eq!(row_by_num.len(), 1);
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!(&row_by_num[0].to_vec(), *v0);
             }

             // Read second column only: `Block`
             const BLOCKS_WITHDRAWAL_MASK: usize = 0b10;
-            for (row_num, (v0, v1)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                let row_by_value = cursor
-                    .row_by_key_with_cols(v0, BLOCKS_WITHDRAWAL_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .iter()
-                    .map(|a| a.to_vec())
-                    .collect::<Vec<_>>();
-                assert_eq!(row_by_value.len(), 1);
-                assert_eq!(&row_by_value[0], *v1);
-
+            for (row_num, (_, v1)) in &data {
                 // Simulates `by_number` queries
                 let row_by_num = cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_WITHDRAWAL_MASK)
                     .unwrap()
                     .unwrap();
                 assert_eq!(row_by_num.len(), 1);
-                assert_eq!(row_by_value, row_by_num);
+                assert_eq!(&row_by_num[0].to_vec(), *v1);
             }

             // Read nothing
             const BLOCKS_EMPTY_MASK: usize = 0b00;
-            for (row_num, (v0, _)) in &data {
-                // Simulates `by_hash` queries by iterating col1 values, which were used to
-                // create the inner index.
-                assert!(cursor
-                    .row_by_key_with_cols(v0, BLOCKS_EMPTY_MASK)
-                    .unwrap()
-                    .unwrap()
-                    .is_empty());
-
+            for (row_num, _) in &data {
                 // Simulates `by_number` queries
                 assert!(cursor
                     .row_by_number_with_cols(*row_num, BLOCKS_EMPTY_MASK)
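The masks used throughout these tests are plain bitmaps over column positions: bit i selects column i, so `row_by_number_with_cols` returns only the flagged columns. An illustrative helper (not part of the crate) showing the arithmetic:

    const BLOCKS_FULL_MASK: usize = 0b11;       // both columns
    const BLOCKS_BLOCK_MASK: usize = 0b01;      // first column only
    const BLOCKS_WITHDRAWAL_MASK: usize = 0b10; // second column only

    fn selected_columns(mask: usize, num_columns: usize) -> Vec<usize> {
        (0..num_columns).filter(|col| mask & (1 << col) != 0).collect()
    }

    fn main() {
        assert_eq!(selected_columns(BLOCKS_FULL_MASK, 2), vec![0, 1]);
        assert_eq!(selected_columns(BLOCKS_BLOCK_MASK, 2), vec![0]);
        assert_eq!(selected_columns(BLOCKS_WITHDRAWAL_MASK, 2), vec![1]);
        assert!(selected_columns(0b00, 2).is_empty()); // the "read nothing" case
    }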
diff --git a/crates/storage/nippy-jar/src/phf/fmph.rs b/crates/storage/nippy-jar/src/phf/fmph.rs
deleted file mode 100644
index a332c40cf7..0000000000
--- a/crates/storage/nippy-jar/src/phf/fmph.rs
+++ /dev/null
@@ -1,99 +0,0 @@
-use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
-use ph::fmph::{BuildConf, Function};
-use serde::{
-    de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
-    Serializer,
-};
-
-/// Wrapper struct for [`Function`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
-#[derive(Default)]
-pub struct Fmph {
-    function: Option<Function>,
-}
-
-impl Fmph {
-    pub const fn new() -> Self {
-        Self { function: None }
-    }
-}
-
-impl PerfectHashingFunction for Fmph {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.function = Some(Function::from_slice_with_conf(
-            keys,
-            BuildConf { use_multiple_threads: true, ..Default::default() },
-        ));
-        Ok(())
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        if let Some(f) = &self.function {
-            return Ok(f.get(key))
-        }
-        Err(NippyJarError::PHFMissingKeys)
-    }
-}
-
-#[cfg(test)]
-impl PartialEq for Fmph {
-    fn eq(&self, _other: &Self) -> bool {
-        match (&self.function, &_other.function) {
-            (Some(func1), Some(func2)) => {
-                func1.level_sizes() == func2.level_sizes() &&
-                    func1.write_bytes() == func2.write_bytes() &&
-                    {
-                        let mut f1 = Vec::with_capacity(func1.write_bytes());
-                        func1.write(&mut f1).expect("enough capacity");
-
-                        let mut f2 = Vec::with_capacity(func2.write_bytes());
-                        func2.write(&mut f2).expect("enough capacity");
-
-                        f1 == f2
-                    }
-            }
-            (None, None) => true,
-            _ => false,
-        }
-    }
-}
-
-impl std::fmt::Debug for Fmph {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Fmph")
-            .field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
-            .finish_non_exhaustive()
-    }
-}
-
-impl Serialize for Fmph {
-    /// Potentially expensive, but should be used only when creating the file.
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        match &self.function {
-            Some(f) => {
-                let mut v = Vec::with_capacity(f.write_bytes());
-                f.write(&mut v).map_err(S::Error::custom)?;
-                serializer.serialize_some(&v)
-            }
-            None => serializer.serialize_none(),
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for Fmph {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
-            return Ok(Self {
-                function: Some(
-                    Function::read(&mut std::io::Cursor::new(buffer)).map_err(D::Error::custom)?,
-                ),
-            })
-        }
-        Ok(Self { function: None })
-    }
-}
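The deleted Serialize/Deserialize impls bridge a stream-only structure into serde by buffering it as an optional byte vector. The same pattern reduced to a standalone sketch, with a toy `Opaque` type standing in for `ph::fmph::Function` and bincode assumed for the round trip:

    use serde::{de::Error as _, Deserialize, Deserializer, Serialize, Serializer};

    // Stand-in for a type that only exposes streaming write/read.
    struct Opaque(Vec<u8>);

    impl Opaque {
        fn write(&self, out: &mut Vec<u8>) -> std::io::Result<()> {
            out.extend_from_slice(&self.0);
            Ok(())
        }
        fn read(bytes: &[u8]) -> std::io::Result<Self> {
            Ok(Self(bytes.to_vec()))
        }
    }

    struct Wrapper {
        function: Option<Opaque>,
    }

    impl Serialize for Wrapper {
        fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
            match &self.function {
                Some(f) => {
                    // Buffer first, then hand serde an Option<Vec<u8>>.
                    let mut v = Vec::new();
                    f.write(&mut v).map_err(serde::ser::Error::custom)?;
                    serializer.serialize_some(&v)
                }
                None => serializer.serialize_none(),
            }
        }
    }

    impl<'de> Deserialize<'de> for Wrapper {
        fn deserialize<D: Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
            let function = match <Option<Vec<u8>>>::deserialize(deserializer)? {
                Some(buffer) => Some(Opaque::read(&buffer).map_err(D::Error::custom)?),
                None => None,
            };
            Ok(Self { function })
        }
    }

    fn main() {
        let bytes =
            bincode::serialize(&Wrapper { function: Some(Opaque(vec![1, 2, 3])) }).unwrap();
        let back: Wrapper = bincode::deserialize(&bytes).unwrap();
        assert_eq!(back.function.map(|f| f.0), Some(vec![1, 2, 3]));
    }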
diff --git a/crates/storage/nippy-jar/src/phf/go_fmph.rs b/crates/storage/nippy-jar/src/phf/go_fmph.rs
deleted file mode 100644
index 328ddcb4dd..0000000000
--- a/crates/storage/nippy-jar/src/phf/go_fmph.rs
+++ /dev/null
@@ -1,100 +0,0 @@
-use crate::{NippyJarError, PHFKey, PerfectHashingFunction};
-use ph::fmph::{GOBuildConf, GOFunction};
-use serde::{
-    de::Error as DeSerdeError, ser::Error as SerdeError, Deserialize, Deserializer, Serialize,
-    Serializer,
-};
-
-/// Wrapper struct for [`GOFunction`]. Implementation of the following [paper](https://dl.acm.org/doi/10.1145/3596453).
-#[derive(Default)]
-pub struct GoFmph {
-    function: Option<GOFunction>,
-}
-
-impl GoFmph {
-    pub const fn new() -> Self {
-        Self { function: None }
-    }
-}
-
-impl PerfectHashingFunction for GoFmph {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        self.function = Some(GOFunction::from_slice_with_conf(
-            keys,
-            GOBuildConf { use_multiple_threads: true, ..Default::default() },
-        ));
-        Ok(())
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        if let Some(f) = &self.function {
-            return Ok(f.get(key))
-        }
-        Err(NippyJarError::PHFMissingKeys)
-    }
-}
-
-#[cfg(test)]
-impl PartialEq for GoFmph {
-    fn eq(&self, other: &Self) -> bool {
-        match (&self.function, &other.function) {
-            (Some(func1), Some(func2)) => {
-                func1.level_sizes() == func2.level_sizes() &&
-                    func1.write_bytes() == func2.write_bytes() &&
-                    {
-                        let mut f1 = Vec::with_capacity(func1.write_bytes());
-                        func1.write(&mut f1).expect("enough capacity");
-
-                        let mut f2 = Vec::with_capacity(func2.write_bytes());
-                        func2.write(&mut f2).expect("enough capacity");
-
-                        f1 == f2
-                    }
-            }
-            (None, None) => true,
-            _ => false,
-        }
-    }
-}
-
-impl std::fmt::Debug for GoFmph {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("GoFmph")
-            .field("bytes_size", &self.function.as_ref().map(|f| f.write_bytes()))
-            .finish_non_exhaustive()
-    }
-}
-
-impl Serialize for GoFmph {
-    /// Potentially expensive, but should be used only when creating the file.
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        match &self.function {
-            Some(f) => {
-                let mut v = Vec::with_capacity(f.write_bytes());
-                f.write(&mut v).map_err(S::Error::custom)?;
-                serializer.serialize_some(&v)
-            }
-            None => serializer.serialize_none(),
-        }
-    }
-}
-
-impl<'de> Deserialize<'de> for GoFmph {
-    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
-    where
-        D: Deserializer<'de>,
-    {
-        if let Some(buffer) = <Option<Vec<u8>>>::deserialize(deserializer)? {
-            return Ok(Self {
-                function: Some(
-                    GOFunction::read(&mut std::io::Cursor::new(buffer))
-                        .map_err(D::Error::custom)?,
-                ),
-            })
-        }
-        Ok(Self { function: None })
-    }
-}
diff --git a/crates/storage/nippy-jar/src/phf/mod.rs b/crates/storage/nippy-jar/src/phf/mod.rs
deleted file mode 100644
index ade48b60a3..0000000000
--- a/crates/storage/nippy-jar/src/phf/mod.rs
+++ /dev/null
@@ -1,46 +0,0 @@
-use crate::NippyJarError;
-use serde::{Deserialize, Serialize};
-use std::hash::Hash;
-
-mod fmph;
-pub use fmph::Fmph;
-
-mod go_fmph;
-pub use go_fmph::GoFmph;
-
-/// Trait alias for [`PerfectHashingFunction`] keys.
-pub trait PHFKey: AsRef<[u8]> + Sync + Clone + Hash {}
-impl<T: AsRef<[u8]> + Sync + Clone + Hash> PHFKey for T {}
-
-/// Trait to build and query a perfect hashing function.
-pub trait PerfectHashingFunction: Serialize + for<'a> Deserialize<'a> {
-    /// Adds the key set and builds the perfect hashing function.
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError>;
-
-    /// Get corresponding associated integer. There might be false positives.
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError>;
-}
-
-/// Enumerates all types of perfect hashing functions.
-#[derive(Debug, Serialize, Deserialize)]
-#[cfg_attr(test, derive(PartialEq))]
-pub enum Functions {
-    Fmph(Fmph),
-    GoFmph(GoFmph),
-}
-
-impl PerfectHashingFunction for Functions {
-    fn set_keys<T: PHFKey>(&mut self, keys: &[T]) -> Result<(), NippyJarError> {
-        match self {
-            Self::Fmph(f) => f.set_keys(keys),
-            Self::GoFmph(f) => f.set_keys(keys),
-        }
-    }
-
-    fn get_index(&self, key: &[u8]) -> Result<Option<u64>, NippyJarError> {
-        match self {
-            Self::Fmph(f) => f.get_index(key),
-            Self::GoFmph(f) => f.get_index(key),
-        }
-    }
-}
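What disappears with `phf/mod.rs` is a static-dispatch wrapper: one serializable enum fanning each trait method out to its variants instead of a `dyn` trait object. The shape of the pattern in miniature (toy backends, hypothetical names):

    trait Phf {
        fn get_index(&self, key: &[u8]) -> Option<u64>;
    }

    struct BackendA;
    struct BackendB;

    impl Phf for BackendA {
        fn get_index(&self, _key: &[u8]) -> Option<u64> { Some(0) }
    }
    impl Phf for BackendB {
        fn get_index(&self, _key: &[u8]) -> Option<u64> { Some(1) }
    }

    // Enum dispatch keeps calls statically resolved and the wrapper easy to
    // serialize, at the cost of one match arm per backend and per method.
    enum Backends {
        A(BackendA),
        B(BackendB),
    }

    impl Phf for Backends {
        fn get_index(&self, key: &[u8]) -> Option<u64> {
            match self {
                Self::A(f) => f.get_index(key),
                Self::B(f) => f.get_index(key),
            }
        }
    }

    fn main() {
        let backend = Backends::B(BackendB);
        assert_eq!(backend.get_index(b"key"), Some(1));
    }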