mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-26 03:55:07 +08:00
Generate the dictionary from the first 10k documents
This commit is contained in:
parent
0d63d02ab2
commit
767f20e30d
24
Cargo.lock
generated
24
Cargo.lock
generated
@ -3563,6 +3563,7 @@ dependencies = [
|
||||
"ureq",
|
||||
"url",
|
||||
"uuid",
|
||||
"zstd 0.11.2+zstd.1.5.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -6406,7 +6407,7 @@ dependencies = [
|
||||
"time",
|
||||
"zeroize",
|
||||
"zopfli",
|
||||
"zstd",
|
||||
"zstd 0.13.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -6423,13 +6424,32 @@ dependencies = [
|
||||
"simd-adler32",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.11.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
|
||||
dependencies = [
|
||||
"zstd-safe 5.0.2+zstd.1.5.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd"
|
||||
version = "0.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fcf2b778a664581e31e389454a7072dab1647606d44f7feea22cd5abb9c9f3f9"
|
||||
dependencies = [
|
||||
"zstd-safe",
|
||||
"zstd-safe 7.2.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zstd-safe"
|
||||
version = "5.0.2+zstd.1.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"zstd-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
@ -39,6 +39,7 @@ indexmap = { version = "2.2.6", features = ["serde"] }
|
||||
json-depth-checker = { path = "../json-depth-checker" }
|
||||
levenshtein_automata = { version = "0.2.1", features = ["fst_automaton"] }
|
||||
lz4_flex = "0.11.3"
|
||||
zstd = { version = "0.11.2", features = ["zdict_builder"] }
|
||||
memmap2 = "0.9.4"
|
||||
obkv = "0.2.2"
|
||||
once_cell = "1.19.0"
|
||||
|
@ -28,13 +28,13 @@ impl<'a> CompressedKvReaderU16<'a> {
|
||||
pub fn decompress_with<'b>(
|
||||
&self,
|
||||
buffer: &'b mut Vec<u8>,
|
||||
dictionnary: &[u8],
|
||||
dictionary: &[u8],
|
||||
) -> Result<KvReaderU16<'b>, lz4_flex::block::DecompressError> {
|
||||
let (size, input) = lz4_flex::block::uncompressed_size(self.0)?;
|
||||
buffer.resize(size, 0);
|
||||
// TODO loop to increase the buffer size if need be
|
||||
let size =
|
||||
lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionnary)?;
|
||||
lz4_flex::block::decompress_into_with_dict(input, &mut buffer[..size], dictionary)?;
|
||||
Ok(KvReaderU16::new(&buffer[..size]))
|
||||
}
|
||||
|
||||
|
@ -5,7 +5,7 @@ mod transform;
|
||||
mod typed_chunk;
|
||||
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::io::{Read, Seek};
|
||||
use std::io::{BufWriter, Read, Seek, Write};
|
||||
use std::iter;
|
||||
use std::num::NonZeroU32;
|
||||
use std::result::Result as StdResult;
|
||||
@ -41,7 +41,7 @@ use crate::update::{
|
||||
IndexerConfig, UpdateIndexingStep, WordPrefixDocids, WordPrefixIntegerDocids, WordsPrefixesFst,
|
||||
};
|
||||
use crate::vector::EmbeddingConfigs;
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result};
|
||||
use crate::{CboRoaringBitmapCodec, Index, Result, BEU32};
|
||||
|
||||
static MERGED_DATABASE_COUNT: usize = 7;
|
||||
static PREFIX_DATABASE_COUNT: usize = 4;
|
||||
@ -568,7 +568,7 @@ where
|
||||
|
||||
// TODO increase this number to 10k and put it in a const somewhere
|
||||
// I don't like that this dangerous condition is here...
|
||||
if number_of_documents > 1_000
|
||||
if number_of_documents > 10_000
|
||||
&& self.index.document_compression_dictionary(self.wtxn)?.is_none()
|
||||
{
|
||||
self.manage_compression_dictionary()?;
|
||||
@ -767,17 +767,29 @@ where
|
||||
name = "compress_documents_database"
|
||||
)]
|
||||
pub fn manage_compression_dictionary(&mut self) -> Result<()> {
|
||||
// TODO This is a dumb dictionary, just so you get the idea.
|
||||
// We need to compute a better one by using zstd or something else.
|
||||
let dictionary = b"movietraileradventurehorror";
|
||||
self.index.put_document_compression_dictionary(self.wtxn, dictionary)?;
|
||||
let mut sample_file = tempfile::tempfile().map(BufWriter::new)?;
|
||||
let mut sample_sizes = Vec::new();
|
||||
// TODO move this 10_000 sample size into a const
|
||||
let documents = self.index.documents.remap_types::<BEU32, Bytes>();
|
||||
for result in documents.iter(self.wtxn)?.take(10_000) {
|
||||
let (_id, bytes) = result?;
|
||||
sample_file.write_all(bytes)?;
|
||||
sample_sizes.push(bytes.len());
|
||||
}
|
||||
|
||||
// TODO manage this unwrap correctly
|
||||
let sample_file = sample_file.into_inner().unwrap();
|
||||
let sample_data = unsafe { memmap2::Mmap::map(&sample_file)? };
|
||||
// TODO make this 64_000 const
|
||||
let dictionary = zstd::dict::from_continuous(&sample_data, &sample_sizes, 64_000)?;
|
||||
self.index.put_document_compression_dictionary(self.wtxn, &dictionary)?;
|
||||
|
||||
// TODO do not remap types here but rather expose the &[u8] for the KvReaderU16
|
||||
let mut iter = self.index.documents.remap_data_type::<Bytes>().iter_mut(self.wtxn)?;
|
||||
while let Some(result) = iter.next() {
|
||||
let (docid, document) = result?;
|
||||
// TODO manage this unwrap correctly
|
||||
let compressed = CompressedKvWriterU16::new_with_dictionary(document, dictionary);
|
||||
let compressed = CompressedKvWriterU16::new_with_dictionary(document, &dictionary);
|
||||
// SAFETY: the compressed document is entirely owned
|
||||
unsafe {
|
||||
iter.put_current_with_options::<CompressedObkvCodec>(
|
||||
|
Loading…
Reference in New Issue
Block a user