// meilisearch/milli/src/update/word_prefix_pair_proximity_docids.rs

use std::str;

use fst::automaton::{Automaton, Str};
use fst::{IntoStreamer, Streamer};
use grenad::CompressionType;
use heed::types::ByteSlice;
use heed::BytesEncode;
use log::debug;

use crate::heed_codec::StrStrU8Codec;
use crate::update::index_documents::{
    cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,
};
use crate::{Index, Result};

/// Update operation that rebuilds the `word_prefix_pair_proximity_docids`
/// database of an [`Index`].
///
/// The database is cleared and then repopulated from the index's
/// `word_pair_proximity_docids` database and its words-prefixes FST when
/// `execute` is called. The `pub(crate)` fields are tuning knobs that are
/// handed unchanged to `create_sorter`; `new` initialises them to
/// `CompressionType::None` / `None`.
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
    // Write transaction used for every database read and write of the update.
    wtxn: &'t mut heed::RwTxn<'i, 'u>,
    // Index whose databases are read and rewritten.
    index: &'i Index,
    // Sorter setting, forwarded as-is to `create_sorter`.
    pub(crate) chunk_compression_type: CompressionType,
    // Sorter setting, forwarded as-is to `create_sorter`.
    pub(crate) chunk_compression_level: Option<u32>,
    // Sorter setting, forwarded as-is to `create_sorter`.
    pub(crate) chunk_fusing_shrink_size: Option<u64>,
    // Sorter setting, forwarded as-is to `create_sorter`.
    pub(crate) max_nb_chunks: Option<usize>,
    // Sorter setting, forwarded as-is to `create_sorter`.
    pub(crate) max_memory: Option<usize>,
}

impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
    pub fn new(
        wtxn: &'t mut heed::RwTxn<'i, 'u>,
        index: &'i Index,
    ) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {
        WordPrefixPairProximityDocids {
            wtxn,
            index,
            chunk_compression_type: CompressionType::None,
            chunk_compression_level: None,
            chunk_fusing_shrink_size: None,
            max_nb_chunks: None,
            max_memory: None,
        }
    }

    pub fn execute(self) -> Result<()> {
        debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");

        self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;

        let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;

        // Here we create a sorter akin to the previous one.
        let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
            cbo_roaring_bitmap_merge,
            self.chunk_compression_type,
            self.chunk_compression_level,
            self.chunk_fusing_shrink_size,
            self.max_nb_chunks,
            self.max_memory,
        );

        // We insert all the word pairs corresponding to the word-prefix pairs
        // where the prefixes appears in the prefix FST previously constructed.
        let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
        for result in db.iter(self.wtxn)? {
            let ((word1, word2, prox), data) = result?;
            let automaton = Str::new(word2).starts_with();
            let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
            while let Some(prefix) = matching_prefixes.next() {
                let prefix = str::from_utf8(prefix)?;
                let pair = (word1, prefix, prox);
                let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
                word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
            }
        }

        drop(prefix_fst);

        // We finally write the word prefix pair proximity docids into the LMDB database.
        sorter_into_lmdb_database(
            self.wtxn,
            *self.index.word_prefix_pair_proximity_docids.as_polymorph(),
            word_prefix_pair_proximity_docids_sorter,
            cbo_roaring_bitmap_merge,
            WriteMethod::Append,
        )?;

        Ok(())
    }
}
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`use std::str;`

			`use fst::automaton::{Automaton, Str};`
format the whole project 2021-06-17 00:33:33 +08:00			`use fst::{IntoStreamer, Streamer};`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`use grenad::CompressionType;`
			`use heed::types::ByteSlice;`
format the whole project 2021-06-17 00:33:33 +08:00			`use heed::BytesEncode;`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`use log::debug;`

			`use crate::heed_codec::StrStrU8Codec;`
			`use crate::update::index_documents::{`
format the whole project 2021-06-17 00:33:33 +08:00			`cbo_roaring_bitmap_merge, create_sorter, sorter_into_lmdb_database, WriteMethod,`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`};`
format the whole project 2021-06-17 00:33:33 +08:00			`use crate::{Index, Result};`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00
			`pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {`
			`wtxn: &'t mut heed::RwTxn<'i, 'u>,`
			`index: &'i Index,`
			`pub(crate) chunk_compression_type: CompressionType,`
			`pub(crate) chunk_compression_level: Option<u32>,`
			`pub(crate) chunk_fusing_shrink_size: Option<u64>,`
			`pub(crate) max_nb_chunks: Option<usize>,`
			`pub(crate) max_memory: Option<usize>,`
			`}`

			`impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {`
			`pub fn new(`
			`wtxn: &'t mut heed::RwTxn<'i, 'u>,`
			`index: &'i Index,`
format the whole project 2021-06-17 00:33:33 +08:00			`) -> WordPrefixPairProximityDocids<'t, 'u, 'i> {`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`WordPrefixPairProximityDocids {`
			`wtxn,`
			`index,`
			`chunk_compression_type: CompressionType::None,`
			`chunk_compression_level: None,`
			`chunk_fusing_shrink_size: None,`
			`max_nb_chunks: None,`
			`max_memory: None,`
			`}`
			`}`

Use the Error enum everywhere in the project 2021-06-14 22:46:19 +08:00			`pub fn execute(self) -> Result<()> {`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");`

			`self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;`

			`let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;`

			`// Here we create a sorter akin to the previous one.`
			`let mut word_prefix_pair_proximity_docids_sorter = create_sorter(`
Prefer using an explicit merge function name 2021-06-09 18:17:11 +08:00			`cbo_roaring_bitmap_merge,`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`self.chunk_compression_type,`
			`self.chunk_compression_level,`
			`self.chunk_fusing_shrink_size,`
			`self.max_nb_chunks,`
			`self.max_memory,`
			`);`

			`// We insert all the word pairs corresponding to the word-prefix pairs`
			`// where the prefixes appears in the prefix FST previously constructed.`
			`let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();`
			`for result in db.iter(self.wtxn)? {`
			`let ((word1, word2, prox), data) = result?;`
			`let automaton = Str::new(word2).starts_with();`
			`let mut matching_prefixes = prefix_fst.search(automaton).into_stream();`
			`while let Some(prefix) = matching_prefixes.next() {`
			`let prefix = str::from_utf8(prefix)?;`
			`let pair = (word1, prefix, prox);`
			`let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();`
			`word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;`
			`}`
			`}`

			`drop(prefix_fst);`

			`// We finally write the word prefix pair proximity docids into the LMDB database.`
			`sorter_into_lmdb_database(`
			`self.wtxn,`
			`*self.index.word_prefix_pair_proximity_docids.as_polymorph(),`
			`word_prefix_pair_proximity_docids_sorter,`
Prefer using an explicit merge function name 2021-06-09 18:17:11 +08:00			`cbo_roaring_bitmap_merge,`
Compute the words prefixes at the end of an update 2021-03-25 18:10:12 +08:00			`WriteMethod::Append,`
			`)?;`

			`Ok(())`
			`}`
			`}`