mirror of
https://github.com/meilisearch/meilisearch.git
synced 2025-01-18 08:48:32 +08:00
Compute the words prefixes at the end of an update
This commit is contained in:
parent
ab92c814c3
commit
e65bad16cc
@ -228,8 +228,6 @@ enum UpdateMeta {
|
|||||||
ClearDocuments,
|
ClearDocuments,
|
||||||
Settings(Settings),
|
Settings(Settings),
|
||||||
Facets(Facets),
|
Facets(Facets),
|
||||||
WordsPrefixes(WordsPrefixes),
|
|
||||||
WordsLevelPositions(WordsLevelPositions),
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize)]
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
||||||
@ -290,14 +288,6 @@ struct WordsLevelPositions {
|
|||||||
min_level_size: Option<NonZeroU32>,
|
min_level_size: Option<NonZeroU32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Any value that is present is considered Some value, including null.
|
|
||||||
fn deserialize_some<'de, T, D>(deserializer: D) -> Result<Option<T>, D::Error>
|
|
||||||
where T: Deserialize<'de>,
|
|
||||||
D: Deserializer<'de>
|
|
||||||
{
|
|
||||||
Deserialize::deserialize(deserializer).map(Some)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[tokio::main]
|
#[tokio::main]
|
||||||
async fn main() -> anyhow::Result<()> {
|
async fn main() -> anyhow::Result<()> {
|
||||||
let opt = Opt::from_args();
|
let opt = Opt::from_args();
|
||||||
@ -496,36 +486,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
Err(e) => Err(e)
|
Err(e) => Err(e)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
UpdateMeta::WordsPrefixes(settings) => {
|
|
||||||
// We must use the write transaction of the update here.
|
|
||||||
let mut wtxn = index_cloned.write_txn()?;
|
|
||||||
let mut builder = update_builder.words_prefixes(&mut wtxn, &index_cloned);
|
|
||||||
if let Some(value) = settings.threshold {
|
|
||||||
builder.threshold(value);
|
|
||||||
}
|
|
||||||
if let Some(value) = settings.max_prefix_length {
|
|
||||||
builder.max_prefix_length(value);
|
|
||||||
}
|
|
||||||
match builder.execute() {
|
|
||||||
Ok(()) => wtxn.commit().map_err(Into::into),
|
|
||||||
Err(e) => Err(e)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
UpdateMeta::WordsLevelPositions(levels) => {
|
|
||||||
// We must use the write transaction of the update here.
|
|
||||||
let mut wtxn = index_cloned.write_txn()?;
|
|
||||||
let mut builder = update_builder.words_level_positions(&mut wtxn, &index_cloned);
|
|
||||||
if let Some(value) = levels.level_group_size {
|
|
||||||
builder.level_group_size(value);
|
|
||||||
}
|
|
||||||
if let Some(value) = levels.min_level_size {
|
|
||||||
builder.min_level_size(value);
|
|
||||||
}
|
|
||||||
match builder.execute() {
|
|
||||||
Ok(()) => wtxn.commit().map_err(Into::into),
|
|
||||||
Err(e) => Err(e.into())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
};
|
||||||
|
|
||||||
let meta = match result {
|
let meta = match result {
|
||||||
@ -942,32 +902,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
warp::reply()
|
warp::reply()
|
||||||
});
|
});
|
||||||
|
|
||||||
let update_store_cloned = update_store.clone();
|
|
||||||
let update_status_sender_cloned = update_status_sender.clone();
|
|
||||||
let change_words_prefixes_route = warp::filters::method::post()
|
|
||||||
.and(warp::path!("words-prefixes"))
|
|
||||||
.and(warp::body::json())
|
|
||||||
.map(move |settings: WordsPrefixes| {
|
|
||||||
let meta = UpdateMeta::WordsPrefixes(settings);
|
|
||||||
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
|
||||||
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
|
||||||
eprintln!("update {} registered", update_id);
|
|
||||||
warp::reply()
|
|
||||||
});
|
|
||||||
|
|
||||||
let update_store_cloned = update_store.clone();
|
|
||||||
let update_status_sender_cloned = update_status_sender.clone();
|
|
||||||
let change_words_level_positions_route = warp::filters::method::post()
|
|
||||||
.and(warp::path!("words-level-positions"))
|
|
||||||
.and(warp::body::json())
|
|
||||||
.map(move |levels: WordsLevelPositions| {
|
|
||||||
let meta = UpdateMeta::WordsLevelPositions(levels);
|
|
||||||
let update_id = update_store_cloned.register_update(&meta, &[]).unwrap();
|
|
||||||
let _ = update_status_sender_cloned.send(UpdateStatus::Pending { update_id, meta });
|
|
||||||
eprintln!("update {} registered", update_id);
|
|
||||||
warp::reply()
|
|
||||||
});
|
|
||||||
|
|
||||||
let update_store_cloned = update_store.clone();
|
let update_store_cloned = update_store.clone();
|
||||||
let update_status_sender_cloned = update_status_sender.clone();
|
let update_status_sender_cloned = update_status_sender.clone();
|
||||||
let abort_update_id_route = warp::filters::method::delete()
|
let abort_update_id_route = warp::filters::method::delete()
|
||||||
@ -1042,8 +976,6 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
.or(clearing_route)
|
.or(clearing_route)
|
||||||
.or(change_settings_route)
|
.or(change_settings_route)
|
||||||
.or(change_facet_levels_route)
|
.or(change_facet_levels_route)
|
||||||
.or(change_words_prefixes_route)
|
|
||||||
.or(change_words_level_positions_route)
|
|
||||||
.or(update_ws_route);
|
.or(update_ws_route);
|
||||||
|
|
||||||
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
let addr = SocketAddr::from_str(&opt.http_listen_addr)?;
|
||||||
|
@ -338,6 +338,7 @@ fn biggest_value_sizes(index: &Index, rtxn: &heed::RoTxn, limit: usize) -> anyho
|
|||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
field_id_docid_facet_values: _,
|
field_id_docid_facet_values: _,
|
||||||
documents,
|
documents,
|
||||||
|
..
|
||||||
} = index;
|
} = index;
|
||||||
|
|
||||||
let main_name = "main";
|
let main_name = "main";
|
||||||
|
@ -54,6 +54,8 @@ pub struct Index {
|
|||||||
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
pub word_prefix_pair_proximity_docids: Database<StrStrU8Codec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the word, level and position range with the docids that corresponds to it.
|
/// Maps the word, level and position range with the docids that corresponds to it.
|
||||||
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
pub word_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||||
|
/// Maps the level positions of a word prefix with all the docids where this prefix appears.
|
||||||
|
pub word_prefix_level_position_docids: Database<StrLevelPositionCodec, CboRoaringBitmapCodec>,
|
||||||
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
|
/// Maps the facet field id and the globally ordered value with the docids that corresponds to it.
|
||||||
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
|
pub facet_field_id_value_docids: Database<ByteSlice, CboRoaringBitmapCodec>,
|
||||||
/// Maps the document id, the facet field id and the globally ordered value.
|
/// Maps the document id, the facet field id and the globally ordered value.
|
||||||
@ -64,7 +66,7 @@ pub struct Index {
|
|||||||
|
|
||||||
impl Index {
|
impl Index {
|
||||||
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
pub fn new<P: AsRef<Path>>(mut options: heed::EnvOpenOptions, path: P) -> anyhow::Result<Index> {
|
||||||
options.max_dbs(10);
|
options.max_dbs(11);
|
||||||
|
|
||||||
let env = options.open(path)?;
|
let env = options.open(path)?;
|
||||||
let main = env.create_poly_database(Some("main"))?;
|
let main = env.create_poly_database(Some("main"))?;
|
||||||
@ -74,6 +76,7 @@ impl Index {
|
|||||||
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
let word_pair_proximity_docids = env.create_database(Some("word-pair-proximity-docids"))?;
|
||||||
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
|
let word_prefix_pair_proximity_docids = env.create_database(Some("word-prefix-pair-proximity-docids"))?;
|
||||||
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
|
let word_level_position_docids = env.create_database(Some("word-level-position-docids"))?;
|
||||||
|
let word_prefix_level_position_docids = env.create_database(Some("word-prefix-level-position-docids"))?;
|
||||||
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
|
let facet_field_id_value_docids = env.create_database(Some("facet-field-id-value-docids"))?;
|
||||||
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
|
let field_id_docid_facet_values = env.create_database(Some("field-id-docid-facet-values"))?;
|
||||||
let documents = env.create_database(Some("documents"))?;
|
let documents = env.create_database(Some("documents"))?;
|
||||||
@ -98,6 +101,7 @@ impl Index {
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
word_prefix_level_position_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
field_id_docid_facet_values,
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
|
@ -29,6 +29,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
word_prefix_level_position_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
field_id_docid_facet_values,
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
@ -57,6 +58,7 @@ impl<'t, 'u, 'i> ClearDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids.clear(self.wtxn)?;
|
word_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
word_level_position_docids.clear(self.wtxn)?;
|
word_level_position_docids.clear(self.wtxn)?;
|
||||||
|
word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||||
facet_field_id_value_docids.clear(self.wtxn)?;
|
facet_field_id_value_docids.clear(self.wtxn)?;
|
||||||
field_id_docid_facet_values.clear(self.wtxn)?;
|
field_id_docid_facet_values.clear(self.wtxn)?;
|
||||||
documents.clear(self.wtxn)?;
|
documents.clear(self.wtxn)?;
|
||||||
|
@ -89,6 +89,7 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
word_pair_proximity_docids,
|
word_pair_proximity_docids,
|
||||||
word_prefix_pair_proximity_docids,
|
word_prefix_pair_proximity_docids,
|
||||||
word_level_position_docids,
|
word_level_position_docids,
|
||||||
|
word_prefix_level_position_docids,
|
||||||
facet_field_id_value_docids,
|
facet_field_id_value_docids,
|
||||||
field_id_docid_facet_values,
|
field_id_docid_facet_values,
|
||||||
documents,
|
documents,
|
||||||
@ -345,6 +346,21 @@ impl<'t, 'u, 'i> DeleteDocuments<'t, 'u, 'i> {
|
|||||||
|
|
||||||
drop(iter);
|
drop(iter);
|
||||||
|
|
||||||
|
// We delete the documents ids that are under the word prefix level position docids.
|
||||||
|
let mut iter = word_prefix_level_position_docids.iter_mut(self.wtxn)?.remap_key_type::<ByteSlice>();
|
||||||
|
while let Some(result) = iter.next() {
|
||||||
|
let (bytes, mut docids) = result?;
|
||||||
|
let previous_len = docids.len();
|
||||||
|
docids.difference_with(&self.documents_ids);
|
||||||
|
if docids.is_empty() {
|
||||||
|
iter.del_current()?;
|
||||||
|
} else if docids.len() != previous_len {
|
||||||
|
iter.put_current(bytes, &docids)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(iter);
|
||||||
|
|
||||||
Ok(self.documents_ids.len())
|
Ok(self.documents_ids.len())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -52,6 +52,10 @@ pub fn words_pairs_proximities_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -
|
|||||||
cbo_roaring_bitmap_merge(values)
|
cbo_roaring_bitmap_merge(values)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn word_prefix_level_positions_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
|
cbo_roaring_bitmap_merge(values)
|
||||||
|
}
|
||||||
|
|
||||||
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
pub fn word_level_position_docids_merge(_key: &[u8], values: &[Cow<[u8]>]) -> anyhow::Result<Vec<u8>> {
|
||||||
cbo_roaring_bitmap_merge(values)
|
cbo_roaring_bitmap_merge(values)
|
||||||
}
|
}
|
||||||
|
@ -3,6 +3,7 @@ use std::collections::HashSet;
|
|||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::io::{self, Seek, SeekFrom};
|
use std::io::{self, Seek, SeekFrom};
|
||||||
use std::num::{NonZeroU32, NonZeroUsize};
|
use std::num::{NonZeroU32, NonZeroUsize};
|
||||||
|
use std::str;
|
||||||
use std::sync::mpsc::sync_channel;
|
use std::sync::mpsc::sync_channel;
|
||||||
use std::time::Instant;
|
use std::time::Instant;
|
||||||
|
|
||||||
@ -13,18 +14,21 @@ use grenad::{MergerIter, Writer, Sorter, Merger, Reader, FileFuse, CompressionTy
|
|||||||
use heed::types::ByteSlice;
|
use heed::types::ByteSlice;
|
||||||
use log::{debug, info, error};
|
use log::{debug, info, error};
|
||||||
use memmap::Mmap;
|
use memmap::Mmap;
|
||||||
use rayon::ThreadPool;
|
|
||||||
use rayon::prelude::*;
|
use rayon::prelude::*;
|
||||||
|
use rayon::ThreadPool;
|
||||||
use serde::{Serialize, Deserialize};
|
use serde::{Serialize, Deserialize};
|
||||||
|
|
||||||
use crate::index::Index;
|
use crate::index::Index;
|
||||||
use crate::update::{Facets, WordsLevelPositions, WordsPrefixes, UpdateIndexingStep};
|
use crate::update::{
|
||||||
|
Facets, WordsLevelPositions, WordPrefixDocids, WordsPrefixesFst, UpdateIndexingStep,
|
||||||
|
WordPrefixPairProximityDocids,
|
||||||
|
};
|
||||||
use self::store::{Store, Readers};
|
use self::store::{Store, Readers};
|
||||||
pub use self::merge_function::{
|
pub use self::merge_function::{
|
||||||
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
main_merge, word_docids_merge, words_pairs_proximities_docids_merge,
|
||||||
docid_word_positions_merge, documents_merge,
|
docid_word_positions_merge, documents_merge,
|
||||||
word_level_position_docids_merge, facet_field_value_docids_merge,
|
word_level_position_docids_merge, word_prefix_level_positions_docids_merge,
|
||||||
field_id_docid_facet_values_merge,
|
facet_field_value_docids_merge, field_id_docid_facet_values_merge,
|
||||||
};
|
};
|
||||||
pub use self::transform::{Transform, TransformOutput};
|
pub use self::transform::{Transform, TransformOutput};
|
||||||
|
|
||||||
@ -719,10 +723,7 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
builder.execute()?;
|
builder.execute()?;
|
||||||
|
|
||||||
// Run the words prefixes update operation.
|
// Run the words prefixes update operation.
|
||||||
let mut builder = WordsPrefixes::new(self.wtxn, self.index, self.update_id);
|
let mut builder = WordsPrefixesFst::new(self.wtxn, self.index, self.update_id);
|
||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
|
||||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
|
||||||
if let Some(value) = self.words_prefix_threshold {
|
if let Some(value) = self.words_prefix_threshold {
|
||||||
builder.threshold(value);
|
builder.threshold(value);
|
||||||
}
|
}
|
||||||
@ -731,8 +732,26 @@ impl<'t, 'u, 'i, 'a> IndexDocuments<'t, 'u, 'i, 'a> {
|
|||||||
}
|
}
|
||||||
builder.execute()?;
|
builder.execute()?;
|
||||||
|
|
||||||
|
// Run the word prefix docids update operation.
|
||||||
|
let mut builder = WordPrefixDocids::new(self.wtxn, self.index);
|
||||||
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
|
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||||
|
builder.max_nb_chunks = self.max_nb_chunks;
|
||||||
|
builder.max_memory = self.max_memory;
|
||||||
|
builder.execute()?;
|
||||||
|
|
||||||
|
// Run the word prefix pair proximity docids update operation.
|
||||||
|
let mut builder = WordPrefixPairProximityDocids::new(self.wtxn, self.index);
|
||||||
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
|
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||||
|
builder.max_nb_chunks = self.max_nb_chunks;
|
||||||
|
builder.max_memory = self.max_memory;
|
||||||
|
builder.execute()?;
|
||||||
|
|
||||||
// Run the words level positions update operation.
|
// Run the words level positions update operation.
|
||||||
let mut builder = WordsLevelPositions::new(self.wtxn, self.index, self.update_id);
|
let mut builder = WordsLevelPositions::new(self.wtxn, self.index);
|
||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
builder.chunk_compression_type = self.chunk_compression_type;
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
builder.chunk_compression_level = self.chunk_compression_level;
|
||||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
||||||
|
@ -6,8 +6,10 @@ pub use self::index_documents::{DocumentAdditionResult, IndexDocuments, IndexDoc
|
|||||||
pub use self::settings::{Setting, Settings};
|
pub use self::settings::{Setting, Settings};
|
||||||
pub use self::update_builder::UpdateBuilder;
|
pub use self::update_builder::UpdateBuilder;
|
||||||
pub use self::update_step::UpdateIndexingStep;
|
pub use self::update_step::UpdateIndexingStep;
|
||||||
|
pub use self::word_prefix_docids::WordPrefixDocids;
|
||||||
|
pub use self::word_prefix_pair_proximity_docids::WordPrefixPairProximityDocids;
|
||||||
pub use self::words_level_positions::WordsLevelPositions;
|
pub use self::words_level_positions::WordsLevelPositions;
|
||||||
pub use self::words_prefixes::WordsPrefixes;
|
pub use self::words_prefixes_fst::WordsPrefixesFst;
|
||||||
|
|
||||||
mod available_documents_ids;
|
mod available_documents_ids;
|
||||||
mod clear_documents;
|
mod clear_documents;
|
||||||
@ -17,6 +19,7 @@ mod index_documents;
|
|||||||
mod settings;
|
mod settings;
|
||||||
mod update_builder;
|
mod update_builder;
|
||||||
mod update_step;
|
mod update_step;
|
||||||
|
mod word_prefix_docids;
|
||||||
|
mod word_prefix_pair_proximity_docids;
|
||||||
mod words_level_positions;
|
mod words_level_positions;
|
||||||
mod words_prefixes;
|
mod words_prefixes_fst;
|
||||||
|
|
||||||
|
@ -2,10 +2,7 @@ use grenad::CompressionType;
|
|||||||
use rayon::ThreadPool;
|
use rayon::ThreadPool;
|
||||||
|
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use super::{
|
use super::{ClearDocuments, DeleteDocuments, IndexDocuments, Settings, Facets};
|
||||||
ClearDocuments, DeleteDocuments, IndexDocuments, Settings,
|
|
||||||
Facets, WordsPrefixes, WordsLevelPositions,
|
|
||||||
};
|
|
||||||
|
|
||||||
pub struct UpdateBuilder<'a> {
|
pub struct UpdateBuilder<'a> {
|
||||||
pub(crate) log_every_n: Option<usize>,
|
pub(crate) log_every_n: Option<usize>,
|
||||||
@ -138,34 +135,4 @@ impl<'a> UpdateBuilder<'a> {
|
|||||||
|
|
||||||
builder
|
builder
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn words_prefixes<'t, 'u, 'i>(
|
|
||||||
self,
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
) -> WordsPrefixes<'t, 'u, 'i>
|
|
||||||
{
|
|
||||||
let mut builder = WordsPrefixes::new(wtxn, index, self.update_id);
|
|
||||||
|
|
||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
|
||||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
|
||||||
|
|
||||||
builder
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn words_level_positions<'t, 'u, 'i>(
|
|
||||||
self,
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
) -> WordsLevelPositions<'t, 'u, 'i>
|
|
||||||
{
|
|
||||||
let mut builder = WordsLevelPositions::new(wtxn, index, self.update_id);
|
|
||||||
|
|
||||||
builder.chunk_compression_type = self.chunk_compression_type;
|
|
||||||
builder.chunk_compression_level = self.chunk_compression_level;
|
|
||||||
builder.chunk_fusing_shrink_size = self.chunk_fusing_shrink_size;
|
|
||||||
|
|
||||||
builder
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
75
milli/src/update/word_prefix_docids.rs
Normal file
75
milli/src/update/word_prefix_docids.rs
Normal file
@ -0,0 +1,75 @@
|
|||||||
|
use std::str;
|
||||||
|
|
||||||
|
use crate::Index;
|
||||||
|
use fst::Streamer;
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
|
||||||
|
use crate::update::index_documents::WriteMethod;
|
||||||
|
use crate::update::index_documents::{create_sorter, word_docids_merge, sorter_into_lmdb_database};
|
||||||
|
|
||||||
|
pub struct WordPrefixDocids<'t, 'u, 'i> {
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
|
pub(crate) max_nb_chunks: Option<usize>,
|
||||||
|
pub(crate) max_memory: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, 'u, 'i> WordPrefixDocids<'t, 'u, 'i> {
|
||||||
|
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordPrefixDocids<'t, 'u, 'i> {
|
||||||
|
WordPrefixDocids {
|
||||||
|
wtxn,
|
||||||
|
index,
|
||||||
|
chunk_compression_type: CompressionType::None,
|
||||||
|
chunk_compression_level: None,
|
||||||
|
chunk_fusing_shrink_size: None,
|
||||||
|
max_nb_chunks: None,
|
||||||
|
max_memory: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn execute(self) -> anyhow::Result<()> {
|
||||||
|
// Clear the word prefix docids database.
|
||||||
|
self.index.word_prefix_docids.clear(self.wtxn)?;
|
||||||
|
|
||||||
|
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
|
|
||||||
|
// It is forbidden to keep a mutable reference into the database
|
||||||
|
// and write into it at the same time, therefore we write into another file.
|
||||||
|
let mut prefix_docids_sorter = create_sorter(
|
||||||
|
word_docids_merge,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.max_nb_chunks,
|
||||||
|
self.max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
// We iterate over all the prefixes and retrieve the corresponding docids.
|
||||||
|
let mut prefix_stream = prefix_fst.stream();
|
||||||
|
while let Some(bytes) = prefix_stream.next() {
|
||||||
|
let prefix = str::from_utf8(bytes)?;
|
||||||
|
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
|
||||||
|
for result in db.prefix_iter(self.wtxn, prefix)? {
|
||||||
|
let (_word, data) = result?;
|
||||||
|
prefix_docids_sorter.insert(prefix, data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(prefix_fst);
|
||||||
|
|
||||||
|
// We finally write the word prefix docids into the LMDB database.
|
||||||
|
sorter_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.word_prefix_docids.as_polymorph(),
|
||||||
|
prefix_docids_sorter,
|
||||||
|
word_docids_merge,
|
||||||
|
WriteMethod::Append,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
89
milli/src/update/word_prefix_pair_proximity_docids.rs
Normal file
89
milli/src/update/word_prefix_pair_proximity_docids.rs
Normal file
@ -0,0 +1,89 @@
|
|||||||
|
use std::str;
|
||||||
|
|
||||||
|
use fst::automaton::{Automaton, Str};
|
||||||
|
use fst::{Streamer, IntoStreamer};
|
||||||
|
use grenad::CompressionType;
|
||||||
|
use heed::BytesEncode;
|
||||||
|
use heed::types::ByteSlice;
|
||||||
|
use log::debug;
|
||||||
|
|
||||||
|
use crate::Index;
|
||||||
|
use crate::heed_codec::StrStrU8Codec;
|
||||||
|
use crate::update::index_documents::{
|
||||||
|
WriteMethod, create_sorter, sorter_into_lmdb_database,
|
||||||
|
words_pairs_proximities_docids_merge,
|
||||||
|
};
|
||||||
|
|
||||||
|
pub struct WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
|
pub(crate) max_nb_chunks: Option<usize>,
|
||||||
|
pub(crate) max_memory: Option<usize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, 'u, 'i> WordPrefixPairProximityDocids<'t, 'u, 'i> {
|
||||||
|
pub fn new(
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
) -> WordPrefixPairProximityDocids<'t, 'u, 'i>
|
||||||
|
{
|
||||||
|
WordPrefixPairProximityDocids {
|
||||||
|
wtxn,
|
||||||
|
index,
|
||||||
|
chunk_compression_type: CompressionType::None,
|
||||||
|
chunk_compression_level: None,
|
||||||
|
chunk_fusing_shrink_size: None,
|
||||||
|
max_nb_chunks: None,
|
||||||
|
max_memory: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn execute(self) -> anyhow::Result<()> {
|
||||||
|
debug!("Computing and writing the word prefix pair proximity docids into LMDB on disk...");
|
||||||
|
|
||||||
|
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
||||||
|
|
||||||
|
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
|
|
||||||
|
// Here we create a sorter akin to the previous one.
|
||||||
|
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
|
||||||
|
words_pairs_proximities_docids_merge,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.max_nb_chunks,
|
||||||
|
self.max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
// We insert all the word pairs corresponding to the word-prefix pairs
|
||||||
|
// where the prefixes appears in the prefix FST previously constructed.
|
||||||
|
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
|
||||||
|
for result in db.iter(self.wtxn)? {
|
||||||
|
let ((word1, word2, prox), data) = result?;
|
||||||
|
let automaton = Str::new(word2).starts_with();
|
||||||
|
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||||
|
while let Some(prefix) = matching_prefixes.next() {
|
||||||
|
let prefix = str::from_utf8(prefix)?;
|
||||||
|
let pair = (word1, prefix, prox);
|
||||||
|
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
|
||||||
|
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
drop(prefix_fst);
|
||||||
|
|
||||||
|
// We finally write the word prefix pair proximity docids into the LMDB database.
|
||||||
|
sorter_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
||||||
|
word_prefix_pair_proximity_docids_sorter,
|
||||||
|
words_pairs_proximities_docids_merge,
|
||||||
|
WriteMethod::Append,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
@ -1,17 +1,22 @@
|
|||||||
use std::cmp;
|
use std::{cmp, str};
|
||||||
use std::convert::TryFrom;
|
use std::convert::TryFrom;
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
use std::num::NonZeroU32;
|
use std::num::NonZeroU32;
|
||||||
|
|
||||||
|
use fst::automaton::{self, Automaton};
|
||||||
|
use fst::{Streamer, IntoStreamer};
|
||||||
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
use grenad::{CompressionType, Reader, Writer, FileFuse};
|
||||||
use heed::types::{DecodeIgnore, Str};
|
use heed::types::{ByteSlice, DecodeIgnore, Str};
|
||||||
use heed::{BytesEncode, Error};
|
use heed::{BytesEncode, Error};
|
||||||
use log::debug;
|
use log::debug;
|
||||||
use roaring::RoaringBitmap;
|
use roaring::RoaringBitmap;
|
||||||
|
|
||||||
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
|
use crate::heed_codec::{StrLevelPositionCodec, CboRoaringBitmapCodec};
|
||||||
use crate::update::index_documents::WriteMethod;
|
use crate::update::index_documents::WriteMethod;
|
||||||
use crate::update::index_documents::{create_writer, writer_into_reader, write_into_lmdb_database};
|
use crate::update::index_documents::{
|
||||||
|
create_writer, create_sorter, writer_into_reader, write_into_lmdb_database,
|
||||||
|
word_prefix_level_positions_docids_merge, sorter_into_lmdb_database
|
||||||
|
};
|
||||||
use crate::{Index, TreeLevel};
|
use crate::{Index, TreeLevel};
|
||||||
|
|
||||||
pub struct WordsLevelPositions<'t, 'u, 'i> {
|
pub struct WordsLevelPositions<'t, 'u, 'i> {
|
||||||
@ -20,27 +25,24 @@ pub struct WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
pub(crate) chunk_compression_type: CompressionType,
|
pub(crate) chunk_compression_type: CompressionType,
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
pub(crate) chunk_compression_level: Option<u32>,
|
||||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
||||||
|
pub(crate) max_nb_chunks: Option<usize>,
|
||||||
|
pub(crate) max_memory: Option<usize>,
|
||||||
level_group_size: NonZeroU32,
|
level_group_size: NonZeroU32,
|
||||||
min_level_size: NonZeroU32,
|
min_level_size: NonZeroU32,
|
||||||
_update_id: u64,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
||||||
pub fn new(
|
pub fn new(wtxn: &'t mut heed::RwTxn<'i, 'u>, index: &'i Index) -> WordsLevelPositions<'t, 'u, 'i> {
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
update_id: u64,
|
|
||||||
) -> WordsLevelPositions<'t, 'u, 'i>
|
|
||||||
{
|
|
||||||
WordsLevelPositions {
|
WordsLevelPositions {
|
||||||
wtxn,
|
wtxn,
|
||||||
index,
|
index,
|
||||||
chunk_compression_type: CompressionType::None,
|
chunk_compression_type: CompressionType::None,
|
||||||
chunk_compression_level: None,
|
chunk_compression_level: None,
|
||||||
chunk_fusing_shrink_size: None,
|
chunk_fusing_shrink_size: None,
|
||||||
|
max_nb_chunks: None,
|
||||||
|
max_memory: None,
|
||||||
level_group_size: NonZeroU32::new(4).unwrap(),
|
level_group_size: NonZeroU32::new(4).unwrap(),
|
||||||
min_level_size: NonZeroU32::new(5).unwrap(),
|
min_level_size: NonZeroU32::new(5).unwrap(),
|
||||||
_update_id: update_id,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -76,7 +78,71 @@ impl<'t, 'u, 'i> WordsLevelPositions<'t, 'u, 'i> {
|
|||||||
self.wtxn,
|
self.wtxn,
|
||||||
*self.index.word_level_position_docids.as_polymorph(),
|
*self.index.word_level_position_docids.as_polymorph(),
|
||||||
entries,
|
entries,
|
||||||
|_, _| anyhow::bail!("invalid facet level merging"),
|
|_, _| anyhow::bail!("invalid word level position merging"),
|
||||||
|
WriteMethod::Append,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// We compute the word prefix level positions database.
|
||||||
|
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||||
|
|
||||||
|
let mut word_prefix_level_positions_docids_sorter = create_sorter(
|
||||||
|
word_prefix_level_positions_docids_merge,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.max_nb_chunks,
|
||||||
|
self.max_memory,
|
||||||
|
);
|
||||||
|
|
||||||
|
// We insert the word prefix level positions where the level is equal to 0 and
|
||||||
|
// corresponds to the word-prefix level positions where the prefixes appears
|
||||||
|
// in the prefix FST previously constructed.
|
||||||
|
let prefix_fst = self.index.words_prefixes_fst(self.wtxn)?;
|
||||||
|
let db = self.index.word_level_position_docids.remap_data_type::<ByteSlice>();
|
||||||
|
for result in db.iter(self.wtxn)? {
|
||||||
|
let ((word, level, left, right), data) = result?;
|
||||||
|
if level == TreeLevel::min_value() {
|
||||||
|
let automaton = automaton::Str::new(word).starts_with();
|
||||||
|
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
||||||
|
while let Some(prefix) = matching_prefixes.next() {
|
||||||
|
let prefix = str::from_utf8(prefix)?;
|
||||||
|
let key = (prefix, level, left, right);
|
||||||
|
let bytes = StrLevelPositionCodec::bytes_encode(&key).unwrap();
|
||||||
|
word_prefix_level_positions_docids_sorter.insert(bytes, data)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We finally write all the word prefix level positions docids with
|
||||||
|
// a level equal to 0 into the LMDB database.
|
||||||
|
sorter_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||||
|
word_prefix_level_positions_docids_sorter,
|
||||||
|
word_prefix_level_positions_docids_merge,
|
||||||
|
WriteMethod::Append,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
let entries = compute_positions_levels(
|
||||||
|
self.wtxn,
|
||||||
|
self.index.word_prefix_docids.remap_data_type::<DecodeIgnore>(),
|
||||||
|
self.index.word_prefix_level_position_docids,
|
||||||
|
self.chunk_compression_type,
|
||||||
|
self.chunk_compression_level,
|
||||||
|
self.chunk_fusing_shrink_size,
|
||||||
|
self.level_group_size,
|
||||||
|
self.min_level_size,
|
||||||
|
)?;
|
||||||
|
|
||||||
|
// The previously computed entries also defines the level 0 entries
|
||||||
|
// so we can clear the database and append all of these entries.
|
||||||
|
self.index.word_prefix_level_position_docids.clear(self.wtxn)?;
|
||||||
|
|
||||||
|
write_into_lmdb_database(
|
||||||
|
self.wtxn,
|
||||||
|
*self.index.word_prefix_level_position_docids.as_polymorph(),
|
||||||
|
entries,
|
||||||
|
|_, _| anyhow::bail!("invalid word prefix level position merging"),
|
||||||
WriteMethod::Append,
|
WriteMethod::Append,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
|
@ -1,196 +0,0 @@
|
|||||||
use std::iter::FromIterator;
|
|
||||||
use std::str;
|
|
||||||
|
|
||||||
use chrono::Utc;
|
|
||||||
use fst::automaton::Str;
|
|
||||||
use fst::{Automaton, Streamer, IntoStreamer};
|
|
||||||
use grenad::CompressionType;
|
|
||||||
use heed::BytesEncode;
|
|
||||||
use heed::types::ByteSlice;
|
|
||||||
|
|
||||||
use crate::heed_codec::StrStrU8Codec;
|
|
||||||
use crate::update::index_documents::WriteMethod;
|
|
||||||
use crate::update::index_documents::{create_sorter, sorter_into_lmdb_database};
|
|
||||||
use crate::update::index_documents::{word_docids_merge, words_pairs_proximities_docids_merge};
|
|
||||||
use crate::{Index, SmallString32};
|
|
||||||
|
|
||||||
pub struct WordsPrefixes<'t, 'u, 'i> {
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
pub(crate) chunk_compression_type: CompressionType,
|
|
||||||
pub(crate) chunk_compression_level: Option<u32>,
|
|
||||||
pub(crate) chunk_fusing_shrink_size: Option<u64>,
|
|
||||||
pub(crate) max_nb_chunks: Option<usize>,
|
|
||||||
pub(crate) max_memory: Option<usize>,
|
|
||||||
threshold: f64,
|
|
||||||
max_prefix_length: usize,
|
|
||||||
_update_id: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl<'t, 'u, 'i> WordsPrefixes<'t, 'u, 'i> {
|
|
||||||
pub fn new(
|
|
||||||
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
|
||||||
index: &'i Index,
|
|
||||||
update_id: u64,
|
|
||||||
) -> WordsPrefixes<'t, 'u, 'i>
|
|
||||||
{
|
|
||||||
WordsPrefixes {
|
|
||||||
wtxn,
|
|
||||||
index,
|
|
||||||
chunk_compression_type: CompressionType::None,
|
|
||||||
chunk_compression_level: None,
|
|
||||||
chunk_fusing_shrink_size: None,
|
|
||||||
max_nb_chunks: None,
|
|
||||||
max_memory: None,
|
|
||||||
threshold: 0.1 / 100.0, // .01%
|
|
||||||
max_prefix_length: 4,
|
|
||||||
_update_id: update_id,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
|
|
||||||
/// database. If a word prefix is supposed to match more than this number of words in the
|
|
||||||
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
|
||||||
///
|
|
||||||
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
|
|
||||||
/// to these bounds otherwise.
|
|
||||||
pub fn threshold(&mut self, value: f64) -> &mut Self {
|
|
||||||
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Set the maximum length of prefixes in bytes.
|
|
||||||
///
|
|
||||||
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
|
|
||||||
/// to these bounds, otherwise.
|
|
||||||
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
|
||||||
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
|
|
||||||
self
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn execute(self) -> anyhow::Result<()> {
|
|
||||||
self.index.set_updated_at(self.wtxn, &Utc::now())?;
|
|
||||||
// Clear the words prefixes datastructures.
|
|
||||||
self.index.word_prefix_docids.clear(self.wtxn)?;
|
|
||||||
self.index.word_prefix_pair_proximity_docids.clear(self.wtxn)?;
|
|
||||||
|
|
||||||
let words_fst = self.index.words_fst(&self.wtxn)?;
|
|
||||||
let number_of_words = words_fst.len();
|
|
||||||
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
|
|
||||||
|
|
||||||
// It is forbidden to keep a mutable reference into the database
|
|
||||||
// and write into it at the same time, therefore we write into another file.
|
|
||||||
let mut prefix_docids_sorter = create_sorter(
|
|
||||||
word_docids_merge,
|
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
self.chunk_fusing_shrink_size,
|
|
||||||
self.max_nb_chunks,
|
|
||||||
self.max_memory,
|
|
||||||
);
|
|
||||||
|
|
||||||
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
|
||||||
for n in 1..=self.max_prefix_length {
|
|
||||||
|
|
||||||
let mut current_prefix = SmallString32::new();
|
|
||||||
let mut current_prefix_count = 0;
|
|
||||||
let mut builder = fst::SetBuilder::memory();
|
|
||||||
|
|
||||||
let mut stream = words_fst.stream();
|
|
||||||
while let Some(bytes) = stream.next() {
|
|
||||||
// We try to get the first n bytes out of this string but we only want
|
|
||||||
// to split at valid characters bounds. If we try to split in the middle of
|
|
||||||
// a character we ignore this word and go to the next one.
|
|
||||||
let word = str::from_utf8(bytes)?;
|
|
||||||
let prefix = match word.get(..n) {
|
|
||||||
Some(prefix) => prefix,
|
|
||||||
None => continue,
|
|
||||||
};
|
|
||||||
|
|
||||||
// This is the first iteration of the loop,
|
|
||||||
// or the current word doesn't starts with the current prefix.
|
|
||||||
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
|
|
||||||
current_prefix = SmallString32::from(prefix);
|
|
||||||
current_prefix_count = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
current_prefix_count += 1;
|
|
||||||
|
|
||||||
// There is enough words corresponding to this prefix to add it to the cache.
|
|
||||||
if current_prefix_count == min_number_of_words {
|
|
||||||
builder.insert(prefix)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We construct the final set for prefixes of size n.
|
|
||||||
prefix_fsts.push(builder.into_set());
|
|
||||||
}
|
|
||||||
|
|
||||||
// We merge all of the previously computed prefixes into on final set.
|
|
||||||
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
|
||||||
let mut builder = fst::SetBuilder::memory();
|
|
||||||
builder.extend_stream(op.r#union())?;
|
|
||||||
let prefix_fst = builder.into_set();
|
|
||||||
|
|
||||||
// We iterate over all the prefixes and retrieve the corresponding docids.
|
|
||||||
let mut prefix_stream = prefix_fst.stream();
|
|
||||||
while let Some(bytes) = prefix_stream.next() {
|
|
||||||
let prefix = str::from_utf8(bytes)?;
|
|
||||||
let db = self.index.word_docids.remap_data_type::<ByteSlice>();
|
|
||||||
for result in db.prefix_iter(self.wtxn, prefix)? {
|
|
||||||
let (_word, data) = result?;
|
|
||||||
prefix_docids_sorter.insert(prefix, data)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the words prefixes FST in the dtabase.
|
|
||||||
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
|
|
||||||
|
|
||||||
// We finally write the word prefix docids into the LMDB database.
|
|
||||||
sorter_into_lmdb_database(
|
|
||||||
self.wtxn,
|
|
||||||
*self.index.word_prefix_docids.as_polymorph(),
|
|
||||||
prefix_docids_sorter,
|
|
||||||
word_docids_merge,
|
|
||||||
WriteMethod::Append,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
// We compute the word prefix pair proximity database.
|
|
||||||
|
|
||||||
// Here we create a sorter akin to the previous one.
|
|
||||||
let mut word_prefix_pair_proximity_docids_sorter = create_sorter(
|
|
||||||
words_pairs_proximities_docids_merge,
|
|
||||||
self.chunk_compression_type,
|
|
||||||
self.chunk_compression_level,
|
|
||||||
self.chunk_fusing_shrink_size,
|
|
||||||
self.max_nb_chunks,
|
|
||||||
self.max_memory,
|
|
||||||
);
|
|
||||||
|
|
||||||
// We insert all the word pairs corresponding to the word-prefix pairs
|
|
||||||
// where the prefixes appears in the prefix FST previously constructed.
|
|
||||||
let db = self.index.word_pair_proximity_docids.remap_data_type::<ByteSlice>();
|
|
||||||
for result in db.iter(self.wtxn)? {
|
|
||||||
let ((word1, word2, prox), data) = result?;
|
|
||||||
let automaton = Str::new(word2).starts_with();
|
|
||||||
let mut matching_prefixes = prefix_fst.search(automaton).into_stream();
|
|
||||||
while let Some(prefix) = matching_prefixes.next() {
|
|
||||||
let prefix = str::from_utf8(prefix)?;
|
|
||||||
let pair = (word1, prefix, prox);
|
|
||||||
let bytes = StrStrU8Codec::bytes_encode(&pair).unwrap();
|
|
||||||
word_prefix_pair_proximity_docids_sorter.insert(bytes, data)?;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// We finally write the word prefix pair proximity docids into the LMDB database.
|
|
||||||
sorter_into_lmdb_database(
|
|
||||||
self.wtxn,
|
|
||||||
*self.index.word_prefix_pair_proximity_docids.as_polymorph(),
|
|
||||||
word_prefix_pair_proximity_docids_sorter,
|
|
||||||
words_pairs_proximities_docids_merge,
|
|
||||||
WriteMethod::Append,
|
|
||||||
)?;
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
104
milli/src/update/words_prefixes_fst.rs
Normal file
104
milli/src/update/words_prefixes_fst.rs
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
use std::iter::FromIterator;
|
||||||
|
use std::str;
|
||||||
|
|
||||||
|
use fst::Streamer;
|
||||||
|
use crate::{Index, SmallString32};
|
||||||
|
|
||||||
|
pub struct WordsPrefixesFst<'t, 'u, 'i> {
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
threshold: f64,
|
||||||
|
max_prefix_length: usize,
|
||||||
|
_update_id: u64,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'t, 'u, 'i> WordsPrefixesFst<'t, 'u, 'i> {
|
||||||
|
pub fn new(
|
||||||
|
wtxn: &'t mut heed::RwTxn<'i, 'u>,
|
||||||
|
index: &'i Index,
|
||||||
|
update_id: u64,
|
||||||
|
) -> WordsPrefixesFst<'t, 'u, 'i>
|
||||||
|
{
|
||||||
|
WordsPrefixesFst {
|
||||||
|
wtxn,
|
||||||
|
index,
|
||||||
|
threshold: 0.1 / 100.0, // .01%
|
||||||
|
max_prefix_length: 4,
|
||||||
|
_update_id: update_id,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the ratio of concerned words required to make a prefix be part of the words prefixes
|
||||||
|
/// database. If a word prefix is supposed to match more than this number of words in the
|
||||||
|
/// dictionnary, therefore this prefix is added to the words prefixes datastructures.
|
||||||
|
///
|
||||||
|
/// Default value is `0.01` or `1%`. This value must be between 0 and 1 and will be clamped
|
||||||
|
/// to these bounds otherwise.
|
||||||
|
pub fn threshold(&mut self, value: f64) -> &mut Self {
|
||||||
|
self.threshold = value.min(1.0).max(0.0); // clamp [0, 1]
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Set the maximum length of prefixes in bytes.
|
||||||
|
///
|
||||||
|
/// Default value is `4` bytes. This value must be between 1 and 25 will be clamped
|
||||||
|
/// to these bounds, otherwise.
|
||||||
|
pub fn max_prefix_length(&mut self, value: usize) -> &mut Self {
|
||||||
|
self.max_prefix_length = value.min(25).max(1); // clamp [1, 25]
|
||||||
|
self
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn execute(self) -> anyhow::Result<()> {
|
||||||
|
let words_fst = self.index.words_fst(&self.wtxn)?;
|
||||||
|
let number_of_words = words_fst.len();
|
||||||
|
let min_number_of_words = (number_of_words as f64 * self.threshold) as usize;
|
||||||
|
|
||||||
|
let mut prefix_fsts = Vec::with_capacity(self.max_prefix_length);
|
||||||
|
for n in 1..=self.max_prefix_length {
|
||||||
|
|
||||||
|
let mut current_prefix = SmallString32::new();
|
||||||
|
let mut current_prefix_count = 0;
|
||||||
|
let mut builder = fst::SetBuilder::memory();
|
||||||
|
|
||||||
|
let mut stream = words_fst.stream();
|
||||||
|
while let Some(bytes) = stream.next() {
|
||||||
|
// We try to get the first n bytes out of this string but we only want
|
||||||
|
// to split at valid characters bounds. If we try to split in the middle of
|
||||||
|
// a character we ignore this word and go to the next one.
|
||||||
|
let word = str::from_utf8(bytes)?;
|
||||||
|
let prefix = match word.get(..n) {
|
||||||
|
Some(prefix) => prefix,
|
||||||
|
None => continue,
|
||||||
|
};
|
||||||
|
|
||||||
|
// This is the first iteration of the loop,
|
||||||
|
// or the current word doesn't starts with the current prefix.
|
||||||
|
if current_prefix_count == 0 || prefix != current_prefix.as_str() {
|
||||||
|
current_prefix = SmallString32::from(prefix);
|
||||||
|
current_prefix_count = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
current_prefix_count += 1;
|
||||||
|
|
||||||
|
// There is enough words corresponding to this prefix to add it to the cache.
|
||||||
|
if current_prefix_count == min_number_of_words {
|
||||||
|
builder.insert(prefix)?;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// We construct the final set for prefixes of size n.
|
||||||
|
prefix_fsts.push(builder.into_set());
|
||||||
|
}
|
||||||
|
|
||||||
|
// We merge all of the previously computed prefixes into on final set.
|
||||||
|
let op = fst::set::OpBuilder::from_iter(prefix_fsts.iter());
|
||||||
|
let mut builder = fst::SetBuilder::memory();
|
||||||
|
builder.extend_stream(op.r#union())?;
|
||||||
|
let prefix_fst = builder.into_set();
|
||||||
|
|
||||||
|
// Set the words prefixes FST in the dtabase.
|
||||||
|
self.index.put_words_prefixes_fst(self.wtxn, &prefix_fst)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue
Block a user