mirror of
https://github.com/meilisearch/meilisearch.git
synced 2024-11-25 03:25:06 +08:00
Use the word pair proximity and fid word count docids extractors
Co-authored-by: ManyTheFish <many@meilisearch.com>
This commit is contained in:
parent
0fc02f7351
commit
73ce67862d
@ -112,23 +112,27 @@ pub struct WriterOperation {
|
||||
}
|
||||
|
||||
pub enum Database {
|
||||
WordDocids,
|
||||
ExactWordDocids,
|
||||
WordFidDocids,
|
||||
WordPositionDocids,
|
||||
Documents,
|
||||
ExactWordDocids,
|
||||
FidWordCountDocids,
|
||||
Main,
|
||||
WordDocids,
|
||||
WordFidDocids,
|
||||
WordPairProximityDocids,
|
||||
WordPositionDocids,
|
||||
}
|
||||
|
||||
impl WriterOperation {
|
||||
pub fn database(&self, index: &Index) -> heed::Database<Bytes, Bytes> {
|
||||
match self.database {
|
||||
Database::Main => index.main.remap_types(),
|
||||
Database::Documents => index.documents.remap_types(),
|
||||
Database::WordDocids => index.word_docids.remap_types(),
|
||||
Database::ExactWordDocids => index.exact_word_docids.remap_types(),
|
||||
Database::Main => index.main.remap_types(),
|
||||
Database::WordDocids => index.word_docids.remap_types(),
|
||||
Database::WordFidDocids => index.word_fid_docids.remap_types(),
|
||||
Database::WordPositionDocids => index.word_position_docids.remap_types(),
|
||||
Database::FidWordCountDocids => index.field_id_word_count_docids.remap_types(),
|
||||
Database::WordPairProximityDocids => index.word_pair_proximity_docids.remap_types(),
|
||||
}
|
||||
}
|
||||
|
||||
@ -198,9 +202,11 @@ impl MainSender<'_> {
|
||||
}
|
||||
}
|
||||
|
||||
pub enum WordDocids {}
|
||||
pub enum ExactWordDocids {}
|
||||
pub enum FidWordCountDocids {}
|
||||
pub enum WordDocids {}
|
||||
pub enum WordFidDocids {}
|
||||
pub enum WordPairProximityDocids {}
|
||||
pub enum WordPositionDocids {}
|
||||
|
||||
pub trait DatabaseType {
|
||||
@ -209,14 +215,6 @@ pub trait DatabaseType {
|
||||
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation;
|
||||
}
|
||||
|
||||
impl DatabaseType for WordDocids {
|
||||
const DATABASE: Database = Database::WordDocids;
|
||||
|
||||
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
|
||||
MergerOperation::WordDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for ExactWordDocids {
|
||||
const DATABASE: Database = Database::ExactWordDocids;
|
||||
|
||||
@ -225,6 +223,22 @@ impl DatabaseType for ExactWordDocids {
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for FidWordCountDocids {
|
||||
const DATABASE: Database = Database::FidWordCountDocids;
|
||||
|
||||
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
|
||||
MergerOperation::FidWordCountDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordDocids {
|
||||
const DATABASE: Database = Database::WordDocids;
|
||||
|
||||
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
|
||||
MergerOperation::WordDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordFidDocids {
|
||||
const DATABASE: Database = Database::WordFidDocids;
|
||||
|
||||
@ -233,6 +247,14 @@ impl DatabaseType for WordFidDocids {
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPairProximityDocids {
|
||||
const DATABASE: Database = Database::WordPairProximityDocids;
|
||||
|
||||
fn new_merger_operation(merger: Merger<File, MergeDeladdCboRoaringBitmaps>) -> MergerOperation {
|
||||
MergerOperation::WordPairProximityDocidsMerger(merger)
|
||||
}
|
||||
}
|
||||
|
||||
impl DatabaseType for WordPositionDocids {
|
||||
const DATABASE: Database = Database::WordPositionDocids;
|
||||
|
||||
@ -293,12 +315,14 @@ impl DocumentsSender<'_> {
|
||||
}
|
||||
|
||||
pub enum MergerOperation {
|
||||
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
ExactWordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
FidWordCountDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
WordDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
WordFidDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
WordPairProximityDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
WordPositionDocidsMerger(Merger<File, MergeDeladdCboRoaringBitmaps>),
|
||||
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
|
||||
DeleteDocument { docid: DocumentId },
|
||||
InsertDocument { docid: DocumentId, document: Box<KvReaderFieldId> },
|
||||
}
|
||||
|
||||
pub struct MergerReceiver(Receiver<MergerOperation>);
|
||||
|
@ -1,5 +1,6 @@
|
||||
use serde_json::Value;
|
||||
|
||||
use crate::update::new::extract::perm_json_p;
|
||||
use crate::update::new::KvReaderFieldId;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, InternalError, Result, UserError};
|
||||
|
||||
|
@ -2,6 +2,7 @@ use std::collections::HashSet;
|
||||
use std::fmt::Debug;
|
||||
use std::fs::File;
|
||||
|
||||
pub use extract_facets::*;
|
||||
use grenad::{MergeFunction, Merger};
|
||||
use heed::RoTxn;
|
||||
use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
|
@ -2,16 +2,8 @@ mod cache;
|
||||
mod faceted;
|
||||
mod searchable;
|
||||
|
||||
pub use faceted::modname::{
|
||||
FieldIdFacetExistsDocidsExtractor, FieldIdFacetIsEmptyDocidsExtractor,
|
||||
FieldIdFacetIsNullDocidsExtractor, FieldIdFacetNumberDocidsExtractor,
|
||||
FieldIdFacetStringDocidsExtractor,
|
||||
};
|
||||
pub use faceted::FacetedExtractor;
|
||||
pub use searchable::{
|
||||
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
||||
WordPositionDocidsExtractor,
|
||||
};
|
||||
pub use faceted::*;
|
||||
pub use searchable::*;
|
||||
|
||||
/// TODO move in permissive json pointer
|
||||
pub mod perm_json_p {
|
||||
|
@ -1,15 +1,14 @@
|
||||
use std::{borrow::Cow, collections::HashMap};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use heed::RoTxn;
|
||||
|
||||
use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
|
||||
use crate::{
|
||||
update::{
|
||||
new::{extract::cache::CboCachedSorter, DocumentChange},
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
},
|
||||
FieldId, GlobalFieldsIdsMap, Index, Result,
|
||||
};
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::update::new::extract::cache::CboCachedSorter;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
const MAX_COUNTED_WORDS: usize = 30;
|
||||
|
||||
@ -22,12 +21,13 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
/// This case is unreachable because extract_document_change has been reimplemented to not call this function.
|
||||
fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
|
||||
/// TODO remove this
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
@ -45,7 +45,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
match document_change {
|
||||
DocumentChange::Deletion(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
|
||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||
Ok(())
|
||||
};
|
||||
@ -66,10 +66,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
}
|
||||
DocumentChange::Update(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
|
||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count
|
||||
.entry(fid)
|
||||
.and_modify(|(current_count, new_count)| *current_count += 1)
|
||||
.and_modify(|(current_count, _new_count)| *current_count += 1)
|
||||
.or_insert((1, 0));
|
||||
Ok(())
|
||||
};
|
||||
@ -79,10 +79,10 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
&mut token_fn,
|
||||
)?;
|
||||
|
||||
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
|
||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count
|
||||
.entry(fid)
|
||||
.and_modify(|(current_count, new_count)| *new_count += 1)
|
||||
.and_modify(|(_current_count, new_count)| *new_count += 1)
|
||||
.or_insert((0, 1));
|
||||
Ok(())
|
||||
};
|
||||
@ -106,7 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor {
|
||||
}
|
||||
DocumentChange::Insertion(inner) => {
|
||||
let mut fid_word_count = HashMap::new();
|
||||
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
|
||||
let mut token_fn = |fid: FieldId, _pos: u16, _word: &str| {
|
||||
fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
|
||||
Ok(())
|
||||
};
|
||||
|
@ -20,7 +20,7 @@ impl SearchableExtractor for WordDocidsExtractor {
|
||||
}
|
||||
|
||||
/// TODO write in an external Vec buffer
|
||||
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
Cow::Borrowed(word.as_bytes())
|
||||
}
|
||||
}
|
||||
@ -49,7 +49,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key<'a>(_field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(_field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
Cow::Borrowed(word.as_bytes())
|
||||
}
|
||||
}
|
||||
@ -67,7 +67,7 @@ impl SearchableExtractor for WordFidDocidsExtractor {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key<'a>(field_id: FieldId, _position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(field_id: FieldId, _position: u16, word: &str) -> Cow<[u8]> {
|
||||
let mut key = Vec::new();
|
||||
key.extend_from_slice(word.as_bytes());
|
||||
key.push(0);
|
||||
@ -89,7 +89,7 @@ impl SearchableExtractor for WordPositionDocidsExtractor {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
fn build_key<'a>(_field_id: FieldId, position: u16, word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(_field_id: FieldId, position: u16, word: &str) -> Cow<[u8]> {
|
||||
// position must be bucketed to reduce the number of keys in the DB.
|
||||
let position = bucketed_position(position);
|
||||
let mut key = Vec::new();
|
||||
|
@ -1,21 +1,17 @@
|
||||
use std::{
|
||||
borrow::Cow,
|
||||
collections::{BTreeMap, VecDeque},
|
||||
};
|
||||
use std::borrow::Cow;
|
||||
use std::collections::{BTreeMap, VecDeque};
|
||||
|
||||
use heed::RoTxn;
|
||||
use itertools::merge_join_by;
|
||||
use obkv::KvReader;
|
||||
|
||||
use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
|
||||
use crate::{
|
||||
proximity::{index_proximity, MAX_DISTANCE},
|
||||
update::{
|
||||
new::{extract::cache::CboCachedSorter, DocumentChange},
|
||||
MergeDeladdCboRoaringBitmaps,
|
||||
},
|
||||
FieldId, GlobalFieldsIdsMap, Index, Result,
|
||||
};
|
||||
use super::tokenize_document::DocumentTokenizer;
|
||||
use super::SearchableExtractor;
|
||||
use crate::proximity::{index_proximity, MAX_DISTANCE};
|
||||
use crate::update::new::extract::cache::CboCachedSorter;
|
||||
use crate::update::new::DocumentChange;
|
||||
use crate::update::MergeDeladdCboRoaringBitmaps;
|
||||
use crate::{FieldId, GlobalFieldsIdsMap, Index, Result};
|
||||
|
||||
pub struct WordPairProximityDocidsExtractor;
|
||||
impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
@ -26,12 +22,13 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor {
|
||||
index.user_defined_searchable_fields(rtxn).map_err(Into::into)
|
||||
}
|
||||
|
||||
fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
fn attributes_to_skip<'a>(_rtxn: &'a RoTxn, _index: &'a Index) -> Result<Vec<&'a str>> {
|
||||
Ok(vec![])
|
||||
}
|
||||
|
||||
/// This case is unreachable because extract_document_change has been reimplemented to not call this function.
|
||||
fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
|
||||
fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> {
|
||||
/// TODO remove this
|
||||
unreachable!()
|
||||
}
|
||||
|
||||
@ -159,7 +156,7 @@ fn process_document_tokens(
|
||||
word_positions: &mut VecDeque<(String, u16)>,
|
||||
word_pair_proximity: &mut BTreeMap<(String, String), u8>,
|
||||
) -> Result<()> {
|
||||
let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
|
||||
let mut token_fn = |_fid: FieldId, pos: u16, word: &str| {
|
||||
// drain the proximity window until the head word is considered close to the word we are inserting.
|
||||
while word_positions
|
||||
.front()
|
||||
|
@ -11,15 +11,9 @@ use rayon::iter::{IntoParallelIterator, ParallelIterator};
|
||||
use rayon::ThreadPool;
|
||||
pub use update_by_function::UpdateByFunction;
|
||||
|
||||
use super::channel::{
|
||||
extractors_merger_channels, merger_writer_channel, EntryOperation, ExactWordDocids, WordDocids,
|
||||
WordFidDocids, WordPositionDocids,
|
||||
};
|
||||
use super::channel::*;
|
||||
use super::document_change::DocumentChange;
|
||||
use super::extract::{
|
||||
ExactWordDocidsExtractor, SearchableExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
|
||||
WordPositionDocidsExtractor,
|
||||
};
|
||||
use super::extract::*;
|
||||
use super::merger::merge_grenad_entries;
|
||||
use super::StdResult;
|
||||
use crate::documents::{
|
||||
@ -127,6 +121,25 @@ where
|
||||
&extractor_sender,
|
||||
)?;
|
||||
|
||||
extract_and_send_docids::<FidWordCountDocidsExtractor, FidWordCountDocids>(
|
||||
index,
|
||||
&global_fields_ids_map,
|
||||
GrenadParameters::default(),
|
||||
document_changes.clone(),
|
||||
&extractor_sender,
|
||||
)?;
|
||||
|
||||
extract_and_send_docids::<
|
||||
WordPairProximityDocidsExtractor,
|
||||
WordPairProximityDocids,
|
||||
>(
|
||||
index,
|
||||
&global_fields_ids_map,
|
||||
GrenadParameters::default(),
|
||||
document_changes.clone(),
|
||||
&extractor_sender,
|
||||
)?;
|
||||
|
||||
// TODO THIS IS TOO MUCH
|
||||
// Extract fieldid docid facet number
|
||||
// Extract fieldid docid facet string
|
||||
|
@ -8,10 +8,7 @@ use memmap2::Mmap;
|
||||
use roaring::RoaringBitmap;
|
||||
use tempfile::tempfile;
|
||||
|
||||
use super::channel::{
|
||||
DatabaseType, DocidsSender, ExactWordDocids, MergerReceiver, MergerSender, WordDocids,
|
||||
WordFidDocids, WordPositionDocids,
|
||||
};
|
||||
use super::channel::*;
|
||||
use super::KvReaderDelAdd;
|
||||
use crate::update::del_add::DelAdd;
|
||||
use crate::update::new::channel::MergerOperation;
|
||||
@ -30,6 +27,29 @@ pub fn merge_grenad_entries(
|
||||
|
||||
for merger_operation in receiver {
|
||||
match merger_operation {
|
||||
MergerOperation::ExactWordDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
/// TODO do a MergerOperation::database(&Index) -> Database<Bytes, Bytes>.
|
||||
index.exact_word_docids.remap_types(),
|
||||
rtxn,
|
||||
&mut buffer,
|
||||
sender.docids::<ExactWordDocids>(),
|
||||
|_key| Ok(()),
|
||||
|_key| Ok(()),
|
||||
)?;
|
||||
}
|
||||
MergerOperation::FidWordCountDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
index.field_id_word_count_docids.remap_types(),
|
||||
rtxn,
|
||||
&mut buffer,
|
||||
sender.docids::<FidWordCountDocids>(),
|
||||
|_key| Ok(()),
|
||||
|_key| Ok(()),
|
||||
)?;
|
||||
}
|
||||
MergerOperation::WordDocidsMerger(merger) => {
|
||||
let mut add_words_fst = SetBuilder::new(tempfile()?)?;
|
||||
let mut del_words_fst = SetBuilder::new(tempfile()?)?;
|
||||
@ -49,17 +69,6 @@ pub fn merge_grenad_entries(
|
||||
let mmap = compute_new_words_fst(add_words_fst, del_words_fst, words_fst)?;
|
||||
sender.main().write_words_fst(mmap).unwrap();
|
||||
}
|
||||
MergerOperation::ExactWordDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
index.exact_word_docids.remap_types(),
|
||||
rtxn,
|
||||
&mut buffer,
|
||||
sender.docids::<ExactWordDocids>(),
|
||||
|_key| Ok(()),
|
||||
|_key| Ok(()),
|
||||
)?;
|
||||
}
|
||||
MergerOperation::WordFidDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
@ -71,6 +80,17 @@ pub fn merge_grenad_entries(
|
||||
|_key| Ok(()),
|
||||
)?;
|
||||
}
|
||||
MergerOperation::WordPairProximityDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
index.word_pair_proximity_docids.remap_types(),
|
||||
rtxn,
|
||||
&mut buffer,
|
||||
sender.docids::<WordPairProximityDocids>(),
|
||||
|_key| Ok(()),
|
||||
|_key| Ok(()),
|
||||
)?;
|
||||
}
|
||||
MergerOperation::WordPositionDocidsMerger(merger) => {
|
||||
merge_and_send_docids(
|
||||
merger,
|
||||
|
Loading…
Reference in New Issue
Block a user