diff --git a/milli/src/update/new/extract/faceted/mod.rs b/milli/src/update/new/extract/faceted/mod.rs index 62aa7adb2..b4d6b4131 100644 --- a/milli/src/update/new/extract/faceted/mod.rs +++ b/milli/src/update/new/extract/faceted/mod.rs @@ -87,11 +87,11 @@ pub trait FacetedExtractor { where MF: MergeFunction, MF::Error: Debug, + grenad::Error: Into, { buffer.clear(); match Self::build_key(fid, value, buffer) { - // TODO manage errors - Some(key) => Ok(cache_fn(cached_sorter, &key, docid).unwrap()), + Some(key) => cache_fn(cached_sorter, &key, docid).map_err(Into::into), None => Ok(()), } } diff --git a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs index 7cb11c11b..4d90b46d4 100644 --- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::HashMap; use heed::RoTxn; @@ -25,12 +24,6 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { Ok(vec![]) } - /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { - /// TODO remove this - unreachable!() - } - // This method is reimplemented to count the number of words in the document in each field // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. fn extract_document_change( @@ -59,8 +52,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { for (fid, count) in fid_word_count.iter() { if *count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_del_u32(key, inner.docid())?; } } } @@ -93,13 +85,11 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { if *current_count != *new_count { if *current_count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *current_count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_del_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_del_u32(key, inner.docid())?; } if *new_count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *new_count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_add_u32(key, inner.docid())?; } } } @@ -116,8 +106,7 @@ impl SearchableExtractor for FidWordCountDocidsExtractor { for (fid, count) in fid_word_count.iter() { if *count <= MAX_COUNTED_WORDS { let key = build_key(*fid, *count as u8, &mut key_buffer); - /// TODO manage the error - cached_sorter.insert_add_u32(key, inner.docid()).unwrap(); + cached_sorter.insert_add_u32(key, inner.docid())?; } } } diff --git a/milli/src/update/new/extract/searchable/extract_word_docids.rs b/milli/src/update/new/extract/searchable/extract_word_docids.rs index db8bb7993..0cf36cf00 100644 --- a/milli/src/update/new/extract/searchable/extract_word_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_docids.rs @@ -2,11 +2,93 @@ use std::borrow::Cow; use heed::RoTxn; -use super::SearchableExtractor; -use crate::{bucketed_position, FieldId, Index, Result}; +use super::{tokenize_document::DocumentTokenizer, SearchableExtractor}; +use crate::{ + bucketed_position, + update::{ + new::{extract::cache::CboCachedSorter, DocumentChange}, + MergeDeladdCboRoaringBitmaps, + }, + FieldId, GlobalFieldsIdsMap, Index, Result, +}; + +trait ProtoWordDocidsExtractor { + fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; + fn attributes_to_extract<'a>( + _rtxn: &'a RoTxn, + _index: &'a Index, + ) -> Result>>; + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; +} + +impl SearchableExtractor for T +where + T: ProtoWordDocidsExtractor, +{ + fn extract_document_change( + rtxn: &RoTxn, + index: &Index, + document_tokenizer: &DocumentTokenizer, + fields_ids_map: &mut GlobalFieldsIdsMap, + cached_sorter: &mut CboCachedSorter, + document_change: DocumentChange, + ) -> Result<()> { + match document_change { + DocumentChange::Deletion(inner) => { + let mut token_fn = |fid, pos: u16, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + } + DocumentChange::Update(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_del_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document( + inner.current(rtxn, index)?.unwrap(), + fields_ids_map, + &mut token_fn, + )?; + + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + DocumentChange::Insertion(inner) => { + let mut token_fn = |fid, pos, word: &str| { + let key = Self::build_key(fid, pos, word); + cached_sorter.insert_add_u32(&key, inner.docid()).map_err(crate::Error::from) + }; + document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; + } + } + + Ok(()) + } + + fn attributes_to_extract<'a>( + rtxn: &'a RoTxn, + index: &'a Index, + ) -> Result>> { + Self::attributes_to_extract(rtxn, index) + } + + fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result> { + Self::attributes_to_skip(rtxn, index) + } +} pub struct WordDocidsExtractor; -impl SearchableExtractor for WordDocidsExtractor { +impl ProtoWordDocidsExtractor for WordDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -26,7 +108,7 @@ impl SearchableExtractor for WordDocidsExtractor { } pub struct ExactWordDocidsExtractor; -impl SearchableExtractor for ExactWordDocidsExtractor { +impl ProtoWordDocidsExtractor for ExactWordDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -55,7 +137,7 @@ impl SearchableExtractor for ExactWordDocidsExtractor { } pub struct WordFidDocidsExtractor; -impl SearchableExtractor for WordFidDocidsExtractor { +impl ProtoWordDocidsExtractor for WordFidDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, @@ -77,7 +159,7 @@ impl SearchableExtractor for WordFidDocidsExtractor { } pub struct WordPositionDocidsExtractor; -impl SearchableExtractor for WordPositionDocidsExtractor { +impl ProtoWordDocidsExtractor for WordPositionDocidsExtractor { fn attributes_to_extract<'a>( rtxn: &'a RoTxn, index: &'a Index, diff --git a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs index e9de6e9f2..dbd08901b 100644 --- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs +++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs @@ -1,4 +1,3 @@ -use std::borrow::Cow; use std::collections::{BTreeMap, VecDeque}; use heed::RoTxn; @@ -26,12 +25,6 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { Ok(vec![]) } - /// This case is unreachable because extract_document_change has been reimplemented to not call this function. - fn build_key(_field_id: FieldId, _position: u16, _word: &str) -> Cow<[u8]> { - /// TODO remove this - unreachable!() - } - // This method is reimplemented to count the number of words in the document in each field // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS. fn extract_document_change( @@ -100,18 +93,18 @@ impl SearchableExtractor for WordPairProximityDocidsExtractor { match eob { Left(((w1, w2), prox)) => { let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid).unwrap(); + cached_sorter.insert_del_u32(key, docid)?; } Right(((w1, w2), prox)) => { let key = build_key(*prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid).unwrap(); + cached_sorter.insert_add_u32(key, docid)?; } Both(((w1, w2), del_prox), (_, add_prox)) => { if del_prox != add_prox { let key = build_key(*del_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_del_u32(key, docid).unwrap(); + cached_sorter.insert_del_u32(key, docid)?; let key = build_key(*add_prox, w1, w2, &mut key_buffer); - cached_sorter.insert_add_u32(key, docid).unwrap(); + cached_sorter.insert_add_u32(key, docid)?; } } }; diff --git a/milli/src/update/new/extract/searchable/mod.rs b/milli/src/update/new/extract/searchable/mod.rs index a7498d0d9..c3ac30b17 100644 --- a/milli/src/update/new/extract/searchable/mod.rs +++ b/milli/src/update/new/extract/searchable/mod.rs @@ -3,7 +3,6 @@ mod extract_word_docids; mod extract_word_pair_proximity_docids; mod tokenize_document; -use std::borrow::Cow; use std::fs::File; pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor; @@ -20,7 +19,7 @@ use tokenize_document::{tokenizer_builder, DocumentTokenizer}; use super::cache::CboCachedSorter; use crate::update::new::{DocumentChange, ItemsPool}; use crate::update::{create_sorter, GrenadParameters, MergeDeladdCboRoaringBitmaps}; -use crate::{FieldId, GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; +use crate::{GlobalFieldsIdsMap, Index, Result, MAX_POSITION_PER_ATTRIBUTE}; pub trait SearchableExtractor { fn run_extraction( @@ -109,60 +108,10 @@ pub trait SearchableExtractor { fields_ids_map: &mut GlobalFieldsIdsMap, cached_sorter: &mut CboCachedSorter, document_change: DocumentChange, - ) -> Result<()> { - match document_change { - DocumentChange::Deletion(inner) => { - let mut token_fn = |fid, pos: u16, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - } - DocumentChange::Update(inner) => { - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_del_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document( - inner.current(rtxn, index)?.unwrap(), - fields_ids_map, - &mut token_fn, - )?; - - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - DocumentChange::Insertion(inner) => { - let mut token_fn = |fid, pos, word: &str| { - let key = Self::build_key(fid, pos, word); - /// TODO manage the error - cached_sorter.insert_add_u32(&key, inner.docid()).unwrap(); - Ok(()) - }; - document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?; - } - } - - Ok(()) - } + ) -> Result<()>; fn attributes_to_extract<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>>; fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result>; - - fn build_key(field_id: FieldId, position: u16, word: &str) -> Cow<'_, [u8]>; }