Implement word count and word pair proximity extractors

2024-11-25 19:45:05 +08:00 · 2024-09-05 10:30:39 +02:00 · 2024-09-05 10:30:39 +02:00 · 34f11e3380
commit 34f11e3380
parent 27308eaab1
4 changed files with 331 additions and 7 deletions
--- a/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_fid_word_count_docids.rs
@ -0,0 +1,135 @@
 use std::{borrow::Cow, collections::HashMap};
 use heed::RoTxn;
 use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
 use crate::{
    update::{
        new::{extract::cache::CboCachedSorter, DocumentChange},
        MergeDeladdCboRoaringBitmaps,
    },
    FieldId, GlobalFieldsIdsMap, Index, Result,
 };
 const MAX_COUNTED_WORDS: usize = 30;
 pub struct FidWordCountDocidsExtractor;
 impl SearchableExtractor for FidWordCountDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }
    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(vec![])
    }
    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
        unreachable!()
    }
    // This method is reimplemented to count the number of words in the document in each field
    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()> {
        let mut key_buffer = Vec::new();
        match document_change {
            DocumentChange::Deletion(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
                // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are deleted.
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
                        /// TODO manage the error
                        cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                    }
                }
            }
            DocumentChange::Update(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
                    fid_word_count
                        .entry(fid)
                        .and_modify(|(current_count, new_count)| *current_count += 1)
                        .or_insert((1, 0));
                    Ok(())
                };
                document_tokenizer.tokenize_document(
                    inner.current(rtxn, index)?.unwrap(),
                    fields_ids_map,
                    &mut token_fn,
                )?;
                let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
                    fid_word_count
                        .entry(fid)
                        .and_modify(|(current_count, new_count)| *new_count += 1)
                        .or_insert((0, 1));
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
                // Only the fields that have a change in the number of words are updated.
                for (fid, (current_count, new_count)) in fid_word_count.iter() {
                    if *current_count != *new_count {
                        if *current_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *current_count as u8, &mut key_buffer);
                            /// TODO manage the error
                            cached_sorter.insert_del_u32(key, inner.docid()).unwrap();
                        }
                        if *new_count <= MAX_COUNTED_WORDS {
                            let key = build_key(*fid, *new_count as u8, &mut key_buffer);
                            /// TODO manage the error
                            cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                        }
                    }
                }
            }
            DocumentChange::Insertion(inner) => {
                let mut fid_word_count = HashMap::new();
                let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
                    fid_word_count.entry(fid).and_modify(|count| *count += 1).or_insert(1);
                    Ok(())
                };
                document_tokenizer.tokenize_document(inner.new(), fields_ids_map, &mut token_fn)?;
                // The docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS are stored.
                for (fid, count) in fid_word_count.iter() {
                    if *count <= MAX_COUNTED_WORDS {
                        let key = build_key(*fid, *count as u8, &mut key_buffer);
                        /// TODO manage the error
                        cached_sorter.insert_add_u32(key, inner.docid()).unwrap();
                    }
                }
            }
        }
        Ok(())
    }
 }
 fn build_key(fid: FieldId, count: u8, key_buffer: &mut Vec<u8>) -> &[u8] {
    key_buffer.clear();
    key_buffer.extend_from_slice(&fid.to_be_bytes());
    key_buffer.push(count);
    key_buffer.as_slice()
 }
--- a/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
+++ b/milli/src/update/new/extract/searchable/extract_word_pair_proximity_docids.rs
@ -0,0 +1,182 @@
 use std::{
    borrow::Cow,
    collections::{BTreeMap, VecDeque},
 };
 use heed::RoTxn;
 use itertools::merge_join_by;
 use obkv::KvReader;
 use super::{tokenize_document::DocumentTokenizer, SearchableExtractor};
 use crate::{
    proximity::{index_proximity, MAX_DISTANCE},
    update::{
        new::{extract::cache::CboCachedSorter, DocumentChange},
        MergeDeladdCboRoaringBitmaps,
    },
    FieldId, GlobalFieldsIdsMap, Index, Result,
 };
 pub struct WordPairProximityDocidsExtractor;
 impl SearchableExtractor for WordPairProximityDocidsExtractor {
    fn attributes_to_extract<'a>(
        rtxn: &'a RoTxn,
        index: &'a Index,
    ) -> Result<Option<Vec<&'a str>>> {
        index.user_defined_searchable_fields(rtxn).map_err(Into::into)
    }
    fn attributes_to_skip<'a>(rtxn: &'a RoTxn, index: &'a Index) -> Result<Vec<&'a str>> {
        Ok(vec![])
    }
    /// This case is unreachable because extract_document_change has been reimplemented to not call this function.
    fn build_key<'a>(_field_id: FieldId, _position: u16, _word: &'a str) -> Cow<'a, [u8]> {
        unreachable!()
    }
    // This method is reimplemented to count the number of words in the document in each field
    // and to store the docids of the documents that have a number of words in a given field equal to or under than MAX_COUNTED_WORDS.
    fn extract_document_change(
        rtxn: &RoTxn,
        index: &Index,
        document_tokenizer: &DocumentTokenizer,
        fields_ids_map: &mut GlobalFieldsIdsMap,
        cached_sorter: &mut CboCachedSorter<MergeDeladdCboRoaringBitmaps>,
        document_change: DocumentChange,
    ) -> Result<()> {
        /// TODO: mutualize those buffers
        let mut key_buffer = Vec::new();
        let mut add_word_pair_proximity = BTreeMap::new();
        let mut del_word_pair_proximity = BTreeMap::new();
        let mut word_positions: VecDeque<(String, u16)> =
            VecDeque::with_capacity(MAX_DISTANCE as usize);
        let docid = document_change.docid();
        match document_change {
            DocumentChange::Deletion(inner) => {
                let document = inner.current(rtxn, index)?.unwrap();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut del_word_pair_proximity,
                )?;
            }
            DocumentChange::Update(inner) => {
                let document = inner.current(rtxn, index)?.unwrap();
                process_document_tokens(
                    &document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut del_word_pair_proximity,
                )?;
                let document = inner.new();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut add_word_pair_proximity,
                )?;
            }
            DocumentChange::Insertion(inner) => {
                let document = inner.new();
                process_document_tokens(
                    document,
                    document_tokenizer,
                    fields_ids_map,
                    &mut word_positions,
                    &mut add_word_pair_proximity,
                )?;
            }
        }
        use itertools::EitherOrBoth::*;
        for eob in
            merge_join_by(del_word_pair_proximity.iter(), add_word_pair_proximity.iter(), |d, a| {
                d.cmp(a)
            })
        {
            match eob {
                Left(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
                    cached_sorter.insert_del_u32(key, docid).unwrap();
                }
                Right(((w1, w2), prox)) => {
                    let key = build_key(*prox, w1, w2, &mut key_buffer);
                    cached_sorter.insert_add_u32(key, docid).unwrap();
                }
                Both(((w1, w2), del_prox), (_, add_prox)) => {
                    if del_prox != add_prox {
                        let key = build_key(*del_prox, w1, w2, &mut key_buffer);
                        cached_sorter.insert_del_u32(key, docid).unwrap();
                        let key = build_key(*add_prox, w1, w2, &mut key_buffer);
                        cached_sorter.insert_add_u32(key, docid).unwrap();
                    }
                }
            };
        }
        Ok(())
    }
 }
 fn build_key<'a>(prox: u8, w1: &str, w2: &str, key_buffer: &'a mut Vec<u8>) -> &'a [u8] {
    key_buffer.clear();
    key_buffer.push(prox);
    key_buffer.extend_from_slice(w1.as_bytes());
    key_buffer.push(0);
    key_buffer.extend_from_slice(w2.as_bytes());
    key_buffer.as_slice()
 }
 fn word_positions_into_word_pair_proximity(
    word_positions: &mut VecDeque<(String, u16)>,
    word_pair_proximity: &mut BTreeMap<(String, String), u8>,
 ) -> Result<()> {
    let (head_word, head_position) = word_positions.pop_front().unwrap();
    for (word, position) in word_positions.iter() {
        let prox = index_proximity(head_position as u32, *position as u32) as u8;
        if prox > 0 && prox < MAX_DISTANCE as u8 {
            word_pair_proximity
                .entry((head_word.clone(), word.clone()))
                .and_modify(|p| {
                    *p = std::cmp::min(*p, prox);
                })
                .or_insert(prox);
        }
    }
    Ok(())
 }
 fn process_document_tokens(
    document: &KvReader<FieldId>,
    document_tokenizer: &DocumentTokenizer,
    fields_ids_map: &mut GlobalFieldsIdsMap,
    word_positions: &mut VecDeque<(String, u16)>,
    word_pair_proximity: &mut BTreeMap<(String, String), u8>,
 ) -> Result<()> {
    let mut token_fn = |fid: FieldId, pos: u16, word: &str| {
        // drain the proximity window until the head word is considered close to the word we are inserting.
        while word_positions
            .front()
            .map_or(false, |(_w, p)| index_proximity(*p as u32, pos as u32) >= MAX_DISTANCE)
        {
            word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
        }
        // insert the new word.
        word_positions.push_back((word.to_string(), pos));
        Ok(())
    };
    document_tokenizer.tokenize_document(document, fields_ids_map, &mut token_fn)?;
    while !word_positions.is_empty() {
        word_positions_into_word_pair_proximity(word_positions, word_pair_proximity)?;
    }
    Ok(())
 }
--- a/milli/src/update/new/extract/searchable/mod.rs
+++ b/milli/src/update/new/extract/searchable/mod.rs
@ -1,13 +1,17 @@
 mod extract_fid_word_count_docids;
 mod extract_word_docids;
 mod extract_word_pair_proximity_docids;
 mod tokenize_document;
 use std::borrow::Cow;
 use std::fs::File;
 pub use extract_fid_word_count_docids::FidWordCountDocidsExtractor;
 pub use extract_word_docids::{
    ExactWordDocidsExtractor, WordDocidsExtractor, WordFidDocidsExtractor,
    WordPositionDocidsExtractor,
 };
 pub use extract_word_pair_proximity_docids::WordPairProximityDocidsExtractor;
 use grenad::Merger;
 use heed::RoTxn;
 use rayon::iter::{IntoParallelIterator, ParallelIterator};
--- a/milli/src/update/new/extract/searchable/tokenize_document.rs
+++ b/milli/src/update/new/extract/searchable/tokenize_document.rs
@ -3,6 +3,7 @@ use std::collections::HashMap;
 use charabia::{SeparatorKind, Token, TokenKind, Tokenizer, TokenizerBuilder};
 use serde_json::Value;
 use crate::proximity::MAX_DISTANCE;
 use crate::update::new::extract::perm_json_p::{
    seek_leaf_values_in_array, seek_leaf_values_in_object, select_field,
 };
@ -43,8 +44,10 @@ impl<'a> DocumentTokenizer<'a> {
                    return Err(UserError::AttributeLimitReached.into());
                };
-                let position =
+                let position = field_position
-                    field_position.entry(field_id).and_modify(|counter| *counter += 8).or_insert(0);
+                    .entry(field_id)
                    .and_modify(|counter| *counter += MAX_DISTANCE)
                    .or_insert(0);
                if *position as u32 >= self.max_positions_per_attributes {
                    return Ok(());
                }
@ -116,19 +119,19 @@ impl<'a> DocumentTokenizer<'a> {
 }
 /// take an iterator on tokens and compute their relative position depending on separator kinds
-/// if it's an `Hard` separator we add an additional relative proximity of 8 between words,
+/// if it's an `Hard` separator we add an additional relative proximity of MAX_DISTANCE between words,
 /// else we keep the standard proximity of 1 between words.
 fn process_tokens<'a>(
-    start_offset: usize,
+    start_offset: u32,
    tokens: impl Iterator<Item = Token<'a>>,
-) -> impl Iterator<Item = (usize, Token<'a>)> {
+) -> impl Iterator<Item = (u32, Token<'a>)> {
    tokens
        .skip_while(|token| token.is_separator())
        .scan((start_offset, None), |(offset, prev_kind), mut token| {
            match token.kind {
                TokenKind::Word | TokenKind::StopWord if !token.lemma().is_empty() => {
                    *offset += match *prev_kind {
-                        Some(TokenKind::Separator(SeparatorKind::Hard)) => 8,
+                        Some(TokenKind::Separator(SeparatorKind::Hard)) => MAX_DISTANCE,
                        Some(_) => 1,
                        None => 0,
                    };
@ -246,7 +249,7 @@ mod test {
            ]: "doggo",
            [
                2,
-                8,
+                MAX_DISTANCE,
            ]: "doggo",
            [
                2,